{ "best_global_step": 60426, "best_metric": 0.650672197341919, "best_model_checkpoint": "saves_multiple/ia3/llama-3-8b-instruct/train_math_qa_123_1760637726/checkpoint-60426", "epoch": 20.0, "eval_steps": 6714, "global_step": 134280, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007447125409591898, "grad_norm": 5.792025566101074, "learning_rate": 1.4894250819183796e-08, "loss": 1.1615, "num_input_tokens_seen": 2976, "step": 5 }, { "epoch": 0.0014894250819183796, "grad_norm": 2.9788265228271484, "learning_rate": 3.351206434316354e-08, "loss": 1.2409, "num_input_tokens_seen": 5920, "step": 10 }, { "epoch": 0.002234137622877569, "grad_norm": 3.665639877319336, "learning_rate": 5.2129877867143284e-08, "loss": 1.2236, "num_input_tokens_seen": 8896, "step": 15 }, { "epoch": 0.002978850163836759, "grad_norm": 4.059674263000488, "learning_rate": 7.074769139112303e-08, "loss": 1.4184, "num_input_tokens_seen": 11744, "step": 20 }, { "epoch": 0.0037235627047959487, "grad_norm": 2.5596258640289307, "learning_rate": 8.936550491510277e-08, "loss": 0.6777, "num_input_tokens_seen": 14496, "step": 25 }, { "epoch": 0.004468275245755138, "grad_norm": 4.450432300567627, "learning_rate": 1.0798331843908253e-07, "loss": 1.2229, "num_input_tokens_seen": 17440, "step": 30 }, { "epoch": 0.005212987786714328, "grad_norm": 5.851156234741211, "learning_rate": 1.2660113196306226e-07, "loss": 1.2585, "num_input_tokens_seen": 20352, "step": 35 }, { "epoch": 0.005957700327673518, "grad_norm": 4.039055347442627, "learning_rate": 1.45218945487042e-07, "loss": 1.3754, "num_input_tokens_seen": 23328, "step": 40 }, { "epoch": 0.006702412868632708, "grad_norm": 3.6211838722229004, "learning_rate": 1.6383675901102174e-07, "loss": 1.2717, "num_input_tokens_seen": 26304, "step": 45 }, { "epoch": 0.0074471254095918975, "grad_norm": 5.67529821395874, "learning_rate": 1.824545725350015e-07, "loss": 1.1989, "num_input_tokens_seen": 29184, "step": 50 }, { "epoch": 0.008191837950551088, "grad_norm": 3.3544466495513916, "learning_rate": 2.0107238605898125e-07, "loss": 1.5436, "num_input_tokens_seen": 32544, "step": 55 }, { "epoch": 0.008936550491510277, "grad_norm": 4.5297651290893555, "learning_rate": 2.1969019958296101e-07, "loss": 1.0981, "num_input_tokens_seen": 35520, "step": 60 }, { "epoch": 0.009681263032469467, "grad_norm": 3.0109190940856934, "learning_rate": 2.3830801310694073e-07, "loss": 1.1133, "num_input_tokens_seen": 38400, "step": 65 }, { "epoch": 0.010425975573428656, "grad_norm": 4.135786056518555, "learning_rate": 2.569258266309205e-07, "loss": 1.2761, "num_input_tokens_seen": 41408, "step": 70 }, { "epoch": 0.011170688114387846, "grad_norm": 3.959832191467285, "learning_rate": 2.755436401549002e-07, "loss": 1.228, "num_input_tokens_seen": 44352, "step": 75 }, { "epoch": 0.011915400655347037, "grad_norm": 2.782316207885742, "learning_rate": 2.9416145367888e-07, "loss": 1.0921, "num_input_tokens_seen": 46912, "step": 80 }, { "epoch": 0.012660113196306225, "grad_norm": 4.461288928985596, "learning_rate": 3.1277926720285975e-07, "loss": 1.0619, "num_input_tokens_seen": 49472, "step": 85 }, { "epoch": 0.013404825737265416, "grad_norm": 5.393271446228027, "learning_rate": 3.3139708072683946e-07, "loss": 1.2598, "num_input_tokens_seen": 52096, "step": 90 }, { "epoch": 0.014149538278224605, "grad_norm": 3.6600146293640137, "learning_rate": 3.5001489425081923e-07, "loss": 1.0828, "num_input_tokens_seen": 54944, "step": 95 }, { "epoch": 0.014894250819183795, "grad_norm": 3.8630900382995605, "learning_rate": 3.6863270777479894e-07, "loss": 1.0262, "num_input_tokens_seen": 57664, "step": 100 }, { "epoch": 0.015638963360142984, "grad_norm": 5.240833282470703, "learning_rate": 3.872505212987787e-07, "loss": 1.3555, "num_input_tokens_seen": 60448, "step": 105 }, { "epoch": 0.016383675901102176, "grad_norm": 4.559704303741455, "learning_rate": 4.058683348227585e-07, "loss": 1.2381, "num_input_tokens_seen": 63808, "step": 110 }, { "epoch": 0.017128388442061365, "grad_norm": 5.734770774841309, "learning_rate": 4.244861483467382e-07, "loss": 1.3244, "num_input_tokens_seen": 66464, "step": 115 }, { "epoch": 0.017873100983020553, "grad_norm": 3.793649673461914, "learning_rate": 4.431039618707179e-07, "loss": 1.2466, "num_input_tokens_seen": 69856, "step": 120 }, { "epoch": 0.018617813523979745, "grad_norm": 5.127392768859863, "learning_rate": 4.617217753946977e-07, "loss": 1.3284, "num_input_tokens_seen": 72768, "step": 125 }, { "epoch": 0.019362526064938934, "grad_norm": 3.6398215293884277, "learning_rate": 4.803395889186774e-07, "loss": 1.152, "num_input_tokens_seen": 75616, "step": 130 }, { "epoch": 0.020107238605898123, "grad_norm": 4.51300048828125, "learning_rate": 4.989574024426572e-07, "loss": 1.2144, "num_input_tokens_seen": 78656, "step": 135 }, { "epoch": 0.02085195114685731, "grad_norm": 3.345590114593506, "learning_rate": 5.17575215966637e-07, "loss": 1.2081, "num_input_tokens_seen": 81760, "step": 140 }, { "epoch": 0.021596663687816504, "grad_norm": 5.7820844650268555, "learning_rate": 5.361930294906167e-07, "loss": 1.6574, "num_input_tokens_seen": 84928, "step": 145 }, { "epoch": 0.022341376228775692, "grad_norm": 4.579047203063965, "learning_rate": 5.548108430145964e-07, "loss": 1.5161, "num_input_tokens_seen": 88096, "step": 150 }, { "epoch": 0.02308608876973488, "grad_norm": 2.8096120357513428, "learning_rate": 5.734286565385761e-07, "loss": 0.811, "num_input_tokens_seen": 90912, "step": 155 }, { "epoch": 0.023830801310694073, "grad_norm": 3.9186463356018066, "learning_rate": 5.920464700625559e-07, "loss": 1.0904, "num_input_tokens_seen": 93696, "step": 160 }, { "epoch": 0.024575513851653262, "grad_norm": 3.6444194316864014, "learning_rate": 6.106642835865357e-07, "loss": 1.4485, "num_input_tokens_seen": 96992, "step": 165 }, { "epoch": 0.02532022639261245, "grad_norm": 3.2868235111236572, "learning_rate": 6.292820971105154e-07, "loss": 1.269, "num_input_tokens_seen": 99744, "step": 170 }, { "epoch": 0.026064938933571643, "grad_norm": 3.7774174213409424, "learning_rate": 6.478999106344952e-07, "loss": 1.2426, "num_input_tokens_seen": 102432, "step": 175 }, { "epoch": 0.02680965147453083, "grad_norm": 4.083743572235107, "learning_rate": 6.665177241584749e-07, "loss": 1.3559, "num_input_tokens_seen": 105440, "step": 180 }, { "epoch": 0.02755436401549002, "grad_norm": 3.3425536155700684, "learning_rate": 6.851355376824546e-07, "loss": 1.7095, "num_input_tokens_seen": 108320, "step": 185 }, { "epoch": 0.02829907655644921, "grad_norm": 6.311239242553711, "learning_rate": 7.037533512064343e-07, "loss": 1.6704, "num_input_tokens_seen": 111264, "step": 190 }, { "epoch": 0.0290437890974084, "grad_norm": 3.24592661857605, "learning_rate": 7.223711647304142e-07, "loss": 1.2563, "num_input_tokens_seen": 114432, "step": 195 }, { "epoch": 0.02978850163836759, "grad_norm": 5.044783115386963, "learning_rate": 7.409889782543939e-07, "loss": 1.1379, "num_input_tokens_seen": 117248, "step": 200 }, { "epoch": 0.03053321417932678, "grad_norm": 4.948826789855957, "learning_rate": 7.596067917783736e-07, "loss": 1.2472, "num_input_tokens_seen": 120512, "step": 205 }, { "epoch": 0.03127792672028597, "grad_norm": 3.7148029804229736, "learning_rate": 7.782246053023533e-07, "loss": 1.1802, "num_input_tokens_seen": 123776, "step": 210 }, { "epoch": 0.032022639261245156, "grad_norm": 5.323946475982666, "learning_rate": 7.96842418826333e-07, "loss": 1.1706, "num_input_tokens_seen": 126880, "step": 215 }, { "epoch": 0.03276735180220435, "grad_norm": 4.468817234039307, "learning_rate": 8.154602323503128e-07, "loss": 1.5999, "num_input_tokens_seen": 130112, "step": 220 }, { "epoch": 0.03351206434316354, "grad_norm": 3.678264617919922, "learning_rate": 8.340780458742925e-07, "loss": 1.122, "num_input_tokens_seen": 132704, "step": 225 }, { "epoch": 0.03425677688412273, "grad_norm": 4.670085430145264, "learning_rate": 8.526958593982724e-07, "loss": 1.666, "num_input_tokens_seen": 135968, "step": 230 }, { "epoch": 0.03500148942508192, "grad_norm": 3.6119418144226074, "learning_rate": 8.713136729222521e-07, "loss": 1.1681, "num_input_tokens_seen": 138752, "step": 235 }, { "epoch": 0.035746201966041107, "grad_norm": 4.252447605133057, "learning_rate": 8.899314864462318e-07, "loss": 1.0307, "num_input_tokens_seen": 141792, "step": 240 }, { "epoch": 0.036490914507000295, "grad_norm": 3.436521530151367, "learning_rate": 9.085492999702115e-07, "loss": 0.9019, "num_input_tokens_seen": 144640, "step": 245 }, { "epoch": 0.03723562704795949, "grad_norm": 3.146587610244751, "learning_rate": 9.271671134941912e-07, "loss": 0.9228, "num_input_tokens_seen": 147648, "step": 250 }, { "epoch": 0.03798033958891868, "grad_norm": 2.4952447414398193, "learning_rate": 9.457849270181709e-07, "loss": 1.0745, "num_input_tokens_seen": 150432, "step": 255 }, { "epoch": 0.03872505212987787, "grad_norm": 2.7659177780151367, "learning_rate": 9.644027405421507e-07, "loss": 1.0961, "num_input_tokens_seen": 153312, "step": 260 }, { "epoch": 0.03946976467083706, "grad_norm": 5.689547538757324, "learning_rate": 9.830205540661306e-07, "loss": 1.3626, "num_input_tokens_seen": 156288, "step": 265 }, { "epoch": 0.040214477211796246, "grad_norm": 6.353632926940918, "learning_rate": 1.0016383675901103e-06, "loss": 1.5461, "num_input_tokens_seen": 159136, "step": 270 }, { "epoch": 0.040959189752755434, "grad_norm": 4.8942036628723145, "learning_rate": 1.02025618111409e-06, "loss": 0.9752, "num_input_tokens_seen": 162144, "step": 275 }, { "epoch": 0.04170390229371462, "grad_norm": 2.5092110633850098, "learning_rate": 1.0388739946380697e-06, "loss": 0.9178, "num_input_tokens_seen": 164832, "step": 280 }, { "epoch": 0.04244861483467382, "grad_norm": 5.134583473205566, "learning_rate": 1.0574918081620494e-06, "loss": 1.6165, "num_input_tokens_seen": 167744, "step": 285 }, { "epoch": 0.04319332737563301, "grad_norm": 4.565773010253906, "learning_rate": 1.0761096216860292e-06, "loss": 0.9751, "num_input_tokens_seen": 170528, "step": 290 }, { "epoch": 0.043938039916592196, "grad_norm": 3.884310483932495, "learning_rate": 1.0947274352100089e-06, "loss": 1.31, "num_input_tokens_seen": 173440, "step": 295 }, { "epoch": 0.044682752457551385, "grad_norm": 7.695960998535156, "learning_rate": 1.1133452487339888e-06, "loss": 1.6738, "num_input_tokens_seen": 176160, "step": 300 }, { "epoch": 0.045427464998510574, "grad_norm": 7.885486125946045, "learning_rate": 1.1319630622579685e-06, "loss": 1.5349, "num_input_tokens_seen": 178816, "step": 305 }, { "epoch": 0.04617217753946976, "grad_norm": 6.9694342613220215, "learning_rate": 1.1505808757819482e-06, "loss": 1.806, "num_input_tokens_seen": 181664, "step": 310 }, { "epoch": 0.04691689008042895, "grad_norm": 5.054891586303711, "learning_rate": 1.169198689305928e-06, "loss": 1.1251, "num_input_tokens_seen": 184384, "step": 315 }, { "epoch": 0.04766160262138815, "grad_norm": 5.299906253814697, "learning_rate": 1.1878165028299077e-06, "loss": 1.2646, "num_input_tokens_seen": 187072, "step": 320 }, { "epoch": 0.048406315162347335, "grad_norm": 3.260540246963501, "learning_rate": 1.2064343163538874e-06, "loss": 1.0715, "num_input_tokens_seen": 189920, "step": 325 }, { "epoch": 0.049151027703306524, "grad_norm": 4.224701404571533, "learning_rate": 1.2250521298778673e-06, "loss": 1.2599, "num_input_tokens_seen": 192608, "step": 330 }, { "epoch": 0.04989574024426571, "grad_norm": 6.7507147789001465, "learning_rate": 1.243669943401847e-06, "loss": 1.2066, "num_input_tokens_seen": 195744, "step": 335 }, { "epoch": 0.0506404527852249, "grad_norm": 3.700134754180908, "learning_rate": 1.2622877569258267e-06, "loss": 1.084, "num_input_tokens_seen": 198272, "step": 340 }, { "epoch": 0.05138516532618409, "grad_norm": 2.8239753246307373, "learning_rate": 1.2809055704498064e-06, "loss": 1.1449, "num_input_tokens_seen": 201152, "step": 345 }, { "epoch": 0.052129877867143286, "grad_norm": 3.4670310020446777, "learning_rate": 1.2995233839737862e-06, "loss": 1.0111, "num_input_tokens_seen": 203936, "step": 350 }, { "epoch": 0.052874590408102475, "grad_norm": 3.5843801498413086, "learning_rate": 1.3181411974977659e-06, "loss": 1.2835, "num_input_tokens_seen": 206976, "step": 355 }, { "epoch": 0.05361930294906166, "grad_norm": 8.526297569274902, "learning_rate": 1.3367590110217456e-06, "loss": 1.3261, "num_input_tokens_seen": 210048, "step": 360 }, { "epoch": 0.05436401549002085, "grad_norm": 6.884479999542236, "learning_rate": 1.3553768245457255e-06, "loss": 1.5041, "num_input_tokens_seen": 212896, "step": 365 }, { "epoch": 0.05510872803098004, "grad_norm": 3.0756924152374268, "learning_rate": 1.3739946380697052e-06, "loss": 1.1195, "num_input_tokens_seen": 215872, "step": 370 }, { "epoch": 0.05585344057193923, "grad_norm": 3.425605297088623, "learning_rate": 1.392612451593685e-06, "loss": 1.0714, "num_input_tokens_seen": 218624, "step": 375 }, { "epoch": 0.05659815311289842, "grad_norm": 6.517431735992432, "learning_rate": 1.4112302651176647e-06, "loss": 1.3627, "num_input_tokens_seen": 221408, "step": 380 }, { "epoch": 0.057342865653857614, "grad_norm": 4.966618061065674, "learning_rate": 1.4298480786416444e-06, "loss": 1.2547, "num_input_tokens_seen": 224224, "step": 385 }, { "epoch": 0.0580875781948168, "grad_norm": 4.826229095458984, "learning_rate": 1.448465892165624e-06, "loss": 1.1718, "num_input_tokens_seen": 227136, "step": 390 }, { "epoch": 0.05883229073577599, "grad_norm": 3.8181965351104736, "learning_rate": 1.4670837056896038e-06, "loss": 1.4227, "num_input_tokens_seen": 230272, "step": 395 }, { "epoch": 0.05957700327673518, "grad_norm": 2.6535439491271973, "learning_rate": 1.4857015192135837e-06, "loss": 1.1221, "num_input_tokens_seen": 233568, "step": 400 }, { "epoch": 0.06032171581769437, "grad_norm": 4.482980728149414, "learning_rate": 1.5043193327375634e-06, "loss": 1.4183, "num_input_tokens_seen": 236384, "step": 405 }, { "epoch": 0.06106642835865356, "grad_norm": 3.5972604751586914, "learning_rate": 1.5229371462615432e-06, "loss": 1.0915, "num_input_tokens_seen": 239424, "step": 410 }, { "epoch": 0.06181114089961275, "grad_norm": 2.639066696166992, "learning_rate": 1.5415549597855229e-06, "loss": 1.1577, "num_input_tokens_seen": 242240, "step": 415 }, { "epoch": 0.06255585344057193, "grad_norm": 3.250744342803955, "learning_rate": 1.5601727733095026e-06, "loss": 1.1804, "num_input_tokens_seen": 244800, "step": 420 }, { "epoch": 0.06330056598153112, "grad_norm": 3.842384099960327, "learning_rate": 1.5787905868334823e-06, "loss": 1.2763, "num_input_tokens_seen": 247968, "step": 425 }, { "epoch": 0.06404527852249031, "grad_norm": 4.18927001953125, "learning_rate": 1.597408400357462e-06, "loss": 1.6346, "num_input_tokens_seen": 250720, "step": 430 }, { "epoch": 0.06478999106344951, "grad_norm": 3.2182395458221436, "learning_rate": 1.6160262138814417e-06, "loss": 0.9124, "num_input_tokens_seen": 253952, "step": 435 }, { "epoch": 0.0655347036044087, "grad_norm": 3.682915210723877, "learning_rate": 1.6346440274054214e-06, "loss": 1.4715, "num_input_tokens_seen": 256576, "step": 440 }, { "epoch": 0.06627941614536789, "grad_norm": 7.905789375305176, "learning_rate": 1.6532618409294012e-06, "loss": 1.2465, "num_input_tokens_seen": 259488, "step": 445 }, { "epoch": 0.06702412868632708, "grad_norm": 4.527535915374756, "learning_rate": 1.6718796544533813e-06, "loss": 1.1466, "num_input_tokens_seen": 262112, "step": 450 }, { "epoch": 0.06776884122728627, "grad_norm": 3.49835205078125, "learning_rate": 1.690497467977361e-06, "loss": 1.085, "num_input_tokens_seen": 265120, "step": 455 }, { "epoch": 0.06851355376824546, "grad_norm": 4.109592914581299, "learning_rate": 1.7091152815013407e-06, "loss": 1.3196, "num_input_tokens_seen": 267872, "step": 460 }, { "epoch": 0.06925826630920465, "grad_norm": 3.765496253967285, "learning_rate": 1.7277330950253204e-06, "loss": 1.3545, "num_input_tokens_seen": 270464, "step": 465 }, { "epoch": 0.07000297885016384, "grad_norm": 4.143918037414551, "learning_rate": 1.7463509085493002e-06, "loss": 1.4008, "num_input_tokens_seen": 273152, "step": 470 }, { "epoch": 0.07074769139112302, "grad_norm": 6.3840155601501465, "learning_rate": 1.7649687220732799e-06, "loss": 1.1088, "num_input_tokens_seen": 275968, "step": 475 }, { "epoch": 0.07149240393208221, "grad_norm": 4.908646106719971, "learning_rate": 1.7835865355972596e-06, "loss": 1.3262, "num_input_tokens_seen": 278848, "step": 480 }, { "epoch": 0.0722371164730414, "grad_norm": 5.123385429382324, "learning_rate": 1.8022043491212393e-06, "loss": 1.3852, "num_input_tokens_seen": 281664, "step": 485 }, { "epoch": 0.07298182901400059, "grad_norm": 3.378004789352417, "learning_rate": 1.820822162645219e-06, "loss": 1.1687, "num_input_tokens_seen": 285056, "step": 490 }, { "epoch": 0.07372654155495978, "grad_norm": 3.1964144706726074, "learning_rate": 1.8394399761691987e-06, "loss": 1.4381, "num_input_tokens_seen": 287872, "step": 495 }, { "epoch": 0.07447125409591898, "grad_norm": 5.373366832733154, "learning_rate": 1.8580577896931784e-06, "loss": 1.2575, "num_input_tokens_seen": 290752, "step": 500 }, { "epoch": 0.07521596663687817, "grad_norm": 2.0569071769714355, "learning_rate": 1.8766756032171582e-06, "loss": 1.0899, "num_input_tokens_seen": 293824, "step": 505 }, { "epoch": 0.07596067917783736, "grad_norm": 6.035353660583496, "learning_rate": 1.8952934167411379e-06, "loss": 1.4945, "num_input_tokens_seen": 296672, "step": 510 }, { "epoch": 0.07670539171879655, "grad_norm": 3.7828123569488525, "learning_rate": 1.913911230265118e-06, "loss": 1.3496, "num_input_tokens_seen": 299776, "step": 515 }, { "epoch": 0.07745010425975574, "grad_norm": 6.939604759216309, "learning_rate": 1.9325290437890977e-06, "loss": 1.4139, "num_input_tokens_seen": 302720, "step": 520 }, { "epoch": 0.07819481680071493, "grad_norm": 3.3749945163726807, "learning_rate": 1.9511468573130772e-06, "loss": 1.3143, "num_input_tokens_seen": 305472, "step": 525 }, { "epoch": 0.07893952934167411, "grad_norm": 5.909675598144531, "learning_rate": 1.969764670837057e-06, "loss": 1.2427, "num_input_tokens_seen": 308672, "step": 530 }, { "epoch": 0.0796842418826333, "grad_norm": 3.8312039375305176, "learning_rate": 1.9883824843610367e-06, "loss": 1.0104, "num_input_tokens_seen": 311904, "step": 535 }, { "epoch": 0.08042895442359249, "grad_norm": 3.272775173187256, "learning_rate": 2.0070002978850166e-06, "loss": 1.0556, "num_input_tokens_seen": 314688, "step": 540 }, { "epoch": 0.08117366696455168, "grad_norm": 3.0029361248016357, "learning_rate": 2.025618111408996e-06, "loss": 1.1556, "num_input_tokens_seen": 317472, "step": 545 }, { "epoch": 0.08191837950551087, "grad_norm": 2.9224696159362793, "learning_rate": 2.044235924932976e-06, "loss": 1.0031, "num_input_tokens_seen": 320128, "step": 550 }, { "epoch": 0.08266309204647006, "grad_norm": 3.7100565433502197, "learning_rate": 2.0628537384569555e-06, "loss": 1.0733, "num_input_tokens_seen": 322944, "step": 555 }, { "epoch": 0.08340780458742925, "grad_norm": 2.938786506652832, "learning_rate": 2.0814715519809354e-06, "loss": 1.015, "num_input_tokens_seen": 325856, "step": 560 }, { "epoch": 0.08415251712838845, "grad_norm": 6.790724277496338, "learning_rate": 2.100089365504915e-06, "loss": 1.4554, "num_input_tokens_seen": 328800, "step": 565 }, { "epoch": 0.08489722966934764, "grad_norm": 5.304668426513672, "learning_rate": 2.118707179028895e-06, "loss": 1.4116, "num_input_tokens_seen": 331424, "step": 570 }, { "epoch": 0.08564194221030683, "grad_norm": 3.7245516777038574, "learning_rate": 2.1373249925528744e-06, "loss": 1.2458, "num_input_tokens_seen": 334112, "step": 575 }, { "epoch": 0.08638665475126601, "grad_norm": 3.591296911239624, "learning_rate": 2.1559428060768547e-06, "loss": 1.1707, "num_input_tokens_seen": 336992, "step": 580 }, { "epoch": 0.0871313672922252, "grad_norm": 3.603546380996704, "learning_rate": 2.1745606196008342e-06, "loss": 1.4092, "num_input_tokens_seen": 340224, "step": 585 }, { "epoch": 0.08787607983318439, "grad_norm": 5.7089152336120605, "learning_rate": 2.193178433124814e-06, "loss": 1.0599, "num_input_tokens_seen": 343040, "step": 590 }, { "epoch": 0.08862079237414358, "grad_norm": 4.614741325378418, "learning_rate": 2.2117962466487937e-06, "loss": 1.0945, "num_input_tokens_seen": 345696, "step": 595 }, { "epoch": 0.08936550491510277, "grad_norm": 2.930844783782959, "learning_rate": 2.2304140601727736e-06, "loss": 1.2021, "num_input_tokens_seen": 348992, "step": 600 }, { "epoch": 0.09011021745606196, "grad_norm": 1.8115154504776, "learning_rate": 2.249031873696753e-06, "loss": 1.039, "num_input_tokens_seen": 352032, "step": 605 }, { "epoch": 0.09085492999702115, "grad_norm": 6.338430404663086, "learning_rate": 2.267649687220733e-06, "loss": 0.8212, "num_input_tokens_seen": 354656, "step": 610 }, { "epoch": 0.09159964253798034, "grad_norm": 3.484919786453247, "learning_rate": 2.2862675007447125e-06, "loss": 1.0787, "num_input_tokens_seen": 357504, "step": 615 }, { "epoch": 0.09234435507893952, "grad_norm": 2.3765580654144287, "learning_rate": 2.3048853142686924e-06, "loss": 0.944, "num_input_tokens_seen": 360256, "step": 620 }, { "epoch": 0.09308906761989871, "grad_norm": 2.7118079662323, "learning_rate": 2.323503127792672e-06, "loss": 1.0984, "num_input_tokens_seen": 363072, "step": 625 }, { "epoch": 0.0938337801608579, "grad_norm": 6.990606784820557, "learning_rate": 2.342120941316652e-06, "loss": 1.52, "num_input_tokens_seen": 366272, "step": 630 }, { "epoch": 0.0945784927018171, "grad_norm": 2.3873343467712402, "learning_rate": 2.3607387548406314e-06, "loss": 1.076, "num_input_tokens_seen": 369376, "step": 635 }, { "epoch": 0.0953232052427763, "grad_norm": 5.0540452003479, "learning_rate": 2.3793565683646113e-06, "loss": 1.1734, "num_input_tokens_seen": 372224, "step": 640 }, { "epoch": 0.09606791778373548, "grad_norm": 4.922595977783203, "learning_rate": 2.3979743818885912e-06, "loss": 1.4633, "num_input_tokens_seen": 375232, "step": 645 }, { "epoch": 0.09681263032469467, "grad_norm": 2.404585838317871, "learning_rate": 2.416592195412571e-06, "loss": 1.2217, "num_input_tokens_seen": 378272, "step": 650 }, { "epoch": 0.09755734286565386, "grad_norm": 3.3274576663970947, "learning_rate": 2.4352100089365507e-06, "loss": 0.8314, "num_input_tokens_seen": 381152, "step": 655 }, { "epoch": 0.09830205540661305, "grad_norm": 4.7009358406066895, "learning_rate": 2.4538278224605306e-06, "loss": 1.1103, "num_input_tokens_seen": 384096, "step": 660 }, { "epoch": 0.09904676794757224, "grad_norm": 3.380542516708374, "learning_rate": 2.47244563598451e-06, "loss": 0.9169, "num_input_tokens_seen": 387296, "step": 665 }, { "epoch": 0.09979148048853143, "grad_norm": 3.8783583641052246, "learning_rate": 2.49106344950849e-06, "loss": 1.2166, "num_input_tokens_seen": 390656, "step": 670 }, { "epoch": 0.10053619302949061, "grad_norm": 4.1778035163879395, "learning_rate": 2.5096812630324695e-06, "loss": 1.1392, "num_input_tokens_seen": 393568, "step": 675 }, { "epoch": 0.1012809055704498, "grad_norm": 3.325423002243042, "learning_rate": 2.5282990765564494e-06, "loss": 0.7388, "num_input_tokens_seen": 396320, "step": 680 }, { "epoch": 0.10202561811140899, "grad_norm": 2.936307907104492, "learning_rate": 2.546916890080429e-06, "loss": 1.1571, "num_input_tokens_seen": 399328, "step": 685 }, { "epoch": 0.10277033065236818, "grad_norm": 5.219481945037842, "learning_rate": 2.565534703604409e-06, "loss": 1.146, "num_input_tokens_seen": 402176, "step": 690 }, { "epoch": 0.10351504319332737, "grad_norm": 4.153476238250732, "learning_rate": 2.5841525171283884e-06, "loss": 1.1596, "num_input_tokens_seen": 405088, "step": 695 }, { "epoch": 0.10425975573428657, "grad_norm": 5.321521759033203, "learning_rate": 2.6027703306523683e-06, "loss": 1.0694, "num_input_tokens_seen": 408288, "step": 700 }, { "epoch": 0.10500446827524576, "grad_norm": 3.1302683353424072, "learning_rate": 2.621388144176348e-06, "loss": 1.2119, "num_input_tokens_seen": 411488, "step": 705 }, { "epoch": 0.10574918081620495, "grad_norm": 4.151510715484619, "learning_rate": 2.6400059577003277e-06, "loss": 1.0914, "num_input_tokens_seen": 414272, "step": 710 }, { "epoch": 0.10649389335716414, "grad_norm": 3.521610736846924, "learning_rate": 2.6586237712243077e-06, "loss": 1.3718, "num_input_tokens_seen": 417184, "step": 715 }, { "epoch": 0.10723860589812333, "grad_norm": 7.504931449890137, "learning_rate": 2.6772415847482876e-06, "loss": 1.5052, "num_input_tokens_seen": 420096, "step": 720 }, { "epoch": 0.10798331843908252, "grad_norm": 5.981927871704102, "learning_rate": 2.695859398272267e-06, "loss": 1.1, "num_input_tokens_seen": 422944, "step": 725 }, { "epoch": 0.1087280309800417, "grad_norm": 2.2889535427093506, "learning_rate": 2.714477211796247e-06, "loss": 1.0472, "num_input_tokens_seen": 425824, "step": 730 }, { "epoch": 0.10947274352100089, "grad_norm": 3.375128746032715, "learning_rate": 2.7330950253202265e-06, "loss": 0.9236, "num_input_tokens_seen": 428832, "step": 735 }, { "epoch": 0.11021745606196008, "grad_norm": 6.099715232849121, "learning_rate": 2.7517128388442064e-06, "loss": 1.822, "num_input_tokens_seen": 432032, "step": 740 }, { "epoch": 0.11096216860291927, "grad_norm": 3.7251009941101074, "learning_rate": 2.770330652368186e-06, "loss": 1.3096, "num_input_tokens_seen": 434752, "step": 745 }, { "epoch": 0.11170688114387846, "grad_norm": 5.9435553550720215, "learning_rate": 2.788948465892166e-06, "loss": 1.2654, "num_input_tokens_seen": 437568, "step": 750 }, { "epoch": 0.11245159368483765, "grad_norm": 4.627353191375732, "learning_rate": 2.8075662794161454e-06, "loss": 1.001, "num_input_tokens_seen": 440736, "step": 755 }, { "epoch": 0.11319630622579684, "grad_norm": 4.120850563049316, "learning_rate": 2.8261840929401253e-06, "loss": 1.0267, "num_input_tokens_seen": 443456, "step": 760 }, { "epoch": 0.11394101876675604, "grad_norm": 2.925396680831909, "learning_rate": 2.844801906464105e-06, "loss": 1.2063, "num_input_tokens_seen": 446464, "step": 765 }, { "epoch": 0.11468573130771523, "grad_norm": 2.874056100845337, "learning_rate": 2.8634197199880847e-06, "loss": 1.1722, "num_input_tokens_seen": 449216, "step": 770 }, { "epoch": 0.11543044384867442, "grad_norm": 3.222374439239502, "learning_rate": 2.8820375335120642e-06, "loss": 0.9591, "num_input_tokens_seen": 452064, "step": 775 }, { "epoch": 0.1161751563896336, "grad_norm": 4.575685024261475, "learning_rate": 2.9006553470360446e-06, "loss": 1.513, "num_input_tokens_seen": 454912, "step": 780 }, { "epoch": 0.1169198689305928, "grad_norm": 5.4289231300354, "learning_rate": 2.919273160560024e-06, "loss": 1.4472, "num_input_tokens_seen": 457984, "step": 785 }, { "epoch": 0.11766458147155198, "grad_norm": 5.987713813781738, "learning_rate": 2.937890974084004e-06, "loss": 1.349, "num_input_tokens_seen": 460704, "step": 790 }, { "epoch": 0.11840929401251117, "grad_norm": 6.89257287979126, "learning_rate": 2.9565087876079835e-06, "loss": 1.3356, "num_input_tokens_seen": 463552, "step": 795 }, { "epoch": 0.11915400655347036, "grad_norm": 7.8299970626831055, "learning_rate": 2.9751266011319634e-06, "loss": 1.2199, "num_input_tokens_seen": 466400, "step": 800 }, { "epoch": 0.11989871909442955, "grad_norm": 4.755053520202637, "learning_rate": 2.993744414655943e-06, "loss": 1.2578, "num_input_tokens_seen": 469312, "step": 805 }, { "epoch": 0.12064343163538874, "grad_norm": 3.5088727474212646, "learning_rate": 3.012362228179923e-06, "loss": 1.0648, "num_input_tokens_seen": 472256, "step": 810 }, { "epoch": 0.12138814417634793, "grad_norm": 9.076359748840332, "learning_rate": 3.0309800417039024e-06, "loss": 0.9563, "num_input_tokens_seen": 475104, "step": 815 }, { "epoch": 0.12213285671730711, "grad_norm": 4.465707302093506, "learning_rate": 3.0495978552278823e-06, "loss": 1.4468, "num_input_tokens_seen": 477952, "step": 820 }, { "epoch": 0.1228775692582663, "grad_norm": 2.4944396018981934, "learning_rate": 3.068215668751862e-06, "loss": 1.5512, "num_input_tokens_seen": 480704, "step": 825 }, { "epoch": 0.1236222817992255, "grad_norm": 4.666263580322266, "learning_rate": 3.0868334822758417e-06, "loss": 1.1518, "num_input_tokens_seen": 483680, "step": 830 }, { "epoch": 0.1243669943401847, "grad_norm": 5.195840358734131, "learning_rate": 3.1054512957998212e-06, "loss": 1.4324, "num_input_tokens_seen": 486720, "step": 835 }, { "epoch": 0.12511170688114387, "grad_norm": 5.533604621887207, "learning_rate": 3.124069109323801e-06, "loss": 1.1174, "num_input_tokens_seen": 489536, "step": 840 }, { "epoch": 0.12585641942210307, "grad_norm": 5.370842456817627, "learning_rate": 3.142686922847781e-06, "loss": 1.1043, "num_input_tokens_seen": 492256, "step": 845 }, { "epoch": 0.12660113196306225, "grad_norm": 3.7240633964538574, "learning_rate": 3.1613047363717606e-06, "loss": 1.0661, "num_input_tokens_seen": 495168, "step": 850 }, { "epoch": 0.12734584450402145, "grad_norm": 3.851127862930298, "learning_rate": 3.1799225498957405e-06, "loss": 1.2769, "num_input_tokens_seen": 498240, "step": 855 }, { "epoch": 0.12809055704498062, "grad_norm": 4.088725566864014, "learning_rate": 3.19854036341972e-06, "loss": 0.9476, "num_input_tokens_seen": 501216, "step": 860 }, { "epoch": 0.12883526958593983, "grad_norm": 5.208695888519287, "learning_rate": 3.2171581769437e-06, "loss": 1.319, "num_input_tokens_seen": 504352, "step": 865 }, { "epoch": 0.12957998212689903, "grad_norm": 4.503544807434082, "learning_rate": 3.2357759904676794e-06, "loss": 1.3217, "num_input_tokens_seen": 507168, "step": 870 }, { "epoch": 0.1303246946678582, "grad_norm": 5.913437843322754, "learning_rate": 3.2543938039916594e-06, "loss": 1.2704, "num_input_tokens_seen": 510112, "step": 875 }, { "epoch": 0.1310694072088174, "grad_norm": 3.8106729984283447, "learning_rate": 3.2730116175156393e-06, "loss": 0.9632, "num_input_tokens_seen": 512896, "step": 880 }, { "epoch": 0.13181411974977658, "grad_norm": 3.176953077316284, "learning_rate": 3.291629431039619e-06, "loss": 1.3209, "num_input_tokens_seen": 515744, "step": 885 }, { "epoch": 0.13255883229073578, "grad_norm": 2.4244043827056885, "learning_rate": 3.3102472445635987e-06, "loss": 1.4807, "num_input_tokens_seen": 518624, "step": 890 }, { "epoch": 0.13330354483169496, "grad_norm": 4.957663059234619, "learning_rate": 3.3288650580875782e-06, "loss": 1.3402, "num_input_tokens_seen": 521824, "step": 895 }, { "epoch": 0.13404825737265416, "grad_norm": 2.374913215637207, "learning_rate": 3.347482871611558e-06, "loss": 1.1142, "num_input_tokens_seen": 524800, "step": 900 }, { "epoch": 0.13479296991361334, "grad_norm": 2.403346300125122, "learning_rate": 3.3661006851355377e-06, "loss": 0.7676, "num_input_tokens_seen": 527712, "step": 905 }, { "epoch": 0.13553768245457254, "grad_norm": 2.7889842987060547, "learning_rate": 3.3847184986595176e-06, "loss": 1.0829, "num_input_tokens_seen": 531200, "step": 910 }, { "epoch": 0.1362823949955317, "grad_norm": 4.226366996765137, "learning_rate": 3.403336312183497e-06, "loss": 1.1611, "num_input_tokens_seen": 534080, "step": 915 }, { "epoch": 0.13702710753649092, "grad_norm": 2.4442055225372314, "learning_rate": 3.421954125707477e-06, "loss": 0.8325, "num_input_tokens_seen": 536800, "step": 920 }, { "epoch": 0.1377718200774501, "grad_norm": 6.069619178771973, "learning_rate": 3.4405719392314565e-06, "loss": 1.1416, "num_input_tokens_seen": 539904, "step": 925 }, { "epoch": 0.1385165326184093, "grad_norm": 3.734894037246704, "learning_rate": 3.4591897527554364e-06, "loss": 1.0498, "num_input_tokens_seen": 542560, "step": 930 }, { "epoch": 0.1392612451593685, "grad_norm": 4.921815872192383, "learning_rate": 3.477807566279416e-06, "loss": 1.4129, "num_input_tokens_seen": 545696, "step": 935 }, { "epoch": 0.14000595770032767, "grad_norm": 5.230145454406738, "learning_rate": 3.496425379803396e-06, "loss": 1.2528, "num_input_tokens_seen": 548640, "step": 940 }, { "epoch": 0.14075067024128687, "grad_norm": 3.312917947769165, "learning_rate": 3.5150431933273762e-06, "loss": 0.8015, "num_input_tokens_seen": 551296, "step": 945 }, { "epoch": 0.14149538278224605, "grad_norm": 2.5974411964416504, "learning_rate": 3.5336610068513553e-06, "loss": 1.0014, "num_input_tokens_seen": 554176, "step": 950 }, { "epoch": 0.14224009532320525, "grad_norm": 2.3032946586608887, "learning_rate": 3.5522788203753356e-06, "loss": 1.0612, "num_input_tokens_seen": 556928, "step": 955 }, { "epoch": 0.14298480786416443, "grad_norm": 3.544867515563965, "learning_rate": 3.570896633899315e-06, "loss": 1.0176, "num_input_tokens_seen": 560096, "step": 960 }, { "epoch": 0.14372952040512363, "grad_norm": 3.888812303543091, "learning_rate": 3.589514447423295e-06, "loss": 1.3021, "num_input_tokens_seen": 563104, "step": 965 }, { "epoch": 0.1444742329460828, "grad_norm": 2.5690393447875977, "learning_rate": 3.6081322609472746e-06, "loss": 1.0481, "num_input_tokens_seen": 566048, "step": 970 }, { "epoch": 0.145218945487042, "grad_norm": 4.813748836517334, "learning_rate": 3.6267500744712545e-06, "loss": 1.1964, "num_input_tokens_seen": 568544, "step": 975 }, { "epoch": 0.14596365802800118, "grad_norm": 6.905218124389648, "learning_rate": 3.645367887995234e-06, "loss": 1.3739, "num_input_tokens_seen": 571616, "step": 980 }, { "epoch": 0.14670837056896038, "grad_norm": 6.298303127288818, "learning_rate": 3.663985701519214e-06, "loss": 1.1534, "num_input_tokens_seen": 574496, "step": 985 }, { "epoch": 0.14745308310991956, "grad_norm": 4.304102897644043, "learning_rate": 3.6826035150431934e-06, "loss": 1.1594, "num_input_tokens_seen": 577280, "step": 990 }, { "epoch": 0.14819779565087876, "grad_norm": 4.083035945892334, "learning_rate": 3.7012213285671734e-06, "loss": 1.1382, "num_input_tokens_seen": 580096, "step": 995 }, { "epoch": 0.14894250819183796, "grad_norm": 4.823789119720459, "learning_rate": 3.719839142091153e-06, "loss": 1.2027, "num_input_tokens_seen": 583040, "step": 1000 }, { "epoch": 0.14968722073279714, "grad_norm": 4.2687153816223145, "learning_rate": 3.738456955615133e-06, "loss": 1.0283, "num_input_tokens_seen": 586016, "step": 1005 }, { "epoch": 0.15043193327375634, "grad_norm": 3.3241329193115234, "learning_rate": 3.7570747691391127e-06, "loss": 1.3989, "num_input_tokens_seen": 589056, "step": 1010 }, { "epoch": 0.15117664581471552, "grad_norm": 4.989994525909424, "learning_rate": 3.7756925826630922e-06, "loss": 1.5612, "num_input_tokens_seen": 591936, "step": 1015 }, { "epoch": 0.15192135835567472, "grad_norm": 3.707167863845825, "learning_rate": 3.794310396187072e-06, "loss": 1.0226, "num_input_tokens_seen": 594848, "step": 1020 }, { "epoch": 0.1526660708966339, "grad_norm": 6.685235023498535, "learning_rate": 3.8129282097110517e-06, "loss": 1.3576, "num_input_tokens_seen": 597792, "step": 1025 }, { "epoch": 0.1534107834375931, "grad_norm": 4.877352714538574, "learning_rate": 3.831546023235032e-06, "loss": 1.5729, "num_input_tokens_seen": 600768, "step": 1030 }, { "epoch": 0.15415549597855227, "grad_norm": 5.42733907699585, "learning_rate": 3.850163836759011e-06, "loss": 1.2406, "num_input_tokens_seen": 603616, "step": 1035 }, { "epoch": 0.15490020851951147, "grad_norm": 5.0593132972717285, "learning_rate": 3.8687816502829914e-06, "loss": 1.2319, "num_input_tokens_seen": 606432, "step": 1040 }, { "epoch": 0.15564492106047065, "grad_norm": 4.798023700714111, "learning_rate": 3.8873994638069705e-06, "loss": 1.2702, "num_input_tokens_seen": 609536, "step": 1045 }, { "epoch": 0.15638963360142985, "grad_norm": 2.9069886207580566, "learning_rate": 3.9060172773309504e-06, "loss": 1.0255, "num_input_tokens_seen": 612256, "step": 1050 }, { "epoch": 0.15713434614238903, "grad_norm": 2.1971654891967773, "learning_rate": 3.92463509085493e-06, "loss": 0.8952, "num_input_tokens_seen": 615136, "step": 1055 }, { "epoch": 0.15787905868334823, "grad_norm": 2.16636323928833, "learning_rate": 3.94325290437891e-06, "loss": 1.0355, "num_input_tokens_seen": 618144, "step": 1060 }, { "epoch": 0.15862377122430743, "grad_norm": 6.315716743469238, "learning_rate": 3.961870717902889e-06, "loss": 1.3264, "num_input_tokens_seen": 621440, "step": 1065 }, { "epoch": 0.1593684837652666, "grad_norm": 4.566975116729736, "learning_rate": 3.980488531426869e-06, "loss": 1.1514, "num_input_tokens_seen": 624352, "step": 1070 }, { "epoch": 0.1601131963062258, "grad_norm": 5.466822147369385, "learning_rate": 3.999106344950849e-06, "loss": 1.3711, "num_input_tokens_seen": 627392, "step": 1075 }, { "epoch": 0.16085790884718498, "grad_norm": 5.531928062438965, "learning_rate": 4.017724158474829e-06, "loss": 1.5571, "num_input_tokens_seen": 630368, "step": 1080 }, { "epoch": 0.16160262138814419, "grad_norm": 3.650953531265259, "learning_rate": 4.036341971998809e-06, "loss": 1.4015, "num_input_tokens_seen": 633504, "step": 1085 }, { "epoch": 0.16234733392910336, "grad_norm": 5.64626932144165, "learning_rate": 4.054959785522788e-06, "loss": 0.9495, "num_input_tokens_seen": 636640, "step": 1090 }, { "epoch": 0.16309204647006256, "grad_norm": 4.353403568267822, "learning_rate": 4.073577599046768e-06, "loss": 1.1681, "num_input_tokens_seen": 639744, "step": 1095 }, { "epoch": 0.16383675901102174, "grad_norm": 3.220083475112915, "learning_rate": 4.092195412570748e-06, "loss": 1.3029, "num_input_tokens_seen": 642464, "step": 1100 }, { "epoch": 0.16458147155198094, "grad_norm": 2.2999939918518066, "learning_rate": 4.110813226094728e-06, "loss": 1.2022, "num_input_tokens_seen": 645472, "step": 1105 }, { "epoch": 0.16532618409294011, "grad_norm": 5.150084018707275, "learning_rate": 4.129431039618707e-06, "loss": 1.13, "num_input_tokens_seen": 648320, "step": 1110 }, { "epoch": 0.16607089663389932, "grad_norm": 4.461713790893555, "learning_rate": 4.148048853142687e-06, "loss": 0.9956, "num_input_tokens_seen": 651168, "step": 1115 }, { "epoch": 0.1668156091748585, "grad_norm": 3.205512523651123, "learning_rate": 4.166666666666667e-06, "loss": 1.3606, "num_input_tokens_seen": 654144, "step": 1120 }, { "epoch": 0.1675603217158177, "grad_norm": 4.1704511642456055, "learning_rate": 4.185284480190647e-06, "loss": 1.1003, "num_input_tokens_seen": 656928, "step": 1125 }, { "epoch": 0.1683050342567769, "grad_norm": 4.133433818817139, "learning_rate": 4.203902293714626e-06, "loss": 1.093, "num_input_tokens_seen": 659392, "step": 1130 }, { "epoch": 0.16904974679773607, "grad_norm": 2.0351665019989014, "learning_rate": 4.222520107238606e-06, "loss": 0.9126, "num_input_tokens_seen": 662464, "step": 1135 }, { "epoch": 0.16979445933869527, "grad_norm": 6.294524192810059, "learning_rate": 4.241137920762586e-06, "loss": 1.1134, "num_input_tokens_seen": 665408, "step": 1140 }, { "epoch": 0.17053917187965445, "grad_norm": 3.487893581390381, "learning_rate": 4.259755734286566e-06, "loss": 1.1989, "num_input_tokens_seen": 668320, "step": 1145 }, { "epoch": 0.17128388442061365, "grad_norm": 3.7062249183654785, "learning_rate": 4.278373547810546e-06, "loss": 1.192, "num_input_tokens_seen": 671168, "step": 1150 }, { "epoch": 0.17202859696157283, "grad_norm": 4.371123790740967, "learning_rate": 4.296991361334525e-06, "loss": 1.3329, "num_input_tokens_seen": 674368, "step": 1155 }, { "epoch": 0.17277330950253203, "grad_norm": 3.8677048683166504, "learning_rate": 4.3156091748585054e-06, "loss": 1.0263, "num_input_tokens_seen": 677056, "step": 1160 }, { "epoch": 0.1735180220434912, "grad_norm": 3.3615572452545166, "learning_rate": 4.3342269883824845e-06, "loss": 0.8676, "num_input_tokens_seen": 680000, "step": 1165 }, { "epoch": 0.1742627345844504, "grad_norm": 2.090850830078125, "learning_rate": 4.3528448019064644e-06, "loss": 0.9762, "num_input_tokens_seen": 682752, "step": 1170 }, { "epoch": 0.17500744712540958, "grad_norm": 2.0665810108184814, "learning_rate": 4.3714626154304435e-06, "loss": 0.9893, "num_input_tokens_seen": 685600, "step": 1175 }, { "epoch": 0.17575215966636878, "grad_norm": 3.871007204055786, "learning_rate": 4.390080428954424e-06, "loss": 0.9839, "num_input_tokens_seen": 688448, "step": 1180 }, { "epoch": 0.17649687220732796, "grad_norm": 4.789863109588623, "learning_rate": 4.408698242478403e-06, "loss": 1.0809, "num_input_tokens_seen": 691328, "step": 1185 }, { "epoch": 0.17724158474828716, "grad_norm": 1.6741219758987427, "learning_rate": 4.427316056002383e-06, "loss": 1.0121, "num_input_tokens_seen": 694208, "step": 1190 }, { "epoch": 0.17798629728924636, "grad_norm": 4.779961109161377, "learning_rate": 4.445933869526362e-06, "loss": 1.3999, "num_input_tokens_seen": 696928, "step": 1195 }, { "epoch": 0.17873100983020554, "grad_norm": 4.646459102630615, "learning_rate": 4.464551683050343e-06, "loss": 1.3284, "num_input_tokens_seen": 699776, "step": 1200 }, { "epoch": 0.17947572237116474, "grad_norm": 4.781398296356201, "learning_rate": 4.483169496574322e-06, "loss": 1.1445, "num_input_tokens_seen": 702688, "step": 1205 }, { "epoch": 0.18022043491212392, "grad_norm": 4.448430061340332, "learning_rate": 4.501787310098302e-06, "loss": 1.0633, "num_input_tokens_seen": 705536, "step": 1210 }, { "epoch": 0.18096514745308312, "grad_norm": 5.171677112579346, "learning_rate": 4.520405123622282e-06, "loss": 1.3464, "num_input_tokens_seen": 708544, "step": 1215 }, { "epoch": 0.1817098599940423, "grad_norm": 3.5719072818756104, "learning_rate": 4.539022937146262e-06, "loss": 1.0619, "num_input_tokens_seen": 711200, "step": 1220 }, { "epoch": 0.1824545725350015, "grad_norm": 3.5361976623535156, "learning_rate": 4.557640750670242e-06, "loss": 1.192, "num_input_tokens_seen": 714176, "step": 1225 }, { "epoch": 0.18319928507596067, "grad_norm": 3.3758037090301514, "learning_rate": 4.576258564194221e-06, "loss": 0.8675, "num_input_tokens_seen": 716896, "step": 1230 }, { "epoch": 0.18394399761691987, "grad_norm": 3.040863513946533, "learning_rate": 4.594876377718201e-06, "loss": 0.8505, "num_input_tokens_seen": 719680, "step": 1235 }, { "epoch": 0.18468871015787905, "grad_norm": 4.2554931640625, "learning_rate": 4.613494191242181e-06, "loss": 1.1135, "num_input_tokens_seen": 722336, "step": 1240 }, { "epoch": 0.18543342269883825, "grad_norm": 2.9195773601531982, "learning_rate": 4.632112004766161e-06, "loss": 0.8139, "num_input_tokens_seen": 725472, "step": 1245 }, { "epoch": 0.18617813523979743, "grad_norm": 2.2607421875, "learning_rate": 4.65072981829014e-06, "loss": 1.0027, "num_input_tokens_seen": 728512, "step": 1250 }, { "epoch": 0.18692284778075663, "grad_norm": 2.0067479610443115, "learning_rate": 4.66934763181412e-06, "loss": 0.8921, "num_input_tokens_seen": 731424, "step": 1255 }, { "epoch": 0.1876675603217158, "grad_norm": 1.2973779439926147, "learning_rate": 4.6879654453381e-06, "loss": 0.9198, "num_input_tokens_seen": 734368, "step": 1260 }, { "epoch": 0.188412272862675, "grad_norm": 4.573488712310791, "learning_rate": 4.70658325886208e-06, "loss": 1.336, "num_input_tokens_seen": 737408, "step": 1265 }, { "epoch": 0.1891569854036342, "grad_norm": 6.921488285064697, "learning_rate": 4.725201072386059e-06, "loss": 1.0799, "num_input_tokens_seen": 740576, "step": 1270 }, { "epoch": 0.18990169794459338, "grad_norm": 3.414719820022583, "learning_rate": 4.743818885910039e-06, "loss": 1.2077, "num_input_tokens_seen": 743200, "step": 1275 }, { "epoch": 0.1906464104855526, "grad_norm": 3.011674165725708, "learning_rate": 4.7624366994340194e-06, "loss": 0.8113, "num_input_tokens_seen": 746176, "step": 1280 }, { "epoch": 0.19139112302651176, "grad_norm": 2.737560749053955, "learning_rate": 4.7810545129579985e-06, "loss": 0.8745, "num_input_tokens_seen": 749152, "step": 1285 }, { "epoch": 0.19213583556747096, "grad_norm": 1.789839267730713, "learning_rate": 4.7996723264819784e-06, "loss": 0.814, "num_input_tokens_seen": 752128, "step": 1290 }, { "epoch": 0.19288054810843014, "grad_norm": 3.740668296813965, "learning_rate": 4.8182901400059575e-06, "loss": 0.7733, "num_input_tokens_seen": 755072, "step": 1295 }, { "epoch": 0.19362526064938934, "grad_norm": 2.4523777961730957, "learning_rate": 4.836907953529938e-06, "loss": 0.9658, "num_input_tokens_seen": 758016, "step": 1300 }, { "epoch": 0.19436997319034852, "grad_norm": 3.597930669784546, "learning_rate": 4.855525767053917e-06, "loss": 0.9166, "num_input_tokens_seen": 760928, "step": 1305 }, { "epoch": 0.19511468573130772, "grad_norm": 1.569560170173645, "learning_rate": 4.874143580577897e-06, "loss": 0.7799, "num_input_tokens_seen": 763776, "step": 1310 }, { "epoch": 0.1958593982722669, "grad_norm": 2.4613494873046875, "learning_rate": 4.892761394101876e-06, "loss": 0.7477, "num_input_tokens_seen": 766432, "step": 1315 }, { "epoch": 0.1966041108132261, "grad_norm": 3.8612332344055176, "learning_rate": 4.911379207625857e-06, "loss": 1.0943, "num_input_tokens_seen": 769280, "step": 1320 }, { "epoch": 0.19734882335418527, "grad_norm": 1.6533459424972534, "learning_rate": 4.929997021149836e-06, "loss": 0.8041, "num_input_tokens_seen": 772224, "step": 1325 }, { "epoch": 0.19809353589514447, "grad_norm": 2.54742431640625, "learning_rate": 4.948614834673816e-06, "loss": 0.9791, "num_input_tokens_seen": 774944, "step": 1330 }, { "epoch": 0.19883824843610368, "grad_norm": 1.8585518598556519, "learning_rate": 4.967232648197795e-06, "loss": 0.9006, "num_input_tokens_seen": 777664, "step": 1335 }, { "epoch": 0.19958296097706285, "grad_norm": 5.564144611358643, "learning_rate": 4.985850461721776e-06, "loss": 1.0741, "num_input_tokens_seen": 780480, "step": 1340 }, { "epoch": 0.20032767351802205, "grad_norm": 2.337695837020874, "learning_rate": 5.004468275245756e-06, "loss": 0.9224, "num_input_tokens_seen": 783296, "step": 1345 }, { "epoch": 0.20107238605898123, "grad_norm": 3.1221916675567627, "learning_rate": 5.023086088769735e-06, "loss": 0.833, "num_input_tokens_seen": 786016, "step": 1350 }, { "epoch": 0.20181709859994043, "grad_norm": 1.7560042142868042, "learning_rate": 5.041703902293715e-06, "loss": 0.6179, "num_input_tokens_seen": 788960, "step": 1355 }, { "epoch": 0.2025618111408996, "grad_norm": 2.8151473999023438, "learning_rate": 5.060321715817695e-06, "loss": 0.8228, "num_input_tokens_seen": 791936, "step": 1360 }, { "epoch": 0.2033065236818588, "grad_norm": 1.2527474164962769, "learning_rate": 5.078939529341675e-06, "loss": 1.0837, "num_input_tokens_seen": 794720, "step": 1365 }, { "epoch": 0.20405123622281798, "grad_norm": 1.3958572149276733, "learning_rate": 5.097557342865654e-06, "loss": 0.6454, "num_input_tokens_seen": 797376, "step": 1370 }, { "epoch": 0.20479594876377719, "grad_norm": 2.669590711593628, "learning_rate": 5.116175156389634e-06, "loss": 0.8067, "num_input_tokens_seen": 800192, "step": 1375 }, { "epoch": 0.20554066130473636, "grad_norm": 2.68503475189209, "learning_rate": 5.134792969913614e-06, "loss": 1.0173, "num_input_tokens_seen": 803104, "step": 1380 }, { "epoch": 0.20628537384569556, "grad_norm": 2.2689075469970703, "learning_rate": 5.153410783437594e-06, "loss": 0.9664, "num_input_tokens_seen": 806496, "step": 1385 }, { "epoch": 0.20703008638665474, "grad_norm": 1.9469488859176636, "learning_rate": 5.172028596961573e-06, "loss": 0.9054, "num_input_tokens_seen": 809344, "step": 1390 }, { "epoch": 0.20777479892761394, "grad_norm": 2.208940267562866, "learning_rate": 5.190646410485553e-06, "loss": 0.8519, "num_input_tokens_seen": 812256, "step": 1395 }, { "epoch": 0.20851951146857314, "grad_norm": 1.8173909187316895, "learning_rate": 5.209264224009533e-06, "loss": 0.9498, "num_input_tokens_seen": 814816, "step": 1400 }, { "epoch": 0.20926422400953232, "grad_norm": 4.024951934814453, "learning_rate": 5.2278820375335125e-06, "loss": 0.8678, "num_input_tokens_seen": 817440, "step": 1405 }, { "epoch": 0.21000893655049152, "grad_norm": 1.740728497505188, "learning_rate": 5.2464998510574924e-06, "loss": 0.9797, "num_input_tokens_seen": 820640, "step": 1410 }, { "epoch": 0.2107536490914507, "grad_norm": 1.8490926027297974, "learning_rate": 5.2651176645814715e-06, "loss": 0.7466, "num_input_tokens_seen": 823488, "step": 1415 }, { "epoch": 0.2114983616324099, "grad_norm": 1.1268441677093506, "learning_rate": 5.2837354781054514e-06, "loss": 0.9326, "num_input_tokens_seen": 826240, "step": 1420 }, { "epoch": 0.21224307417336907, "grad_norm": 1.5419509410858154, "learning_rate": 5.302353291629431e-06, "loss": 0.812, "num_input_tokens_seen": 828992, "step": 1425 }, { "epoch": 0.21298778671432828, "grad_norm": 2.1566343307495117, "learning_rate": 5.320971105153411e-06, "loss": 0.8176, "num_input_tokens_seen": 831648, "step": 1430 }, { "epoch": 0.21373249925528745, "grad_norm": 1.3511950969696045, "learning_rate": 5.33958891867739e-06, "loss": 0.8049, "num_input_tokens_seen": 834336, "step": 1435 }, { "epoch": 0.21447721179624665, "grad_norm": 2.2728662490844727, "learning_rate": 5.35820673220137e-06, "loss": 1.0181, "num_input_tokens_seen": 837536, "step": 1440 }, { "epoch": 0.21522192433720583, "grad_norm": 1.8930604457855225, "learning_rate": 5.37682454572535e-06, "loss": 0.8952, "num_input_tokens_seen": 841088, "step": 1445 }, { "epoch": 0.21596663687816503, "grad_norm": 3.882507562637329, "learning_rate": 5.39544235924933e-06, "loss": 0.9391, "num_input_tokens_seen": 844000, "step": 1450 }, { "epoch": 0.2167113494191242, "grad_norm": 1.5084308385849, "learning_rate": 5.414060172773309e-06, "loss": 0.8445, "num_input_tokens_seen": 846912, "step": 1455 }, { "epoch": 0.2174560619600834, "grad_norm": 1.9142967462539673, "learning_rate": 5.43267798629729e-06, "loss": 0.8764, "num_input_tokens_seen": 849952, "step": 1460 }, { "epoch": 0.2182007745010426, "grad_norm": 1.8960736989974976, "learning_rate": 5.451295799821269e-06, "loss": 0.8825, "num_input_tokens_seen": 852768, "step": 1465 }, { "epoch": 0.21894548704200179, "grad_norm": 2.089672803878784, "learning_rate": 5.469913613345249e-06, "loss": 0.9587, "num_input_tokens_seen": 855808, "step": 1470 }, { "epoch": 0.219690199582961, "grad_norm": 2.7318007946014404, "learning_rate": 5.488531426869229e-06, "loss": 0.9417, "num_input_tokens_seen": 858624, "step": 1475 }, { "epoch": 0.22043491212392016, "grad_norm": 3.6314728260040283, "learning_rate": 5.507149240393209e-06, "loss": 0.9584, "num_input_tokens_seen": 861600, "step": 1480 }, { "epoch": 0.22117962466487937, "grad_norm": 1.5361987352371216, "learning_rate": 5.525767053917189e-06, "loss": 0.7601, "num_input_tokens_seen": 864768, "step": 1485 }, { "epoch": 0.22192433720583854, "grad_norm": 3.5853967666625977, "learning_rate": 5.544384867441168e-06, "loss": 0.9486, "num_input_tokens_seen": 867776, "step": 1490 }, { "epoch": 0.22266904974679774, "grad_norm": 0.9458749890327454, "learning_rate": 5.563002680965148e-06, "loss": 0.5884, "num_input_tokens_seen": 871104, "step": 1495 }, { "epoch": 0.22341376228775692, "grad_norm": 2.9670143127441406, "learning_rate": 5.581620494489128e-06, "loss": 0.8163, "num_input_tokens_seen": 874304, "step": 1500 }, { "epoch": 0.22415847482871612, "grad_norm": 1.8501492738723755, "learning_rate": 5.600238308013108e-06, "loss": 1.0126, "num_input_tokens_seen": 877184, "step": 1505 }, { "epoch": 0.2249031873696753, "grad_norm": 1.4858020544052124, "learning_rate": 5.618856121537087e-06, "loss": 0.8438, "num_input_tokens_seen": 880128, "step": 1510 }, { "epoch": 0.2256478999106345, "grad_norm": 1.3511018753051758, "learning_rate": 5.637473935061067e-06, "loss": 0.7447, "num_input_tokens_seen": 883328, "step": 1515 }, { "epoch": 0.22639261245159367, "grad_norm": 2.6578738689422607, "learning_rate": 5.656091748585047e-06, "loss": 0.9806, "num_input_tokens_seen": 886304, "step": 1520 }, { "epoch": 0.22713732499255287, "grad_norm": 2.120285749435425, "learning_rate": 5.6747095621090265e-06, "loss": 1.0807, "num_input_tokens_seen": 889280, "step": 1525 }, { "epoch": 0.22788203753351208, "grad_norm": 1.3676351308822632, "learning_rate": 5.693327375633006e-06, "loss": 0.722, "num_input_tokens_seen": 892000, "step": 1530 }, { "epoch": 0.22862675007447125, "grad_norm": 3.1576757431030273, "learning_rate": 5.7119451891569855e-06, "loss": 0.9068, "num_input_tokens_seen": 894912, "step": 1535 }, { "epoch": 0.22937146261543045, "grad_norm": 1.108577847480774, "learning_rate": 5.7305630026809654e-06, "loss": 0.7577, "num_input_tokens_seen": 897600, "step": 1540 }, { "epoch": 0.23011617515638963, "grad_norm": 1.472694993019104, "learning_rate": 5.749180816204945e-06, "loss": 1.2097, "num_input_tokens_seen": 900800, "step": 1545 }, { "epoch": 0.23086088769734883, "grad_norm": 4.3501667976379395, "learning_rate": 5.767798629728925e-06, "loss": 0.8957, "num_input_tokens_seen": 903712, "step": 1550 }, { "epoch": 0.231605600238308, "grad_norm": 3.5790696144104004, "learning_rate": 5.786416443252904e-06, "loss": 1.1268, "num_input_tokens_seen": 906560, "step": 1555 }, { "epoch": 0.2323503127792672, "grad_norm": 2.117215871810913, "learning_rate": 5.805034256776884e-06, "loss": 1.0721, "num_input_tokens_seen": 909344, "step": 1560 }, { "epoch": 0.23309502532022638, "grad_norm": 2.173712730407715, "learning_rate": 5.823652070300864e-06, "loss": 1.0202, "num_input_tokens_seen": 911968, "step": 1565 }, { "epoch": 0.2338397378611856, "grad_norm": 4.353397846221924, "learning_rate": 5.842269883824844e-06, "loss": 0.8209, "num_input_tokens_seen": 914816, "step": 1570 }, { "epoch": 0.23458445040214476, "grad_norm": 2.607712745666504, "learning_rate": 5.860887697348823e-06, "loss": 0.6848, "num_input_tokens_seen": 917696, "step": 1575 }, { "epoch": 0.23532916294310396, "grad_norm": 4.5794148445129395, "learning_rate": 5.879505510872803e-06, "loss": 1.0153, "num_input_tokens_seen": 920992, "step": 1580 }, { "epoch": 0.23607387548406314, "grad_norm": 1.831196904182434, "learning_rate": 5.898123324396783e-06, "loss": 0.6559, "num_input_tokens_seen": 923968, "step": 1585 }, { "epoch": 0.23681858802502234, "grad_norm": 1.674731731414795, "learning_rate": 5.916741137920763e-06, "loss": 0.8914, "num_input_tokens_seen": 926656, "step": 1590 }, { "epoch": 0.23756330056598154, "grad_norm": 1.2813236713409424, "learning_rate": 5.935358951444742e-06, "loss": 0.8077, "num_input_tokens_seen": 929696, "step": 1595 }, { "epoch": 0.23830801310694072, "grad_norm": 1.2951805591583252, "learning_rate": 5.953976764968722e-06, "loss": 0.8415, "num_input_tokens_seen": 932608, "step": 1600 }, { "epoch": 0.23905272564789992, "grad_norm": 1.1656330823898315, "learning_rate": 5.972594578492702e-06, "loss": 0.8298, "num_input_tokens_seen": 935424, "step": 1605 }, { "epoch": 0.2397974381888591, "grad_norm": 1.6610596179962158, "learning_rate": 5.991212392016682e-06, "loss": 0.6576, "num_input_tokens_seen": 938240, "step": 1610 }, { "epoch": 0.2405421507298183, "grad_norm": 1.832188606262207, "learning_rate": 6.009830205540662e-06, "loss": 0.5455, "num_input_tokens_seen": 941152, "step": 1615 }, { "epoch": 0.24128686327077747, "grad_norm": 1.6313422918319702, "learning_rate": 6.028448019064641e-06, "loss": 0.8617, "num_input_tokens_seen": 943936, "step": 1620 }, { "epoch": 0.24203157581173668, "grad_norm": 2.8191394805908203, "learning_rate": 6.047065832588622e-06, "loss": 0.8887, "num_input_tokens_seen": 946944, "step": 1625 }, { "epoch": 0.24277628835269585, "grad_norm": 1.505018711090088, "learning_rate": 6.065683646112601e-06, "loss": 0.6671, "num_input_tokens_seen": 949696, "step": 1630 }, { "epoch": 0.24352100089365505, "grad_norm": 1.925840973854065, "learning_rate": 6.084301459636581e-06, "loss": 0.651, "num_input_tokens_seen": 952608, "step": 1635 }, { "epoch": 0.24426571343461423, "grad_norm": 1.0662744045257568, "learning_rate": 6.10291927316056e-06, "loss": 0.8597, "num_input_tokens_seen": 955712, "step": 1640 }, { "epoch": 0.24501042597557343, "grad_norm": 2.0644869804382324, "learning_rate": 6.1215370866845405e-06, "loss": 0.6783, "num_input_tokens_seen": 959392, "step": 1645 }, { "epoch": 0.2457551385165326, "grad_norm": 1.6901899576187134, "learning_rate": 6.14015490020852e-06, "loss": 0.8576, "num_input_tokens_seen": 962464, "step": 1650 }, { "epoch": 0.2464998510574918, "grad_norm": 2.0858585834503174, "learning_rate": 6.1587727137324995e-06, "loss": 0.9779, "num_input_tokens_seen": 965408, "step": 1655 }, { "epoch": 0.247244563598451, "grad_norm": 2.2598953247070312, "learning_rate": 6.1773905272564794e-06, "loss": 0.8833, "num_input_tokens_seen": 968288, "step": 1660 }, { "epoch": 0.2479892761394102, "grad_norm": 1.199305534362793, "learning_rate": 6.196008340780459e-06, "loss": 0.611, "num_input_tokens_seen": 971168, "step": 1665 }, { "epoch": 0.2487339886803694, "grad_norm": 1.471063494682312, "learning_rate": 6.2146261543044384e-06, "loss": 0.8508, "num_input_tokens_seen": 974016, "step": 1670 }, { "epoch": 0.24947870122132856, "grad_norm": 3.24381685256958, "learning_rate": 6.233243967828418e-06, "loss": 0.9507, "num_input_tokens_seen": 977056, "step": 1675 }, { "epoch": 0.25022341376228774, "grad_norm": 1.8426152467727661, "learning_rate": 6.251861781352398e-06, "loss": 0.7504, "num_input_tokens_seen": 980384, "step": 1680 }, { "epoch": 0.25096812630324694, "grad_norm": 1.3172024488449097, "learning_rate": 6.270479594876379e-06, "loss": 0.8156, "num_input_tokens_seen": 983264, "step": 1685 }, { "epoch": 0.25171283884420614, "grad_norm": 2.46494722366333, "learning_rate": 6.289097408400357e-06, "loss": 0.9036, "num_input_tokens_seen": 986496, "step": 1690 }, { "epoch": 0.25245755138516535, "grad_norm": 2.2367467880249023, "learning_rate": 6.307715221924337e-06, "loss": 0.9504, "num_input_tokens_seen": 989248, "step": 1695 }, { "epoch": 0.2532022639261245, "grad_norm": 2.633582592010498, "learning_rate": 6.326333035448317e-06, "loss": 1.08, "num_input_tokens_seen": 992000, "step": 1700 }, { "epoch": 0.2539469764670837, "grad_norm": 2.215968132019043, "learning_rate": 6.344950848972298e-06, "loss": 0.9253, "num_input_tokens_seen": 994784, "step": 1705 }, { "epoch": 0.2546916890080429, "grad_norm": 1.2791651487350464, "learning_rate": 6.363568662496276e-06, "loss": 0.7652, "num_input_tokens_seen": 997696, "step": 1710 }, { "epoch": 0.2554364015490021, "grad_norm": 5.001497745513916, "learning_rate": 6.382186476020256e-06, "loss": 0.942, "num_input_tokens_seen": 1000416, "step": 1715 }, { "epoch": 0.25618111408996125, "grad_norm": 1.22700035572052, "learning_rate": 6.400804289544236e-06, "loss": 0.6539, "num_input_tokens_seen": 1002976, "step": 1720 }, { "epoch": 0.25692582663092045, "grad_norm": 1.6145871877670288, "learning_rate": 6.419422103068217e-06, "loss": 0.8479, "num_input_tokens_seen": 1006048, "step": 1725 }, { "epoch": 0.25767053917187965, "grad_norm": 1.5091429948806763, "learning_rate": 6.438039916592195e-06, "loss": 0.8018, "num_input_tokens_seen": 1009184, "step": 1730 }, { "epoch": 0.25841525171283886, "grad_norm": 1.6996792554855347, "learning_rate": 6.456657730116175e-06, "loss": 0.7309, "num_input_tokens_seen": 1011904, "step": 1735 }, { "epoch": 0.25915996425379806, "grad_norm": 1.508298635482788, "learning_rate": 6.475275543640155e-06, "loss": 0.7577, "num_input_tokens_seen": 1014432, "step": 1740 }, { "epoch": 0.2599046767947572, "grad_norm": 2.7686972618103027, "learning_rate": 6.493893357164136e-06, "loss": 0.9819, "num_input_tokens_seen": 1017376, "step": 1745 }, { "epoch": 0.2606493893357164, "grad_norm": 1.8218746185302734, "learning_rate": 6.5125111706881156e-06, "loss": 0.8241, "num_input_tokens_seen": 1020192, "step": 1750 }, { "epoch": 0.2613941018766756, "grad_norm": 1.404852032661438, "learning_rate": 6.531128984212094e-06, "loss": 0.8144, "num_input_tokens_seen": 1023360, "step": 1755 }, { "epoch": 0.2621388144176348, "grad_norm": 1.3104546070098877, "learning_rate": 6.549746797736074e-06, "loss": 0.956, "num_input_tokens_seen": 1026208, "step": 1760 }, { "epoch": 0.26288352695859396, "grad_norm": 1.4312180280685425, "learning_rate": 6.5683646112600545e-06, "loss": 0.7147, "num_input_tokens_seen": 1029024, "step": 1765 }, { "epoch": 0.26362823949955316, "grad_norm": 2.317688465118408, "learning_rate": 6.5869824247840344e-06, "loss": 0.829, "num_input_tokens_seen": 1032064, "step": 1770 }, { "epoch": 0.26437295204051237, "grad_norm": 1.6936208009719849, "learning_rate": 6.605600238308013e-06, "loss": 0.675, "num_input_tokens_seen": 1034880, "step": 1775 }, { "epoch": 0.26511766458147157, "grad_norm": 1.868006706237793, "learning_rate": 6.624218051831993e-06, "loss": 0.8096, "num_input_tokens_seen": 1037696, "step": 1780 }, { "epoch": 0.2658623771224307, "grad_norm": 1.2643306255340576, "learning_rate": 6.642835865355973e-06, "loss": 0.6628, "num_input_tokens_seen": 1040448, "step": 1785 }, { "epoch": 0.2666070896633899, "grad_norm": 1.2180570363998413, "learning_rate": 6.661453678879953e-06, "loss": 0.9224, "num_input_tokens_seen": 1043104, "step": 1790 }, { "epoch": 0.2673518022043491, "grad_norm": 1.817516565322876, "learning_rate": 6.6800714924039315e-06, "loss": 0.7662, "num_input_tokens_seen": 1045824, "step": 1795 }, { "epoch": 0.2680965147453083, "grad_norm": 1.828444480895996, "learning_rate": 6.6986893059279114e-06, "loss": 1.062, "num_input_tokens_seen": 1048672, "step": 1800 }, { "epoch": 0.2688412272862675, "grad_norm": 1.2498120069503784, "learning_rate": 6.717307119451892e-06, "loss": 0.7912, "num_input_tokens_seen": 1051360, "step": 1805 }, { "epoch": 0.2695859398272267, "grad_norm": 2.0421414375305176, "learning_rate": 6.735924932975872e-06, "loss": 0.7859, "num_input_tokens_seen": 1054304, "step": 1810 }, { "epoch": 0.2703306523681859, "grad_norm": 1.222965121269226, "learning_rate": 6.754542746499852e-06, "loss": 0.7135, "num_input_tokens_seen": 1057952, "step": 1815 }, { "epoch": 0.2710753649091451, "grad_norm": 1.6140923500061035, "learning_rate": 6.77316056002383e-06, "loss": 0.7456, "num_input_tokens_seen": 1060704, "step": 1820 }, { "epoch": 0.2718200774501043, "grad_norm": 2.5804691314697266, "learning_rate": 6.791778373547811e-06, "loss": 0.6368, "num_input_tokens_seen": 1063360, "step": 1825 }, { "epoch": 0.2725647899910634, "grad_norm": 2.049485683441162, "learning_rate": 6.810396187071791e-06, "loss": 1.0635, "num_input_tokens_seen": 1066112, "step": 1830 }, { "epoch": 0.27330950253202263, "grad_norm": 1.1180497407913208, "learning_rate": 6.829014000595771e-06, "loss": 0.9614, "num_input_tokens_seen": 1068992, "step": 1835 }, { "epoch": 0.27405421507298183, "grad_norm": 1.1272625923156738, "learning_rate": 6.847631814119749e-06, "loss": 0.6345, "num_input_tokens_seen": 1071872, "step": 1840 }, { "epoch": 0.27479892761394104, "grad_norm": 1.2537180185317993, "learning_rate": 6.86624962764373e-06, "loss": 0.6518, "num_input_tokens_seen": 1074752, "step": 1845 }, { "epoch": 0.2755436401549002, "grad_norm": 4.0649542808532715, "learning_rate": 6.88486744116771e-06, "loss": 0.8972, "num_input_tokens_seen": 1077728, "step": 1850 }, { "epoch": 0.2762883526958594, "grad_norm": 1.9732966423034668, "learning_rate": 6.90348525469169e-06, "loss": 0.9852, "num_input_tokens_seen": 1080512, "step": 1855 }, { "epoch": 0.2770330652368186, "grad_norm": 1.9460111856460571, "learning_rate": 6.922103068215669e-06, "loss": 0.8705, "num_input_tokens_seen": 1083424, "step": 1860 }, { "epoch": 0.2777777777777778, "grad_norm": 1.5316709280014038, "learning_rate": 6.940720881739649e-06, "loss": 0.8059, "num_input_tokens_seen": 1086208, "step": 1865 }, { "epoch": 0.278522490318737, "grad_norm": 2.3212170600891113, "learning_rate": 6.959338695263629e-06, "loss": 0.7347, "num_input_tokens_seen": 1089152, "step": 1870 }, { "epoch": 0.27926720285969614, "grad_norm": 1.2166106700897217, "learning_rate": 6.977956508787609e-06, "loss": 0.5297, "num_input_tokens_seen": 1092000, "step": 1875 }, { "epoch": 0.28001191540065534, "grad_norm": 3.446941614151001, "learning_rate": 6.9965743223115886e-06, "loss": 0.9111, "num_input_tokens_seen": 1095008, "step": 1880 }, { "epoch": 0.28075662794161454, "grad_norm": 3.011988639831543, "learning_rate": 7.015192135835568e-06, "loss": 0.6667, "num_input_tokens_seen": 1098080, "step": 1885 }, { "epoch": 0.28150134048257375, "grad_norm": 1.6476571559906006, "learning_rate": 7.033809949359548e-06, "loss": 0.8501, "num_input_tokens_seen": 1100736, "step": 1890 }, { "epoch": 0.2822460530235329, "grad_norm": 1.2539979219436646, "learning_rate": 7.0524277628835275e-06, "loss": 0.8958, "num_input_tokens_seen": 1103744, "step": 1895 }, { "epoch": 0.2829907655644921, "grad_norm": 2.6569886207580566, "learning_rate": 7.0710455764075074e-06, "loss": 0.9087, "num_input_tokens_seen": 1106304, "step": 1900 }, { "epoch": 0.2837354781054513, "grad_norm": 1.801713466644287, "learning_rate": 7.0896633899314865e-06, "loss": 0.9553, "num_input_tokens_seen": 1109472, "step": 1905 }, { "epoch": 0.2844801906464105, "grad_norm": 3.1823670864105225, "learning_rate": 7.1082812034554664e-06, "loss": 0.9927, "num_input_tokens_seen": 1112192, "step": 1910 }, { "epoch": 0.28522490318736965, "grad_norm": 1.19089674949646, "learning_rate": 7.126899016979446e-06, "loss": 0.8329, "num_input_tokens_seen": 1115168, "step": 1915 }, { "epoch": 0.28596961572832885, "grad_norm": 2.962311267852783, "learning_rate": 7.145516830503426e-06, "loss": 0.8227, "num_input_tokens_seen": 1118016, "step": 1920 }, { "epoch": 0.28671432826928805, "grad_norm": 2.443284273147583, "learning_rate": 7.164134644027405e-06, "loss": 0.9107, "num_input_tokens_seen": 1120704, "step": 1925 }, { "epoch": 0.28745904081024726, "grad_norm": 1.8428343534469604, "learning_rate": 7.182752457551385e-06, "loss": 0.671, "num_input_tokens_seen": 1123552, "step": 1930 }, { "epoch": 0.28820375335120646, "grad_norm": 1.5920943021774292, "learning_rate": 7.201370271075365e-06, "loss": 0.8496, "num_input_tokens_seen": 1126368, "step": 1935 }, { "epoch": 0.2889484658921656, "grad_norm": 1.444139003753662, "learning_rate": 7.219988084599345e-06, "loss": 0.5826, "num_input_tokens_seen": 1129312, "step": 1940 }, { "epoch": 0.2896931784331248, "grad_norm": 2.2103796005249023, "learning_rate": 7.238605898123325e-06, "loss": 0.8657, "num_input_tokens_seen": 1132256, "step": 1945 }, { "epoch": 0.290437890974084, "grad_norm": 1.138819932937622, "learning_rate": 7.257223711647304e-06, "loss": 0.6962, "num_input_tokens_seen": 1135008, "step": 1950 }, { "epoch": 0.2911826035150432, "grad_norm": 1.7011805772781372, "learning_rate": 7.275841525171284e-06, "loss": 0.8646, "num_input_tokens_seen": 1137728, "step": 1955 }, { "epoch": 0.29192731605600236, "grad_norm": 2.082810640335083, "learning_rate": 7.294459338695264e-06, "loss": 1.0854, "num_input_tokens_seen": 1140512, "step": 1960 }, { "epoch": 0.29267202859696156, "grad_norm": 2.11747407913208, "learning_rate": 7.313077152219244e-06, "loss": 0.9017, "num_input_tokens_seen": 1143616, "step": 1965 }, { "epoch": 0.29341674113792077, "grad_norm": 2.318066120147705, "learning_rate": 7.331694965743223e-06, "loss": 0.8331, "num_input_tokens_seen": 1146496, "step": 1970 }, { "epoch": 0.29416145367887997, "grad_norm": 1.7689933776855469, "learning_rate": 7.350312779267203e-06, "loss": 1.0079, "num_input_tokens_seen": 1149600, "step": 1975 }, { "epoch": 0.2949061662198391, "grad_norm": 1.819512963294983, "learning_rate": 7.368930592791183e-06, "loss": 0.7849, "num_input_tokens_seen": 1152320, "step": 1980 }, { "epoch": 0.2956508787607983, "grad_norm": 1.4357765913009644, "learning_rate": 7.387548406315163e-06, "loss": 0.7298, "num_input_tokens_seen": 1155648, "step": 1985 }, { "epoch": 0.2963955913017575, "grad_norm": 1.3277618885040283, "learning_rate": 7.406166219839142e-06, "loss": 0.7318, "num_input_tokens_seen": 1158336, "step": 1990 }, { "epoch": 0.2971403038427167, "grad_norm": 1.3160343170166016, "learning_rate": 7.424784033363122e-06, "loss": 0.8667, "num_input_tokens_seen": 1161280, "step": 1995 }, { "epoch": 0.2978850163836759, "grad_norm": 1.111945629119873, "learning_rate": 7.443401846887102e-06, "loss": 0.6388, "num_input_tokens_seen": 1164064, "step": 2000 }, { "epoch": 0.2986297289246351, "grad_norm": 1.0759544372558594, "learning_rate": 7.462019660411082e-06, "loss": 0.7363, "num_input_tokens_seen": 1166880, "step": 2005 }, { "epoch": 0.2993744414655943, "grad_norm": 1.9260283708572388, "learning_rate": 7.480637473935062e-06, "loss": 0.9513, "num_input_tokens_seen": 1169792, "step": 2010 }, { "epoch": 0.3001191540065535, "grad_norm": 1.6126545667648315, "learning_rate": 7.499255287459041e-06, "loss": 0.6825, "num_input_tokens_seen": 1172608, "step": 2015 }, { "epoch": 0.3008638665475127, "grad_norm": 1.1053402423858643, "learning_rate": 7.517873100983021e-06, "loss": 0.7272, "num_input_tokens_seen": 1175168, "step": 2020 }, { "epoch": 0.30160857908847183, "grad_norm": 1.6804510354995728, "learning_rate": 7.5364909145070005e-06, "loss": 0.7886, "num_input_tokens_seen": 1178016, "step": 2025 }, { "epoch": 0.30235329162943103, "grad_norm": 1.5024374723434448, "learning_rate": 7.555108728030981e-06, "loss": 0.87, "num_input_tokens_seen": 1180800, "step": 2030 }, { "epoch": 0.30309800417039023, "grad_norm": 2.296454429626465, "learning_rate": 7.5737265415549595e-06, "loss": 1.1332, "num_input_tokens_seen": 1183776, "step": 2035 }, { "epoch": 0.30384271671134944, "grad_norm": 1.9136425256729126, "learning_rate": 7.5923443550789394e-06, "loss": 0.7672, "num_input_tokens_seen": 1186880, "step": 2040 }, { "epoch": 0.3045874292523086, "grad_norm": 1.1794923543930054, "learning_rate": 7.610962168602919e-06, "loss": 0.9119, "num_input_tokens_seen": 1189856, "step": 2045 }, { "epoch": 0.3053321417932678, "grad_norm": 1.9668488502502441, "learning_rate": 7.6295799821269e-06, "loss": 0.7101, "num_input_tokens_seen": 1192736, "step": 2050 }, { "epoch": 0.306076854334227, "grad_norm": 1.0352853536605835, "learning_rate": 7.64819779565088e-06, "loss": 0.6526, "num_input_tokens_seen": 1195808, "step": 2055 }, { "epoch": 0.3068215668751862, "grad_norm": 1.325402855873108, "learning_rate": 7.666815609174859e-06, "loss": 0.6526, "num_input_tokens_seen": 1198656, "step": 2060 }, { "epoch": 0.3075662794161454, "grad_norm": 1.6897698640823364, "learning_rate": 7.685433422698839e-06, "loss": 0.7228, "num_input_tokens_seen": 1201664, "step": 2065 }, { "epoch": 0.30831099195710454, "grad_norm": 1.6984574794769287, "learning_rate": 7.704051236222819e-06, "loss": 0.739, "num_input_tokens_seen": 1204384, "step": 2070 }, { "epoch": 0.30905570449806374, "grad_norm": 2.311335563659668, "learning_rate": 7.722669049746799e-06, "loss": 0.795, "num_input_tokens_seen": 1207360, "step": 2075 }, { "epoch": 0.30980041703902295, "grad_norm": 1.7598222494125366, "learning_rate": 7.741286863270777e-06, "loss": 0.7386, "num_input_tokens_seen": 1210144, "step": 2080 }, { "epoch": 0.31054512957998215, "grad_norm": 0.9383725523948669, "learning_rate": 7.759904676794757e-06, "loss": 0.8389, "num_input_tokens_seen": 1213088, "step": 2085 }, { "epoch": 0.3112898421209413, "grad_norm": 1.575223445892334, "learning_rate": 7.778522490318737e-06, "loss": 0.7201, "num_input_tokens_seen": 1216416, "step": 2090 }, { "epoch": 0.3120345546619005, "grad_norm": 1.8564280271530151, "learning_rate": 7.797140303842717e-06, "loss": 0.8237, "num_input_tokens_seen": 1219424, "step": 2095 }, { "epoch": 0.3127792672028597, "grad_norm": 2.4677302837371826, "learning_rate": 7.815758117366697e-06, "loss": 0.7784, "num_input_tokens_seen": 1222528, "step": 2100 }, { "epoch": 0.3135239797438189, "grad_norm": 1.2712091207504272, "learning_rate": 7.834375930890677e-06, "loss": 0.8423, "num_input_tokens_seen": 1225408, "step": 2105 }, { "epoch": 0.31426869228477805, "grad_norm": 1.44298255443573, "learning_rate": 7.852993744414657e-06, "loss": 0.9778, "num_input_tokens_seen": 1228384, "step": 2110 }, { "epoch": 0.31501340482573725, "grad_norm": 1.6518528461456299, "learning_rate": 7.871611557938637e-06, "loss": 0.709, "num_input_tokens_seen": 1230880, "step": 2115 }, { "epoch": 0.31575811736669646, "grad_norm": 1.3215601444244385, "learning_rate": 7.890229371462615e-06, "loss": 0.8609, "num_input_tokens_seen": 1233920, "step": 2120 }, { "epoch": 0.31650282990765566, "grad_norm": 1.5923945903778076, "learning_rate": 7.908847184986595e-06, "loss": 0.7381, "num_input_tokens_seen": 1236704, "step": 2125 }, { "epoch": 0.31724754244861486, "grad_norm": 1.57099449634552, "learning_rate": 7.927464998510575e-06, "loss": 0.6714, "num_input_tokens_seen": 1239616, "step": 2130 }, { "epoch": 0.317992254989574, "grad_norm": 1.1229532957077026, "learning_rate": 7.946082812034555e-06, "loss": 0.7549, "num_input_tokens_seen": 1242464, "step": 2135 }, { "epoch": 0.3187369675305332, "grad_norm": 1.7400884628295898, "learning_rate": 7.964700625558536e-06, "loss": 0.9281, "num_input_tokens_seen": 1245440, "step": 2140 }, { "epoch": 0.3194816800714924, "grad_norm": 1.7106796503067017, "learning_rate": 7.983318439082515e-06, "loss": 0.834, "num_input_tokens_seen": 1248416, "step": 2145 }, { "epoch": 0.3202263926124516, "grad_norm": 1.38954496383667, "learning_rate": 8.001936252606494e-06, "loss": 0.693, "num_input_tokens_seen": 1251008, "step": 2150 }, { "epoch": 0.32097110515341076, "grad_norm": 1.322581171989441, "learning_rate": 8.020554066130474e-06, "loss": 0.763, "num_input_tokens_seen": 1253888, "step": 2155 }, { "epoch": 0.32171581769436997, "grad_norm": 1.3481465578079224, "learning_rate": 8.039171879654454e-06, "loss": 0.5699, "num_input_tokens_seen": 1256512, "step": 2160 }, { "epoch": 0.32246053023532917, "grad_norm": 1.3585723638534546, "learning_rate": 8.057789693178433e-06, "loss": 0.8676, "num_input_tokens_seen": 1259232, "step": 2165 }, { "epoch": 0.32320524277628837, "grad_norm": 1.318018913269043, "learning_rate": 8.076407506702412e-06, "loss": 0.6781, "num_input_tokens_seen": 1261952, "step": 2170 }, { "epoch": 0.3239499553172475, "grad_norm": 1.684404730796814, "learning_rate": 8.095025320226392e-06, "loss": 0.8188, "num_input_tokens_seen": 1264704, "step": 2175 }, { "epoch": 0.3246946678582067, "grad_norm": 1.601052165031433, "learning_rate": 8.113643133750374e-06, "loss": 0.6939, "num_input_tokens_seen": 1267424, "step": 2180 }, { "epoch": 0.3254393803991659, "grad_norm": 1.5644097328186035, "learning_rate": 8.132260947274352e-06, "loss": 0.7054, "num_input_tokens_seen": 1270496, "step": 2185 }, { "epoch": 0.3261840929401251, "grad_norm": 1.739795446395874, "learning_rate": 8.150878760798332e-06, "loss": 0.6152, "num_input_tokens_seen": 1273248, "step": 2190 }, { "epoch": 0.32692880548108433, "grad_norm": 1.2880733013153076, "learning_rate": 8.169496574322312e-06, "loss": 0.6454, "num_input_tokens_seen": 1276288, "step": 2195 }, { "epoch": 0.3276735180220435, "grad_norm": 2.6219136714935303, "learning_rate": 8.188114387846292e-06, "loss": 0.7866, "num_input_tokens_seen": 1279584, "step": 2200 }, { "epoch": 0.3284182305630027, "grad_norm": 2.8870761394500732, "learning_rate": 8.206732201370272e-06, "loss": 1.106, "num_input_tokens_seen": 1282656, "step": 2205 }, { "epoch": 0.3291629431039619, "grad_norm": 1.7137027978897095, "learning_rate": 8.22535001489425e-06, "loss": 0.8277, "num_input_tokens_seen": 1285568, "step": 2210 }, { "epoch": 0.3299076556449211, "grad_norm": 1.3544594049453735, "learning_rate": 8.24396782841823e-06, "loss": 0.8899, "num_input_tokens_seen": 1288448, "step": 2215 }, { "epoch": 0.33065236818588023, "grad_norm": 1.151637077331543, "learning_rate": 8.262585641942212e-06, "loss": 0.679, "num_input_tokens_seen": 1291264, "step": 2220 }, { "epoch": 0.33139708072683943, "grad_norm": 1.1262234449386597, "learning_rate": 8.281203455466192e-06, "loss": 0.7066, "num_input_tokens_seen": 1294240, "step": 2225 }, { "epoch": 0.33214179326779864, "grad_norm": 1.3164881467819214, "learning_rate": 8.29982126899017e-06, "loss": 0.8106, "num_input_tokens_seen": 1296992, "step": 2230 }, { "epoch": 0.33288650580875784, "grad_norm": 0.8128387928009033, "learning_rate": 8.31843908251415e-06, "loss": 0.8496, "num_input_tokens_seen": 1299808, "step": 2235 }, { "epoch": 0.333631218349717, "grad_norm": 1.0105576515197754, "learning_rate": 8.33705689603813e-06, "loss": 0.6392, "num_input_tokens_seen": 1302816, "step": 2240 }, { "epoch": 0.3343759308906762, "grad_norm": 1.8528896570205688, "learning_rate": 8.35567470956211e-06, "loss": 0.8104, "num_input_tokens_seen": 1305600, "step": 2245 }, { "epoch": 0.3351206434316354, "grad_norm": 1.2903165817260742, "learning_rate": 8.374292523086088e-06, "loss": 0.5924, "num_input_tokens_seen": 1309088, "step": 2250 }, { "epoch": 0.3358653559725946, "grad_norm": 2.46720814704895, "learning_rate": 8.392910336610068e-06, "loss": 0.8171, "num_input_tokens_seen": 1311968, "step": 2255 }, { "epoch": 0.3366100685135538, "grad_norm": 1.4885683059692383, "learning_rate": 8.41152815013405e-06, "loss": 0.819, "num_input_tokens_seen": 1315040, "step": 2260 }, { "epoch": 0.33735478105451294, "grad_norm": 1.6476901769638062, "learning_rate": 8.43014596365803e-06, "loss": 0.7396, "num_input_tokens_seen": 1318304, "step": 2265 }, { "epoch": 0.33809949359547214, "grad_norm": 1.5111632347106934, "learning_rate": 8.44876377718201e-06, "loss": 0.8406, "num_input_tokens_seen": 1320960, "step": 2270 }, { "epoch": 0.33884420613643135, "grad_norm": 1.7110440731048584, "learning_rate": 8.467381590705988e-06, "loss": 0.8806, "num_input_tokens_seen": 1323712, "step": 2275 }, { "epoch": 0.33958891867739055, "grad_norm": 1.5025529861450195, "learning_rate": 8.485999404229967e-06, "loss": 0.7731, "num_input_tokens_seen": 1326752, "step": 2280 }, { "epoch": 0.3403336312183497, "grad_norm": 1.1091848611831665, "learning_rate": 8.504617217753947e-06, "loss": 0.6077, "num_input_tokens_seen": 1329600, "step": 2285 }, { "epoch": 0.3410783437593089, "grad_norm": 1.113959789276123, "learning_rate": 8.523235031277927e-06, "loss": 0.8485, "num_input_tokens_seen": 1332448, "step": 2290 }, { "epoch": 0.3418230563002681, "grad_norm": 1.318873643875122, "learning_rate": 8.541852844801907e-06, "loss": 0.6466, "num_input_tokens_seen": 1335072, "step": 2295 }, { "epoch": 0.3425677688412273, "grad_norm": 1.0015020370483398, "learning_rate": 8.560470658325887e-06, "loss": 0.8337, "num_input_tokens_seen": 1337856, "step": 2300 }, { "epoch": 0.34331248138218645, "grad_norm": 1.3514775037765503, "learning_rate": 8.579088471849867e-06, "loss": 0.773, "num_input_tokens_seen": 1340896, "step": 2305 }, { "epoch": 0.34405719392314565, "grad_norm": 1.019507646560669, "learning_rate": 8.597706285373847e-06, "loss": 0.7703, "num_input_tokens_seen": 1343840, "step": 2310 }, { "epoch": 0.34480190646410486, "grad_norm": 1.3414556980133057, "learning_rate": 8.616324098897825e-06, "loss": 0.7776, "num_input_tokens_seen": 1346688, "step": 2315 }, { "epoch": 0.34554661900506406, "grad_norm": 1.0721497535705566, "learning_rate": 8.634941912421805e-06, "loss": 0.7794, "num_input_tokens_seen": 1349312, "step": 2320 }, { "epoch": 0.34629133154602326, "grad_norm": 1.4961419105529785, "learning_rate": 8.653559725945785e-06, "loss": 0.6349, "num_input_tokens_seen": 1352320, "step": 2325 }, { "epoch": 0.3470360440869824, "grad_norm": 1.0181649923324585, "learning_rate": 8.672177539469765e-06, "loss": 0.7988, "num_input_tokens_seen": 1355168, "step": 2330 }, { "epoch": 0.3477807566279416, "grad_norm": 0.930562436580658, "learning_rate": 8.690795352993745e-06, "loss": 0.7356, "num_input_tokens_seen": 1358112, "step": 2335 }, { "epoch": 0.3485254691689008, "grad_norm": 1.113263726234436, "learning_rate": 8.709413166517725e-06, "loss": 0.8729, "num_input_tokens_seen": 1360992, "step": 2340 }, { "epoch": 0.34927018170986, "grad_norm": 3.09352445602417, "learning_rate": 8.728030980041705e-06, "loss": 0.9308, "num_input_tokens_seen": 1363744, "step": 2345 }, { "epoch": 0.35001489425081916, "grad_norm": 1.2722022533416748, "learning_rate": 8.746648793565685e-06, "loss": 0.9955, "num_input_tokens_seen": 1366848, "step": 2350 }, { "epoch": 0.35075960679177837, "grad_norm": 0.9390389323234558, "learning_rate": 8.765266607089665e-06, "loss": 0.6318, "num_input_tokens_seen": 1369536, "step": 2355 }, { "epoch": 0.35150431933273757, "grad_norm": 1.8550686836242676, "learning_rate": 8.783884420613643e-06, "loss": 0.7291, "num_input_tokens_seen": 1372064, "step": 2360 }, { "epoch": 0.35224903187369677, "grad_norm": 0.8922443985939026, "learning_rate": 8.802502234137623e-06, "loss": 0.711, "num_input_tokens_seen": 1375136, "step": 2365 }, { "epoch": 0.3529937444146559, "grad_norm": 1.3344990015029907, "learning_rate": 8.821120047661603e-06, "loss": 0.9946, "num_input_tokens_seen": 1377952, "step": 2370 }, { "epoch": 0.3537384569556151, "grad_norm": 2.3072638511657715, "learning_rate": 8.839737861185583e-06, "loss": 0.7531, "num_input_tokens_seen": 1380864, "step": 2375 }, { "epoch": 0.3544831694965743, "grad_norm": 1.5941203832626343, "learning_rate": 8.858355674709563e-06, "loss": 0.7615, "num_input_tokens_seen": 1384000, "step": 2380 }, { "epoch": 0.3552278820375335, "grad_norm": 1.09621262550354, "learning_rate": 8.876973488233543e-06, "loss": 0.5375, "num_input_tokens_seen": 1387232, "step": 2385 }, { "epoch": 0.35597259457849273, "grad_norm": 2.52889084815979, "learning_rate": 8.895591301757522e-06, "loss": 0.8771, "num_input_tokens_seen": 1390112, "step": 2390 }, { "epoch": 0.3567173071194519, "grad_norm": 1.4475032091140747, "learning_rate": 8.914209115281502e-06, "loss": 0.9302, "num_input_tokens_seen": 1393472, "step": 2395 }, { "epoch": 0.3574620196604111, "grad_norm": 0.9513954520225525, "learning_rate": 8.93282692880548e-06, "loss": 0.5934, "num_input_tokens_seen": 1396288, "step": 2400 }, { "epoch": 0.3582067322013703, "grad_norm": 6.5080084800720215, "learning_rate": 8.95144474232946e-06, "loss": 0.8778, "num_input_tokens_seen": 1399360, "step": 2405 }, { "epoch": 0.3589514447423295, "grad_norm": 1.080478549003601, "learning_rate": 8.97006255585344e-06, "loss": 1.0348, "num_input_tokens_seen": 1402528, "step": 2410 }, { "epoch": 0.35969615728328863, "grad_norm": 0.8418897390365601, "learning_rate": 8.98868036937742e-06, "loss": 0.6441, "num_input_tokens_seen": 1405472, "step": 2415 }, { "epoch": 0.36044086982424783, "grad_norm": 1.1863175630569458, "learning_rate": 9.0072981829014e-06, "loss": 0.7073, "num_input_tokens_seen": 1408512, "step": 2420 }, { "epoch": 0.36118558236520704, "grad_norm": 1.8516162633895874, "learning_rate": 9.02591599642538e-06, "loss": 1.0279, "num_input_tokens_seen": 1411264, "step": 2425 }, { "epoch": 0.36193029490616624, "grad_norm": 1.39749276638031, "learning_rate": 9.04453380994936e-06, "loss": 0.9804, "num_input_tokens_seen": 1413792, "step": 2430 }, { "epoch": 0.3626750074471254, "grad_norm": 1.316960334777832, "learning_rate": 9.06315162347334e-06, "loss": 0.7551, "num_input_tokens_seen": 1416800, "step": 2435 }, { "epoch": 0.3634197199880846, "grad_norm": 1.175498604774475, "learning_rate": 9.08176943699732e-06, "loss": 0.7941, "num_input_tokens_seen": 1419552, "step": 2440 }, { "epoch": 0.3641644325290438, "grad_norm": 1.9055023193359375, "learning_rate": 9.100387250521298e-06, "loss": 0.5968, "num_input_tokens_seen": 1422272, "step": 2445 }, { "epoch": 0.364909145070003, "grad_norm": 1.5793323516845703, "learning_rate": 9.119005064045278e-06, "loss": 0.8107, "num_input_tokens_seen": 1425120, "step": 2450 }, { "epoch": 0.3656538576109622, "grad_norm": 2.1768882274627686, "learning_rate": 9.137622877569258e-06, "loss": 0.7912, "num_input_tokens_seen": 1427840, "step": 2455 }, { "epoch": 0.36639857015192134, "grad_norm": 1.1703392267227173, "learning_rate": 9.156240691093238e-06, "loss": 0.6421, "num_input_tokens_seen": 1430944, "step": 2460 }, { "epoch": 0.36714328269288055, "grad_norm": 1.050952672958374, "learning_rate": 9.174858504617218e-06, "loss": 0.7216, "num_input_tokens_seen": 1433792, "step": 2465 }, { "epoch": 0.36788799523383975, "grad_norm": 1.2009414434432983, "learning_rate": 9.193476318141198e-06, "loss": 0.689, "num_input_tokens_seen": 1436704, "step": 2470 }, { "epoch": 0.36863270777479895, "grad_norm": 1.793312668800354, "learning_rate": 9.212094131665178e-06, "loss": 0.7906, "num_input_tokens_seen": 1439360, "step": 2475 }, { "epoch": 0.3693774203157581, "grad_norm": 1.1401499509811401, "learning_rate": 9.230711945189158e-06, "loss": 0.6694, "num_input_tokens_seen": 1442016, "step": 2480 }, { "epoch": 0.3701221328567173, "grad_norm": 1.185157299041748, "learning_rate": 9.249329758713138e-06, "loss": 0.8128, "num_input_tokens_seen": 1444800, "step": 2485 }, { "epoch": 0.3708668453976765, "grad_norm": 1.7875691652297974, "learning_rate": 9.267947572237116e-06, "loss": 0.8465, "num_input_tokens_seen": 1447488, "step": 2490 }, { "epoch": 0.3716115579386357, "grad_norm": 1.0807828903198242, "learning_rate": 9.286565385761096e-06, "loss": 0.8774, "num_input_tokens_seen": 1450144, "step": 2495 }, { "epoch": 0.37235627047959485, "grad_norm": 0.9528098702430725, "learning_rate": 9.305183199285077e-06, "loss": 0.7808, "num_input_tokens_seen": 1453184, "step": 2500 }, { "epoch": 0.37310098302055406, "grad_norm": 1.4371073246002197, "learning_rate": 9.323801012809057e-06, "loss": 0.7173, "num_input_tokens_seen": 1456128, "step": 2505 }, { "epoch": 0.37384569556151326, "grad_norm": 0.9309978485107422, "learning_rate": 9.342418826333036e-06, "loss": 0.6442, "num_input_tokens_seen": 1459168, "step": 2510 }, { "epoch": 0.37459040810247246, "grad_norm": 1.2402350902557373, "learning_rate": 9.361036639857016e-06, "loss": 0.7839, "num_input_tokens_seen": 1462080, "step": 2515 }, { "epoch": 0.3753351206434316, "grad_norm": 1.5235214233398438, "learning_rate": 9.379654453380995e-06, "loss": 0.749, "num_input_tokens_seen": 1464896, "step": 2520 }, { "epoch": 0.3760798331843908, "grad_norm": 1.594736099243164, "learning_rate": 9.398272266904975e-06, "loss": 0.8984, "num_input_tokens_seen": 1467648, "step": 2525 }, { "epoch": 0.37682454572535, "grad_norm": 1.309902310371399, "learning_rate": 9.416890080428954e-06, "loss": 0.7305, "num_input_tokens_seen": 1470464, "step": 2530 }, { "epoch": 0.3775692582663092, "grad_norm": 1.2765631675720215, "learning_rate": 9.435507893952934e-06, "loss": 0.6057, "num_input_tokens_seen": 1473248, "step": 2535 }, { "epoch": 0.3783139708072684, "grad_norm": 1.4661785364151, "learning_rate": 9.454125707476915e-06, "loss": 0.755, "num_input_tokens_seen": 1476256, "step": 2540 }, { "epoch": 0.37905868334822757, "grad_norm": 1.0592116117477417, "learning_rate": 9.472743521000895e-06, "loss": 0.8283, "num_input_tokens_seen": 1479136, "step": 2545 }, { "epoch": 0.37980339588918677, "grad_norm": 1.57676100730896, "learning_rate": 9.491361334524875e-06, "loss": 0.5939, "num_input_tokens_seen": 1481696, "step": 2550 }, { "epoch": 0.38054810843014597, "grad_norm": 1.3439558744430542, "learning_rate": 9.509979148048853e-06, "loss": 0.8597, "num_input_tokens_seen": 1484640, "step": 2555 }, { "epoch": 0.3812928209711052, "grad_norm": 1.7428786754608154, "learning_rate": 9.528596961572833e-06, "loss": 0.858, "num_input_tokens_seen": 1487296, "step": 2560 }, { "epoch": 0.3820375335120643, "grad_norm": 1.1453701257705688, "learning_rate": 9.547214775096813e-06, "loss": 0.8549, "num_input_tokens_seen": 1490208, "step": 2565 }, { "epoch": 0.3827822460530235, "grad_norm": 2.211277961730957, "learning_rate": 9.565832588620793e-06, "loss": 0.8532, "num_input_tokens_seen": 1493152, "step": 2570 }, { "epoch": 0.3835269585939827, "grad_norm": 1.0701372623443604, "learning_rate": 9.584450402144771e-06, "loss": 0.8734, "num_input_tokens_seen": 1496032, "step": 2575 }, { "epoch": 0.38427167113494193, "grad_norm": 2.3874590396881104, "learning_rate": 9.603068215668753e-06, "loss": 0.9013, "num_input_tokens_seen": 1498880, "step": 2580 }, { "epoch": 0.3850163836759011, "grad_norm": 1.4711110591888428, "learning_rate": 9.621686029192733e-06, "loss": 0.6888, "num_input_tokens_seen": 1501696, "step": 2585 }, { "epoch": 0.3857610962168603, "grad_norm": 1.145965814590454, "learning_rate": 9.640303842716713e-06, "loss": 0.7324, "num_input_tokens_seen": 1504448, "step": 2590 }, { "epoch": 0.3865058087578195, "grad_norm": 1.3572437763214111, "learning_rate": 9.658921656240691e-06, "loss": 0.6797, "num_input_tokens_seen": 1507520, "step": 2595 }, { "epoch": 0.3872505212987787, "grad_norm": 2.6447596549987793, "learning_rate": 9.677539469764671e-06, "loss": 0.8824, "num_input_tokens_seen": 1510176, "step": 2600 }, { "epoch": 0.3879952338397379, "grad_norm": 0.7594215869903564, "learning_rate": 9.69615728328865e-06, "loss": 0.9133, "num_input_tokens_seen": 1512992, "step": 2605 }, { "epoch": 0.38873994638069703, "grad_norm": 1.3856624364852905, "learning_rate": 9.71477509681263e-06, "loss": 0.8865, "num_input_tokens_seen": 1515680, "step": 2610 }, { "epoch": 0.38948465892165623, "grad_norm": 2.141563892364502, "learning_rate": 9.73339291033661e-06, "loss": 0.8631, "num_input_tokens_seen": 1518400, "step": 2615 }, { "epoch": 0.39022937146261544, "grad_norm": 1.2357163429260254, "learning_rate": 9.75201072386059e-06, "loss": 0.6779, "num_input_tokens_seen": 1522560, "step": 2620 }, { "epoch": 0.39097408400357464, "grad_norm": 1.6539034843444824, "learning_rate": 9.77062853738457e-06, "loss": 0.8409, "num_input_tokens_seen": 1525408, "step": 2625 }, { "epoch": 0.3917187965445338, "grad_norm": 1.560851812362671, "learning_rate": 9.78924635090855e-06, "loss": 0.6801, "num_input_tokens_seen": 1528256, "step": 2630 }, { "epoch": 0.392463509085493, "grad_norm": 2.1804163455963135, "learning_rate": 9.80786416443253e-06, "loss": 0.7862, "num_input_tokens_seen": 1531264, "step": 2635 }, { "epoch": 0.3932082216264522, "grad_norm": 1.071980595588684, "learning_rate": 9.826481977956509e-06, "loss": 0.8304, "num_input_tokens_seen": 1534272, "step": 2640 }, { "epoch": 0.3939529341674114, "grad_norm": 2.1326022148132324, "learning_rate": 9.845099791480489e-06, "loss": 0.8093, "num_input_tokens_seen": 1537472, "step": 2645 }, { "epoch": 0.39469764670837054, "grad_norm": 1.6852003335952759, "learning_rate": 9.863717605004468e-06, "loss": 0.905, "num_input_tokens_seen": 1540352, "step": 2650 }, { "epoch": 0.39544235924932974, "grad_norm": 1.1310460567474365, "learning_rate": 9.882335418528448e-06, "loss": 0.8718, "num_input_tokens_seen": 1543328, "step": 2655 }, { "epoch": 0.39618707179028895, "grad_norm": 1.1712886095046997, "learning_rate": 9.900953232052428e-06, "loss": 0.7302, "num_input_tokens_seen": 1546272, "step": 2660 }, { "epoch": 0.39693178433124815, "grad_norm": 1.6858136653900146, "learning_rate": 9.919571045576408e-06, "loss": 0.9437, "num_input_tokens_seen": 1549280, "step": 2665 }, { "epoch": 0.39767649687220735, "grad_norm": 1.6186420917510986, "learning_rate": 9.938188859100388e-06, "loss": 0.7402, "num_input_tokens_seen": 1552256, "step": 2670 }, { "epoch": 0.3984212094131665, "grad_norm": 1.4986881017684937, "learning_rate": 9.956806672624368e-06, "loss": 0.7117, "num_input_tokens_seen": 1555168, "step": 2675 }, { "epoch": 0.3991659219541257, "grad_norm": 3.199300527572632, "learning_rate": 9.975424486148348e-06, "loss": 0.8002, "num_input_tokens_seen": 1558112, "step": 2680 }, { "epoch": 0.3999106344950849, "grad_norm": 3.0649614334106445, "learning_rate": 9.994042299672326e-06, "loss": 1.035, "num_input_tokens_seen": 1561184, "step": 2685 }, { "epoch": 0.4006553470360441, "grad_norm": 1.5240471363067627, "learning_rate": 1.0012660113196306e-05, "loss": 0.7171, "num_input_tokens_seen": 1563936, "step": 2690 }, { "epoch": 0.40140005957700325, "grad_norm": 1.3670531511306763, "learning_rate": 1.0031277926720286e-05, "loss": 0.6976, "num_input_tokens_seen": 1566592, "step": 2695 }, { "epoch": 0.40214477211796246, "grad_norm": 1.6254600286483765, "learning_rate": 1.0049895740244266e-05, "loss": 0.8996, "num_input_tokens_seen": 1569568, "step": 2700 }, { "epoch": 0.40288948465892166, "grad_norm": 1.2221065759658813, "learning_rate": 1.0068513553768246e-05, "loss": 0.7673, "num_input_tokens_seen": 1572384, "step": 2705 }, { "epoch": 0.40363419719988086, "grad_norm": 1.0827887058258057, "learning_rate": 1.0087131367292226e-05, "loss": 0.6352, "num_input_tokens_seen": 1575296, "step": 2710 }, { "epoch": 0.40437890974084, "grad_norm": 1.1053358316421509, "learning_rate": 1.0105749180816206e-05, "loss": 0.8164, "num_input_tokens_seen": 1578208, "step": 2715 }, { "epoch": 0.4051236222817992, "grad_norm": 1.45697820186615, "learning_rate": 1.0124366994340186e-05, "loss": 0.6403, "num_input_tokens_seen": 1581408, "step": 2720 }, { "epoch": 0.4058683348227584, "grad_norm": 0.9232391119003296, "learning_rate": 1.0142984807864164e-05, "loss": 0.7735, "num_input_tokens_seen": 1584096, "step": 2725 }, { "epoch": 0.4066130473637176, "grad_norm": 1.9282737970352173, "learning_rate": 1.0161602621388144e-05, "loss": 0.7802, "num_input_tokens_seen": 1587008, "step": 2730 }, { "epoch": 0.4073577599046768, "grad_norm": 2.141083002090454, "learning_rate": 1.0180220434912124e-05, "loss": 0.7855, "num_input_tokens_seen": 1589824, "step": 2735 }, { "epoch": 0.40810247244563597, "grad_norm": 1.458178162574768, "learning_rate": 1.0198838248436104e-05, "loss": 0.7616, "num_input_tokens_seen": 1592800, "step": 2740 }, { "epoch": 0.40884718498659517, "grad_norm": 1.3060270547866821, "learning_rate": 1.0217456061960085e-05, "loss": 0.8057, "num_input_tokens_seen": 1595808, "step": 2745 }, { "epoch": 0.40959189752755437, "grad_norm": 1.2648887634277344, "learning_rate": 1.0236073875484064e-05, "loss": 0.8524, "num_input_tokens_seen": 1598592, "step": 2750 }, { "epoch": 0.4103366100685136, "grad_norm": 2.0453872680664062, "learning_rate": 1.0254691689008044e-05, "loss": 0.7422, "num_input_tokens_seen": 1601824, "step": 2755 }, { "epoch": 0.4110813226094727, "grad_norm": 1.6149463653564453, "learning_rate": 1.0273309502532023e-05, "loss": 0.7537, "num_input_tokens_seen": 1604672, "step": 2760 }, { "epoch": 0.4118260351504319, "grad_norm": 2.095818281173706, "learning_rate": 1.0291927316056003e-05, "loss": 0.7499, "num_input_tokens_seen": 1607584, "step": 2765 }, { "epoch": 0.4125707476913911, "grad_norm": 1.819615364074707, "learning_rate": 1.0310545129579982e-05, "loss": 0.6764, "num_input_tokens_seen": 1610496, "step": 2770 }, { "epoch": 0.41331546023235033, "grad_norm": 0.816627025604248, "learning_rate": 1.0329162943103962e-05, "loss": 0.7523, "num_input_tokens_seen": 1613472, "step": 2775 }, { "epoch": 0.4140601727733095, "grad_norm": 0.999885082244873, "learning_rate": 1.0347780756627941e-05, "loss": 0.7885, "num_input_tokens_seen": 1616416, "step": 2780 }, { "epoch": 0.4148048853142687, "grad_norm": 1.4912328720092773, "learning_rate": 1.0366398570151923e-05, "loss": 0.7308, "num_input_tokens_seen": 1619456, "step": 2785 }, { "epoch": 0.4155495978552279, "grad_norm": 1.125145435333252, "learning_rate": 1.0385016383675901e-05, "loss": 0.7621, "num_input_tokens_seen": 1622528, "step": 2790 }, { "epoch": 0.4162943103961871, "grad_norm": 1.6052634716033936, "learning_rate": 1.0403634197199881e-05, "loss": 0.9128, "num_input_tokens_seen": 1625280, "step": 2795 }, { "epoch": 0.4170390229371463, "grad_norm": 1.2597339153289795, "learning_rate": 1.0422252010723861e-05, "loss": 0.6469, "num_input_tokens_seen": 1628192, "step": 2800 }, { "epoch": 0.41778373547810543, "grad_norm": 2.1609318256378174, "learning_rate": 1.0440869824247841e-05, "loss": 0.723, "num_input_tokens_seen": 1631264, "step": 2805 }, { "epoch": 0.41852844801906464, "grad_norm": 1.931019902229309, "learning_rate": 1.0459487637771821e-05, "loss": 0.9086, "num_input_tokens_seen": 1634080, "step": 2810 }, { "epoch": 0.41927316056002384, "grad_norm": 2.088672161102295, "learning_rate": 1.04781054512958e-05, "loss": 0.6726, "num_input_tokens_seen": 1637056, "step": 2815 }, { "epoch": 0.42001787310098304, "grad_norm": 1.9208868741989136, "learning_rate": 1.049672326481978e-05, "loss": 0.8, "num_input_tokens_seen": 1640064, "step": 2820 }, { "epoch": 0.4207625856419422, "grad_norm": 1.1489875316619873, "learning_rate": 1.051534107834376e-05, "loss": 0.7904, "num_input_tokens_seen": 1642784, "step": 2825 }, { "epoch": 0.4215072981829014, "grad_norm": 2.0614283084869385, "learning_rate": 1.053395889186774e-05, "loss": 0.6555, "num_input_tokens_seen": 1645536, "step": 2830 }, { "epoch": 0.4222520107238606, "grad_norm": 1.4690543413162231, "learning_rate": 1.0552576705391719e-05, "loss": 0.8072, "num_input_tokens_seen": 1648128, "step": 2835 }, { "epoch": 0.4229967232648198, "grad_norm": 1.1709951162338257, "learning_rate": 1.0571194518915699e-05, "loss": 0.7844, "num_input_tokens_seen": 1650880, "step": 2840 }, { "epoch": 0.42374143580577894, "grad_norm": 1.3264358043670654, "learning_rate": 1.0589812332439679e-05, "loss": 0.6795, "num_input_tokens_seen": 1653664, "step": 2845 }, { "epoch": 0.42448614834673815, "grad_norm": 1.38809072971344, "learning_rate": 1.0608430145963659e-05, "loss": 0.6632, "num_input_tokens_seen": 1656640, "step": 2850 }, { "epoch": 0.42523086088769735, "grad_norm": 1.5717179775238037, "learning_rate": 1.0627047959487637e-05, "loss": 0.6427, "num_input_tokens_seen": 1659296, "step": 2855 }, { "epoch": 0.42597557342865655, "grad_norm": 0.7367070317268372, "learning_rate": 1.0645665773011617e-05, "loss": 0.694, "num_input_tokens_seen": 1662112, "step": 2860 }, { "epoch": 0.42672028596961575, "grad_norm": 1.6192220449447632, "learning_rate": 1.0664283586535598e-05, "loss": 0.8204, "num_input_tokens_seen": 1664832, "step": 2865 }, { "epoch": 0.4274649985105749, "grad_norm": 1.2613048553466797, "learning_rate": 1.0682901400059578e-05, "loss": 0.684, "num_input_tokens_seen": 1667872, "step": 2870 }, { "epoch": 0.4282097110515341, "grad_norm": 1.1549440622329712, "learning_rate": 1.0701519213583558e-05, "loss": 0.7016, "num_input_tokens_seen": 1670592, "step": 2875 }, { "epoch": 0.4289544235924933, "grad_norm": 1.282227873802185, "learning_rate": 1.0720137027107537e-05, "loss": 0.8609, "num_input_tokens_seen": 1673472, "step": 2880 }, { "epoch": 0.4296991361334525, "grad_norm": 1.0188043117523193, "learning_rate": 1.0738754840631517e-05, "loss": 0.7562, "num_input_tokens_seen": 1676352, "step": 2885 }, { "epoch": 0.43044384867441166, "grad_norm": 1.1221327781677246, "learning_rate": 1.0757372654155496e-05, "loss": 0.8243, "num_input_tokens_seen": 1679328, "step": 2890 }, { "epoch": 0.43118856121537086, "grad_norm": 1.0307424068450928, "learning_rate": 1.0775990467679476e-05, "loss": 0.7982, "num_input_tokens_seen": 1682240, "step": 2895 }, { "epoch": 0.43193327375633006, "grad_norm": 1.6428232192993164, "learning_rate": 1.0794608281203456e-05, "loss": 0.9052, "num_input_tokens_seen": 1685120, "step": 2900 }, { "epoch": 0.43267798629728926, "grad_norm": 1.240657091140747, "learning_rate": 1.0813226094727436e-05, "loss": 0.8071, "num_input_tokens_seen": 1688000, "step": 2905 }, { "epoch": 0.4334226988382484, "grad_norm": 2.17965030670166, "learning_rate": 1.0831843908251416e-05, "loss": 0.8718, "num_input_tokens_seen": 1690816, "step": 2910 }, { "epoch": 0.4341674113792076, "grad_norm": 1.3338419198989868, "learning_rate": 1.0850461721775396e-05, "loss": 0.7606, "num_input_tokens_seen": 1693600, "step": 2915 }, { "epoch": 0.4349121239201668, "grad_norm": 0.868949830532074, "learning_rate": 1.0869079535299374e-05, "loss": 0.8301, "num_input_tokens_seen": 1696544, "step": 2920 }, { "epoch": 0.435656836461126, "grad_norm": 1.2641701698303223, "learning_rate": 1.0887697348823354e-05, "loss": 0.7613, "num_input_tokens_seen": 1699200, "step": 2925 }, { "epoch": 0.4364015490020852, "grad_norm": 1.663167119026184, "learning_rate": 1.0906315162347334e-05, "loss": 0.7686, "num_input_tokens_seen": 1702080, "step": 2930 }, { "epoch": 0.43714626154304437, "grad_norm": 0.895108163356781, "learning_rate": 1.0924932975871314e-05, "loss": 0.7975, "num_input_tokens_seen": 1705088, "step": 2935 }, { "epoch": 0.43789097408400357, "grad_norm": 1.1223098039627075, "learning_rate": 1.0943550789395294e-05, "loss": 0.7176, "num_input_tokens_seen": 1707808, "step": 2940 }, { "epoch": 0.4386356866249628, "grad_norm": 0.758297324180603, "learning_rate": 1.0962168602919274e-05, "loss": 0.7614, "num_input_tokens_seen": 1710592, "step": 2945 }, { "epoch": 0.439380399165922, "grad_norm": 1.454939365386963, "learning_rate": 1.0980786416443254e-05, "loss": 0.8069, "num_input_tokens_seen": 1713664, "step": 2950 }, { "epoch": 0.4401251117068811, "grad_norm": 1.640636920928955, "learning_rate": 1.0999404229967234e-05, "loss": 0.7666, "num_input_tokens_seen": 1716512, "step": 2955 }, { "epoch": 0.4408698242478403, "grad_norm": 0.9850732684135437, "learning_rate": 1.1018022043491214e-05, "loss": 0.7509, "num_input_tokens_seen": 1719424, "step": 2960 }, { "epoch": 0.4416145367887995, "grad_norm": 1.8887392282485962, "learning_rate": 1.1036639857015192e-05, "loss": 0.7643, "num_input_tokens_seen": 1722208, "step": 2965 }, { "epoch": 0.44235924932975873, "grad_norm": 1.091422200202942, "learning_rate": 1.1055257670539172e-05, "loss": 0.7806, "num_input_tokens_seen": 1725248, "step": 2970 }, { "epoch": 0.4431039618707179, "grad_norm": 1.5713151693344116, "learning_rate": 1.1073875484063152e-05, "loss": 0.7406, "num_input_tokens_seen": 1728192, "step": 2975 }, { "epoch": 0.4438486744116771, "grad_norm": 1.3770461082458496, "learning_rate": 1.1092493297587132e-05, "loss": 0.9073, "num_input_tokens_seen": 1731328, "step": 2980 }, { "epoch": 0.4445933869526363, "grad_norm": 1.6289974451065063, "learning_rate": 1.1111111111111112e-05, "loss": 0.7057, "num_input_tokens_seen": 1734304, "step": 2985 }, { "epoch": 0.4453380994935955, "grad_norm": 1.2865722179412842, "learning_rate": 1.1129728924635092e-05, "loss": 0.7035, "num_input_tokens_seen": 1736896, "step": 2990 }, { "epoch": 0.4460828120345547, "grad_norm": 1.1763246059417725, "learning_rate": 1.1148346738159071e-05, "loss": 0.793, "num_input_tokens_seen": 1739712, "step": 2995 }, { "epoch": 0.44682752457551383, "grad_norm": 1.1684918403625488, "learning_rate": 1.1166964551683051e-05, "loss": 0.7956, "num_input_tokens_seen": 1742528, "step": 3000 }, { "epoch": 0.44757223711647304, "grad_norm": 0.9426589012145996, "learning_rate": 1.1185582365207031e-05, "loss": 0.783, "num_input_tokens_seen": 1745536, "step": 3005 }, { "epoch": 0.44831694965743224, "grad_norm": 1.4202923774719238, "learning_rate": 1.120420017873101e-05, "loss": 0.7605, "num_input_tokens_seen": 1748448, "step": 3010 }, { "epoch": 0.44906166219839144, "grad_norm": 1.5283161401748657, "learning_rate": 1.122281799225499e-05, "loss": 0.667, "num_input_tokens_seen": 1751168, "step": 3015 }, { "epoch": 0.4498063747393506, "grad_norm": 1.144529104232788, "learning_rate": 1.124143580577897e-05, "loss": 0.7055, "num_input_tokens_seen": 1753760, "step": 3020 }, { "epoch": 0.4505510872803098, "grad_norm": 1.4749557971954346, "learning_rate": 1.126005361930295e-05, "loss": 0.6521, "num_input_tokens_seen": 1756832, "step": 3025 }, { "epoch": 0.451295799821269, "grad_norm": 2.1611602306365967, "learning_rate": 1.127867143282693e-05, "loss": 0.7835, "num_input_tokens_seen": 1759840, "step": 3030 }, { "epoch": 0.4520405123622282, "grad_norm": 1.2385696172714233, "learning_rate": 1.129728924635091e-05, "loss": 0.8562, "num_input_tokens_seen": 1762752, "step": 3035 }, { "epoch": 0.45278522490318734, "grad_norm": 1.6031945943832397, "learning_rate": 1.1315907059874889e-05, "loss": 0.8502, "num_input_tokens_seen": 1765568, "step": 3040 }, { "epoch": 0.45352993744414655, "grad_norm": 1.4309428930282593, "learning_rate": 1.1334524873398869e-05, "loss": 0.8506, "num_input_tokens_seen": 1768160, "step": 3045 }, { "epoch": 0.45427464998510575, "grad_norm": 1.1169639825820923, "learning_rate": 1.1353142686922847e-05, "loss": 0.6584, "num_input_tokens_seen": 1771072, "step": 3050 }, { "epoch": 0.45501936252606495, "grad_norm": 1.064744472503662, "learning_rate": 1.1371760500446827e-05, "loss": 0.7654, "num_input_tokens_seen": 1774016, "step": 3055 }, { "epoch": 0.45576407506702415, "grad_norm": 1.1523264646530151, "learning_rate": 1.1390378313970807e-05, "loss": 0.5442, "num_input_tokens_seen": 1777088, "step": 3060 }, { "epoch": 0.4565087876079833, "grad_norm": 1.0006742477416992, "learning_rate": 1.1408996127494787e-05, "loss": 0.6066, "num_input_tokens_seen": 1779840, "step": 3065 }, { "epoch": 0.4572535001489425, "grad_norm": 1.6042157411575317, "learning_rate": 1.1427613941018769e-05, "loss": 0.8223, "num_input_tokens_seen": 1782656, "step": 3070 }, { "epoch": 0.4579982126899017, "grad_norm": 0.8223024010658264, "learning_rate": 1.1446231754542747e-05, "loss": 0.8274, "num_input_tokens_seen": 1785792, "step": 3075 }, { "epoch": 0.4587429252308609, "grad_norm": 1.3406845331192017, "learning_rate": 1.1464849568066727e-05, "loss": 0.7884, "num_input_tokens_seen": 1788896, "step": 3080 }, { "epoch": 0.45948763777182006, "grad_norm": 1.0654945373535156, "learning_rate": 1.1483467381590707e-05, "loss": 0.7398, "num_input_tokens_seen": 1791904, "step": 3085 }, { "epoch": 0.46023235031277926, "grad_norm": 1.0460854768753052, "learning_rate": 1.1502085195114687e-05, "loss": 0.6693, "num_input_tokens_seen": 1794816, "step": 3090 }, { "epoch": 0.46097706285373846, "grad_norm": 0.7936553955078125, "learning_rate": 1.1520703008638665e-05, "loss": 0.9058, "num_input_tokens_seen": 1797536, "step": 3095 }, { "epoch": 0.46172177539469766, "grad_norm": 1.792656421661377, "learning_rate": 1.1539320822162645e-05, "loss": 0.7911, "num_input_tokens_seen": 1800512, "step": 3100 }, { "epoch": 0.4624664879356568, "grad_norm": 1.0698331594467163, "learning_rate": 1.1557938635686626e-05, "loss": 0.7465, "num_input_tokens_seen": 1803328, "step": 3105 }, { "epoch": 0.463211200476616, "grad_norm": 1.3891230821609497, "learning_rate": 1.1576556449210606e-05, "loss": 0.8827, "num_input_tokens_seen": 1806176, "step": 3110 }, { "epoch": 0.4639559130175752, "grad_norm": 1.018436074256897, "learning_rate": 1.1595174262734585e-05, "loss": 0.8189, "num_input_tokens_seen": 1809056, "step": 3115 }, { "epoch": 0.4647006255585344, "grad_norm": 1.2378852367401123, "learning_rate": 1.1613792076258565e-05, "loss": 0.7713, "num_input_tokens_seen": 1812032, "step": 3120 }, { "epoch": 0.4654453380994936, "grad_norm": 1.1060878038406372, "learning_rate": 1.1632409889782545e-05, "loss": 0.7284, "num_input_tokens_seen": 1814976, "step": 3125 }, { "epoch": 0.46619005064045277, "grad_norm": 0.9766548871994019, "learning_rate": 1.1651027703306524e-05, "loss": 0.5815, "num_input_tokens_seen": 1818080, "step": 3130 }, { "epoch": 0.46693476318141197, "grad_norm": 1.2804460525512695, "learning_rate": 1.1669645516830504e-05, "loss": 0.8578, "num_input_tokens_seen": 1821216, "step": 3135 }, { "epoch": 0.4676794757223712, "grad_norm": 0.693361222743988, "learning_rate": 1.1688263330354483e-05, "loss": 0.5885, "num_input_tokens_seen": 1824000, "step": 3140 }, { "epoch": 0.4684241882633304, "grad_norm": 1.8434619903564453, "learning_rate": 1.1706881143878464e-05, "loss": 0.8767, "num_input_tokens_seen": 1826560, "step": 3145 }, { "epoch": 0.4691689008042895, "grad_norm": 1.4270782470703125, "learning_rate": 1.1725498957402444e-05, "loss": 0.8363, "num_input_tokens_seen": 1829248, "step": 3150 }, { "epoch": 0.4699136133452487, "grad_norm": 0.9424343705177307, "learning_rate": 1.1744116770926424e-05, "loss": 0.7756, "num_input_tokens_seen": 1832032, "step": 3155 }, { "epoch": 0.47065832588620793, "grad_norm": 2.059609889984131, "learning_rate": 1.1762734584450402e-05, "loss": 0.5236, "num_input_tokens_seen": 1835328, "step": 3160 }, { "epoch": 0.47140303842716713, "grad_norm": 1.291010856628418, "learning_rate": 1.1781352397974382e-05, "loss": 0.6859, "num_input_tokens_seen": 1838176, "step": 3165 }, { "epoch": 0.4721477509681263, "grad_norm": 1.1951079368591309, "learning_rate": 1.1799970211498362e-05, "loss": 0.8731, "num_input_tokens_seen": 1841824, "step": 3170 }, { "epoch": 0.4728924635090855, "grad_norm": 1.062677025794983, "learning_rate": 1.1818588025022342e-05, "loss": 0.7848, "num_input_tokens_seen": 1845024, "step": 3175 }, { "epoch": 0.4736371760500447, "grad_norm": 1.0079408884048462, "learning_rate": 1.183720583854632e-05, "loss": 0.7032, "num_input_tokens_seen": 1847744, "step": 3180 }, { "epoch": 0.4743818885910039, "grad_norm": 1.5238429307937622, "learning_rate": 1.1855823652070302e-05, "loss": 0.7668, "num_input_tokens_seen": 1850560, "step": 3185 }, { "epoch": 0.4751266011319631, "grad_norm": 0.9092770218849182, "learning_rate": 1.1874441465594282e-05, "loss": 0.71, "num_input_tokens_seen": 1853504, "step": 3190 }, { "epoch": 0.47587131367292224, "grad_norm": 0.9641591310501099, "learning_rate": 1.1893059279118262e-05, "loss": 0.8283, "num_input_tokens_seen": 1856480, "step": 3195 }, { "epoch": 0.47661602621388144, "grad_norm": 0.9495593309402466, "learning_rate": 1.1911677092642242e-05, "loss": 0.7018, "num_input_tokens_seen": 1859392, "step": 3200 }, { "epoch": 0.47736073875484064, "grad_norm": 0.9788819551467896, "learning_rate": 1.193029490616622e-05, "loss": 0.6277, "num_input_tokens_seen": 1862240, "step": 3205 }, { "epoch": 0.47810545129579984, "grad_norm": 1.5313584804534912, "learning_rate": 1.19489127196902e-05, "loss": 0.6573, "num_input_tokens_seen": 1865024, "step": 3210 }, { "epoch": 0.478850163836759, "grad_norm": 1.6025892496109009, "learning_rate": 1.196753053321418e-05, "loss": 0.7153, "num_input_tokens_seen": 1867776, "step": 3215 }, { "epoch": 0.4795948763777182, "grad_norm": 1.0651599168777466, "learning_rate": 1.198614834673816e-05, "loss": 0.7341, "num_input_tokens_seen": 1870752, "step": 3220 }, { "epoch": 0.4803395889186774, "grad_norm": 1.4550551176071167, "learning_rate": 1.200476616026214e-05, "loss": 0.6831, "num_input_tokens_seen": 1873888, "step": 3225 }, { "epoch": 0.4810843014596366, "grad_norm": 1.2376583814620972, "learning_rate": 1.202338397378612e-05, "loss": 0.8625, "num_input_tokens_seen": 1876896, "step": 3230 }, { "epoch": 0.48182901400059575, "grad_norm": 2.4759628772735596, "learning_rate": 1.20420017873101e-05, "loss": 0.8792, "num_input_tokens_seen": 1879360, "step": 3235 }, { "epoch": 0.48257372654155495, "grad_norm": 1.0125057697296143, "learning_rate": 1.206061960083408e-05, "loss": 0.6712, "num_input_tokens_seen": 1882592, "step": 3240 }, { "epoch": 0.48331843908251415, "grad_norm": 1.5972892045974731, "learning_rate": 1.2079237414358058e-05, "loss": 0.793, "num_input_tokens_seen": 1885728, "step": 3245 }, { "epoch": 0.48406315162347335, "grad_norm": 1.2125108242034912, "learning_rate": 1.2097855227882038e-05, "loss": 0.7385, "num_input_tokens_seen": 1888896, "step": 3250 }, { "epoch": 0.48480786416443256, "grad_norm": 1.0415892601013184, "learning_rate": 1.2116473041406018e-05, "loss": 0.8007, "num_input_tokens_seen": 1892000, "step": 3255 }, { "epoch": 0.4855525767053917, "grad_norm": 0.869255781173706, "learning_rate": 1.2135090854929997e-05, "loss": 0.8942, "num_input_tokens_seen": 1894784, "step": 3260 }, { "epoch": 0.4862972892463509, "grad_norm": 1.038616418838501, "learning_rate": 1.2153708668453977e-05, "loss": 0.6954, "num_input_tokens_seen": 1897440, "step": 3265 }, { "epoch": 0.4870420017873101, "grad_norm": 1.664421796798706, "learning_rate": 1.2172326481977957e-05, "loss": 0.7612, "num_input_tokens_seen": 1900352, "step": 3270 }, { "epoch": 0.4877867143282693, "grad_norm": 1.3541709184646606, "learning_rate": 1.2190944295501937e-05, "loss": 0.7813, "num_input_tokens_seen": 1903392, "step": 3275 }, { "epoch": 0.48853142686922846, "grad_norm": 1.2525010108947754, "learning_rate": 1.2209562109025917e-05, "loss": 0.7059, "num_input_tokens_seen": 1906176, "step": 3280 }, { "epoch": 0.48927613941018766, "grad_norm": 0.9057725667953491, "learning_rate": 1.2228179922549897e-05, "loss": 0.6632, "num_input_tokens_seen": 1909024, "step": 3285 }, { "epoch": 0.49002085195114686, "grad_norm": 1.0753122568130493, "learning_rate": 1.2246797736073875e-05, "loss": 0.6771, "num_input_tokens_seen": 1912160, "step": 3290 }, { "epoch": 0.49076556449210607, "grad_norm": 1.0218342542648315, "learning_rate": 1.2265415549597855e-05, "loss": 0.7667, "num_input_tokens_seen": 1915104, "step": 3295 }, { "epoch": 0.4915102770330652, "grad_norm": 1.7506765127182007, "learning_rate": 1.2284033363121835e-05, "loss": 0.6911, "num_input_tokens_seen": 1917984, "step": 3300 }, { "epoch": 0.4922549895740244, "grad_norm": 1.052355408668518, "learning_rate": 1.2302651176645815e-05, "loss": 0.7015, "num_input_tokens_seen": 1921152, "step": 3305 }, { "epoch": 0.4929997021149836, "grad_norm": 1.1135503053665161, "learning_rate": 1.2321268990169795e-05, "loss": 0.7365, "num_input_tokens_seen": 1923872, "step": 3310 }, { "epoch": 0.4937444146559428, "grad_norm": 1.6493642330169678, "learning_rate": 1.2339886803693775e-05, "loss": 0.8041, "num_input_tokens_seen": 1926848, "step": 3315 }, { "epoch": 0.494489127196902, "grad_norm": 0.9348089694976807, "learning_rate": 1.2358504617217755e-05, "loss": 0.7636, "num_input_tokens_seen": 1929920, "step": 3320 }, { "epoch": 0.49523383973786117, "grad_norm": 0.8348360061645508, "learning_rate": 1.2377122430741735e-05, "loss": 0.6911, "num_input_tokens_seen": 1932736, "step": 3325 }, { "epoch": 0.4959785522788204, "grad_norm": 1.1609764099121094, "learning_rate": 1.2395740244265713e-05, "loss": 0.92, "num_input_tokens_seen": 1935392, "step": 3330 }, { "epoch": 0.4967232648197796, "grad_norm": 0.879724383354187, "learning_rate": 1.2414358057789693e-05, "loss": 0.8038, "num_input_tokens_seen": 1938176, "step": 3335 }, { "epoch": 0.4974679773607388, "grad_norm": 1.3781707286834717, "learning_rate": 1.2432975871313673e-05, "loss": 0.552, "num_input_tokens_seen": 1941056, "step": 3340 }, { "epoch": 0.4982126899016979, "grad_norm": 2.0419628620147705, "learning_rate": 1.2451593684837653e-05, "loss": 0.8797, "num_input_tokens_seen": 1943936, "step": 3345 }, { "epoch": 0.4989574024426571, "grad_norm": 1.169779658317566, "learning_rate": 1.2470211498361634e-05, "loss": 0.6712, "num_input_tokens_seen": 1946784, "step": 3350 }, { "epoch": 0.49970211498361633, "grad_norm": 0.7410464286804199, "learning_rate": 1.2488829311885613e-05, "loss": 0.6242, "num_input_tokens_seen": 1949760, "step": 3355 }, { "epoch": 0.5004468275245755, "grad_norm": 1.4758570194244385, "learning_rate": 1.2507447125409594e-05, "loss": 0.8225, "num_input_tokens_seen": 1952352, "step": 3360 }, { "epoch": 0.5011915400655347, "grad_norm": 1.2581541538238525, "learning_rate": 1.252606493893357e-05, "loss": 0.8078, "num_input_tokens_seen": 1955072, "step": 3365 }, { "epoch": 0.5019362526064939, "grad_norm": 2.0262515544891357, "learning_rate": 1.254468275245755e-05, "loss": 0.7856, "num_input_tokens_seen": 1957696, "step": 3370 }, { "epoch": 0.5026809651474531, "grad_norm": 1.3284931182861328, "learning_rate": 1.256330056598153e-05, "loss": 0.8094, "num_input_tokens_seen": 1960736, "step": 3375 }, { "epoch": 0.5034256776884123, "grad_norm": 0.9557269215583801, "learning_rate": 1.258191837950551e-05, "loss": 0.8527, "num_input_tokens_seen": 1963616, "step": 3380 }, { "epoch": 0.5041703902293715, "grad_norm": 0.9725602269172668, "learning_rate": 1.260053619302949e-05, "loss": 0.7086, "num_input_tokens_seen": 1966528, "step": 3385 }, { "epoch": 0.5049151027703307, "grad_norm": 1.1756483316421509, "learning_rate": 1.2619154006553472e-05, "loss": 0.945, "num_input_tokens_seen": 1969312, "step": 3390 }, { "epoch": 0.5056598153112899, "grad_norm": 1.1876115798950195, "learning_rate": 1.2637771820077452e-05, "loss": 0.6903, "num_input_tokens_seen": 1972096, "step": 3395 }, { "epoch": 0.506404527852249, "grad_norm": 1.3952821493148804, "learning_rate": 1.2656389633601432e-05, "loss": 0.8353, "num_input_tokens_seen": 1975040, "step": 3400 }, { "epoch": 0.5071492403932082, "grad_norm": 1.098920226097107, "learning_rate": 1.2675007447125412e-05, "loss": 0.8468, "num_input_tokens_seen": 1977856, "step": 3405 }, { "epoch": 0.5078939529341674, "grad_norm": 1.2631040811538696, "learning_rate": 1.2693625260649388e-05, "loss": 0.5927, "num_input_tokens_seen": 1980608, "step": 3410 }, { "epoch": 0.5086386654751266, "grad_norm": 1.1452946662902832, "learning_rate": 1.2712243074173368e-05, "loss": 0.6808, "num_input_tokens_seen": 1983328, "step": 3415 }, { "epoch": 0.5093833780160858, "grad_norm": 1.6175811290740967, "learning_rate": 1.2730860887697348e-05, "loss": 0.742, "num_input_tokens_seen": 1986400, "step": 3420 }, { "epoch": 0.510128090557045, "grad_norm": 1.345116138458252, "learning_rate": 1.2749478701221328e-05, "loss": 0.7677, "num_input_tokens_seen": 1989248, "step": 3425 }, { "epoch": 0.5108728030980042, "grad_norm": 1.4960005283355713, "learning_rate": 1.276809651474531e-05, "loss": 0.765, "num_input_tokens_seen": 1992224, "step": 3430 }, { "epoch": 0.5116175156389634, "grad_norm": 1.3644003868103027, "learning_rate": 1.278671432826929e-05, "loss": 0.7359, "num_input_tokens_seen": 1995136, "step": 3435 }, { "epoch": 0.5123622281799225, "grad_norm": 1.3059488534927368, "learning_rate": 1.280533214179327e-05, "loss": 0.7687, "num_input_tokens_seen": 1997792, "step": 3440 }, { "epoch": 0.5131069407208817, "grad_norm": 1.202153205871582, "learning_rate": 1.282394995531725e-05, "loss": 0.7876, "num_input_tokens_seen": 2000896, "step": 3445 }, { "epoch": 0.5138516532618409, "grad_norm": 1.6494860649108887, "learning_rate": 1.2842567768841226e-05, "loss": 0.7255, "num_input_tokens_seen": 2003840, "step": 3450 }, { "epoch": 0.5145963658028001, "grad_norm": 0.9094935655593872, "learning_rate": 1.2861185582365206e-05, "loss": 0.8196, "num_input_tokens_seen": 2006496, "step": 3455 }, { "epoch": 0.5153410783437593, "grad_norm": 6.421698570251465, "learning_rate": 1.2879803395889186e-05, "loss": 0.7901, "num_input_tokens_seen": 2009568, "step": 3460 }, { "epoch": 0.5160857908847185, "grad_norm": 0.8670192360877991, "learning_rate": 1.2898421209413166e-05, "loss": 0.809, "num_input_tokens_seen": 2012384, "step": 3465 }, { "epoch": 0.5168305034256777, "grad_norm": 1.1699553728103638, "learning_rate": 1.2917039022937148e-05, "loss": 0.7761, "num_input_tokens_seen": 2015360, "step": 3470 }, { "epoch": 0.5175752159666369, "grad_norm": 0.9634988903999329, "learning_rate": 1.2935656836461127e-05, "loss": 0.8301, "num_input_tokens_seen": 2018048, "step": 3475 }, { "epoch": 0.5183199285075961, "grad_norm": 0.9925034046173096, "learning_rate": 1.2954274649985107e-05, "loss": 0.7747, "num_input_tokens_seen": 2020960, "step": 3480 }, { "epoch": 0.5190646410485552, "grad_norm": 0.6902193427085876, "learning_rate": 1.2972892463509087e-05, "loss": 0.6619, "num_input_tokens_seen": 2023744, "step": 3485 }, { "epoch": 0.5198093535895144, "grad_norm": 1.5540646314620972, "learning_rate": 1.2991510277033067e-05, "loss": 0.8003, "num_input_tokens_seen": 2026592, "step": 3490 }, { "epoch": 0.5205540661304736, "grad_norm": 2.4726786613464355, "learning_rate": 1.3010128090557044e-05, "loss": 0.7863, "num_input_tokens_seen": 2029728, "step": 3495 }, { "epoch": 0.5212987786714328, "grad_norm": 1.1547709703445435, "learning_rate": 1.3028745904081024e-05, "loss": 0.7386, "num_input_tokens_seen": 2032672, "step": 3500 }, { "epoch": 0.522043491212392, "grad_norm": 0.7381396293640137, "learning_rate": 1.3047363717605005e-05, "loss": 0.7768, "num_input_tokens_seen": 2035808, "step": 3505 }, { "epoch": 0.5227882037533512, "grad_norm": 1.0544589757919312, "learning_rate": 1.3065981531128985e-05, "loss": 0.7244, "num_input_tokens_seen": 2038720, "step": 3510 }, { "epoch": 0.5235329162943104, "grad_norm": 1.0934962034225464, "learning_rate": 1.3084599344652965e-05, "loss": 0.8433, "num_input_tokens_seen": 2041664, "step": 3515 }, { "epoch": 0.5242776288352696, "grad_norm": 1.2152750492095947, "learning_rate": 1.3103217158176945e-05, "loss": 0.6579, "num_input_tokens_seen": 2044800, "step": 3520 }, { "epoch": 0.5250223413762288, "grad_norm": 1.2058415412902832, "learning_rate": 1.3121834971700925e-05, "loss": 0.78, "num_input_tokens_seen": 2047392, "step": 3525 }, { "epoch": 0.5257670539171879, "grad_norm": 1.487465262413025, "learning_rate": 1.3140452785224905e-05, "loss": 0.8467, "num_input_tokens_seen": 2050496, "step": 3530 }, { "epoch": 0.5265117664581471, "grad_norm": 0.9980764389038086, "learning_rate": 1.3159070598748885e-05, "loss": 0.7478, "num_input_tokens_seen": 2053984, "step": 3535 }, { "epoch": 0.5272564789991063, "grad_norm": 1.0032377243041992, "learning_rate": 1.3177688412272861e-05, "loss": 0.7174, "num_input_tokens_seen": 2057120, "step": 3540 }, { "epoch": 0.5280011915400655, "grad_norm": 1.8452708721160889, "learning_rate": 1.3196306225796843e-05, "loss": 0.7286, "num_input_tokens_seen": 2059680, "step": 3545 }, { "epoch": 0.5287459040810247, "grad_norm": 1.592655062675476, "learning_rate": 1.3214924039320823e-05, "loss": 0.774, "num_input_tokens_seen": 2062560, "step": 3550 }, { "epoch": 0.5294906166219839, "grad_norm": 1.6403306722640991, "learning_rate": 1.3233541852844803e-05, "loss": 0.734, "num_input_tokens_seen": 2065536, "step": 3555 }, { "epoch": 0.5302353291629431, "grad_norm": 1.8997430801391602, "learning_rate": 1.3252159666368783e-05, "loss": 0.8359, "num_input_tokens_seen": 2068416, "step": 3560 }, { "epoch": 0.5309800417039023, "grad_norm": 1.025429368019104, "learning_rate": 1.3270777479892763e-05, "loss": 0.7472, "num_input_tokens_seen": 2071328, "step": 3565 }, { "epoch": 0.5317247542448614, "grad_norm": 1.1631743907928467, "learning_rate": 1.3289395293416743e-05, "loss": 0.7099, "num_input_tokens_seen": 2074016, "step": 3570 }, { "epoch": 0.5324694667858206, "grad_norm": 1.20582115650177, "learning_rate": 1.3308013106940723e-05, "loss": 0.7902, "num_input_tokens_seen": 2076832, "step": 3575 }, { "epoch": 0.5332141793267798, "grad_norm": 1.5409739017486572, "learning_rate": 1.33266309204647e-05, "loss": 0.6134, "num_input_tokens_seen": 2079648, "step": 3580 }, { "epoch": 0.533958891867739, "grad_norm": 1.055354118347168, "learning_rate": 1.334524873398868e-05, "loss": 0.7923, "num_input_tokens_seen": 2082656, "step": 3585 }, { "epoch": 0.5347036044086982, "grad_norm": 0.9160546064376831, "learning_rate": 1.336386654751266e-05, "loss": 0.7416, "num_input_tokens_seen": 2085664, "step": 3590 }, { "epoch": 0.5354483169496574, "grad_norm": 1.2756807804107666, "learning_rate": 1.338248436103664e-05, "loss": 0.7547, "num_input_tokens_seen": 2088768, "step": 3595 }, { "epoch": 0.5361930294906166, "grad_norm": 1.0747616291046143, "learning_rate": 1.340110217456062e-05, "loss": 0.8142, "num_input_tokens_seen": 2091648, "step": 3600 }, { "epoch": 0.5369377420315758, "grad_norm": 1.7907723188400269, "learning_rate": 1.34197199880846e-05, "loss": 0.6982, "num_input_tokens_seen": 2094816, "step": 3605 }, { "epoch": 0.537682454572535, "grad_norm": 0.7991272211074829, "learning_rate": 1.343833780160858e-05, "loss": 0.6602, "num_input_tokens_seen": 2097696, "step": 3610 }, { "epoch": 0.5384271671134941, "grad_norm": 0.9685553908348083, "learning_rate": 1.345695561513256e-05, "loss": 0.7346, "num_input_tokens_seen": 2100320, "step": 3615 }, { "epoch": 0.5391718796544533, "grad_norm": 1.0172539949417114, "learning_rate": 1.347557342865654e-05, "loss": 0.7594, "num_input_tokens_seen": 2103328, "step": 3620 }, { "epoch": 0.5399165921954125, "grad_norm": 1.106128454208374, "learning_rate": 1.3494191242180519e-05, "loss": 0.6492, "num_input_tokens_seen": 2106240, "step": 3625 }, { "epoch": 0.5406613047363718, "grad_norm": 1.5874128341674805, "learning_rate": 1.3512809055704498e-05, "loss": 0.73, "num_input_tokens_seen": 2109280, "step": 3630 }, { "epoch": 0.541406017277331, "grad_norm": 2.3920741081237793, "learning_rate": 1.3531426869228478e-05, "loss": 0.8228, "num_input_tokens_seen": 2111936, "step": 3635 }, { "epoch": 0.5421507298182902, "grad_norm": 1.7334389686584473, "learning_rate": 1.3550044682752458e-05, "loss": 0.8311, "num_input_tokens_seen": 2114624, "step": 3640 }, { "epoch": 0.5428954423592494, "grad_norm": 1.2710257768630981, "learning_rate": 1.3568662496276438e-05, "loss": 0.8109, "num_input_tokens_seen": 2117664, "step": 3645 }, { "epoch": 0.5436401549002086, "grad_norm": 0.979832112789154, "learning_rate": 1.3587280309800418e-05, "loss": 0.7724, "num_input_tokens_seen": 2120448, "step": 3650 }, { "epoch": 0.5443848674411678, "grad_norm": 1.3906877040863037, "learning_rate": 1.3605898123324398e-05, "loss": 0.7987, "num_input_tokens_seen": 2123552, "step": 3655 }, { "epoch": 0.5451295799821269, "grad_norm": 0.9600347280502319, "learning_rate": 1.3624515936848378e-05, "loss": 0.7004, "num_input_tokens_seen": 2126688, "step": 3660 }, { "epoch": 0.5458742925230861, "grad_norm": 0.9206571578979492, "learning_rate": 1.3643133750372358e-05, "loss": 0.6596, "num_input_tokens_seen": 2130432, "step": 3665 }, { "epoch": 0.5466190050640453, "grad_norm": 1.2896366119384766, "learning_rate": 1.3661751563896336e-05, "loss": 0.6768, "num_input_tokens_seen": 2133376, "step": 3670 }, { "epoch": 0.5473637176050045, "grad_norm": 1.3164297342300415, "learning_rate": 1.3680369377420316e-05, "loss": 0.7588, "num_input_tokens_seen": 2136576, "step": 3675 }, { "epoch": 0.5481084301459637, "grad_norm": 1.3423689603805542, "learning_rate": 1.3698987190944296e-05, "loss": 0.7904, "num_input_tokens_seen": 2139776, "step": 3680 }, { "epoch": 0.5488531426869229, "grad_norm": 2.319775342941284, "learning_rate": 1.3717605004468276e-05, "loss": 0.8317, "num_input_tokens_seen": 2142528, "step": 3685 }, { "epoch": 0.5495978552278821, "grad_norm": 1.332733154296875, "learning_rate": 1.3736222817992256e-05, "loss": 0.7878, "num_input_tokens_seen": 2145472, "step": 3690 }, { "epoch": 0.5503425677688413, "grad_norm": 0.8175024390220642, "learning_rate": 1.3754840631516236e-05, "loss": 0.6958, "num_input_tokens_seen": 2148384, "step": 3695 }, { "epoch": 0.5510872803098004, "grad_norm": 1.0583488941192627, "learning_rate": 1.3773458445040216e-05, "loss": 0.7532, "num_input_tokens_seen": 2151136, "step": 3700 }, { "epoch": 0.5518319928507596, "grad_norm": 1.601696491241455, "learning_rate": 1.3792076258564196e-05, "loss": 0.7174, "num_input_tokens_seen": 2153952, "step": 3705 }, { "epoch": 0.5525767053917188, "grad_norm": 1.711944341659546, "learning_rate": 1.3810694072088174e-05, "loss": 0.8482, "num_input_tokens_seen": 2156736, "step": 3710 }, { "epoch": 0.553321417932678, "grad_norm": 1.0099238157272339, "learning_rate": 1.3829311885612154e-05, "loss": 0.6712, "num_input_tokens_seen": 2159776, "step": 3715 }, { "epoch": 0.5540661304736372, "grad_norm": 1.168778657913208, "learning_rate": 1.3847929699136134e-05, "loss": 0.8116, "num_input_tokens_seen": 2162656, "step": 3720 }, { "epoch": 0.5548108430145964, "grad_norm": 1.7401351928710938, "learning_rate": 1.3866547512660114e-05, "loss": 0.7881, "num_input_tokens_seen": 2165792, "step": 3725 }, { "epoch": 0.5555555555555556, "grad_norm": 1.1455368995666504, "learning_rate": 1.3885165326184094e-05, "loss": 0.8106, "num_input_tokens_seen": 2168704, "step": 3730 }, { "epoch": 0.5563002680965148, "grad_norm": 1.755489706993103, "learning_rate": 1.3903783139708073e-05, "loss": 0.8625, "num_input_tokens_seen": 2171808, "step": 3735 }, { "epoch": 0.557044980637474, "grad_norm": 0.8314875960350037, "learning_rate": 1.3922400953232053e-05, "loss": 0.7731, "num_input_tokens_seen": 2174816, "step": 3740 }, { "epoch": 0.5577896931784331, "grad_norm": 1.0002845525741577, "learning_rate": 1.3941018766756033e-05, "loss": 0.573, "num_input_tokens_seen": 2177536, "step": 3745 }, { "epoch": 0.5585344057193923, "grad_norm": 0.965691864490509, "learning_rate": 1.3959636580280013e-05, "loss": 0.8544, "num_input_tokens_seen": 2180416, "step": 3750 }, { "epoch": 0.5592791182603515, "grad_norm": 0.8041268587112427, "learning_rate": 1.3978254393803992e-05, "loss": 0.7019, "num_input_tokens_seen": 2183520, "step": 3755 }, { "epoch": 0.5600238308013107, "grad_norm": 1.9724481105804443, "learning_rate": 1.3996872207327971e-05, "loss": 0.78, "num_input_tokens_seen": 2186016, "step": 3760 }, { "epoch": 0.5607685433422699, "grad_norm": 0.9301114082336426, "learning_rate": 1.4015490020851951e-05, "loss": 0.841, "num_input_tokens_seen": 2189376, "step": 3765 }, { "epoch": 0.5615132558832291, "grad_norm": 1.4554097652435303, "learning_rate": 1.4034107834375931e-05, "loss": 0.7288, "num_input_tokens_seen": 2192160, "step": 3770 }, { "epoch": 0.5622579684241883, "grad_norm": 1.346166968345642, "learning_rate": 1.4052725647899911e-05, "loss": 0.8084, "num_input_tokens_seen": 2195200, "step": 3775 }, { "epoch": 0.5630026809651475, "grad_norm": 1.432773232460022, "learning_rate": 1.4071343461423891e-05, "loss": 0.6643, "num_input_tokens_seen": 2198048, "step": 3780 }, { "epoch": 0.5637473935061067, "grad_norm": 1.0774775743484497, "learning_rate": 1.4089961274947871e-05, "loss": 0.6677, "num_input_tokens_seen": 2201024, "step": 3785 }, { "epoch": 0.5644921060470658, "grad_norm": 0.6880927681922913, "learning_rate": 1.4108579088471851e-05, "loss": 0.6267, "num_input_tokens_seen": 2203712, "step": 3790 }, { "epoch": 0.565236818588025, "grad_norm": 1.982969045639038, "learning_rate": 1.412719690199583e-05, "loss": 0.8342, "num_input_tokens_seen": 2206464, "step": 3795 }, { "epoch": 0.5659815311289842, "grad_norm": 1.0071433782577515, "learning_rate": 1.4145814715519809e-05, "loss": 0.7494, "num_input_tokens_seen": 2209504, "step": 3800 }, { "epoch": 0.5667262436699434, "grad_norm": 0.8813167214393616, "learning_rate": 1.4164432529043789e-05, "loss": 0.6921, "num_input_tokens_seen": 2213088, "step": 3805 }, { "epoch": 0.5674709562109026, "grad_norm": 1.0846173763275146, "learning_rate": 1.4183050342567769e-05, "loss": 0.7685, "num_input_tokens_seen": 2215904, "step": 3810 }, { "epoch": 0.5682156687518618, "grad_norm": 0.9031472206115723, "learning_rate": 1.4201668156091749e-05, "loss": 0.8592, "num_input_tokens_seen": 2218848, "step": 3815 }, { "epoch": 0.568960381292821, "grad_norm": 1.2691751718521118, "learning_rate": 1.4220285969615729e-05, "loss": 0.7273, "num_input_tokens_seen": 2221920, "step": 3820 }, { "epoch": 0.5697050938337802, "grad_norm": 0.8475288152694702, "learning_rate": 1.4238903783139709e-05, "loss": 0.64, "num_input_tokens_seen": 2224992, "step": 3825 }, { "epoch": 0.5704498063747393, "grad_norm": 0.8438128232955933, "learning_rate": 1.4257521596663689e-05, "loss": 0.7136, "num_input_tokens_seen": 2227872, "step": 3830 }, { "epoch": 0.5711945189156985, "grad_norm": 0.8010666966438293, "learning_rate": 1.4276139410187669e-05, "loss": 0.6634, "num_input_tokens_seen": 2230272, "step": 3835 }, { "epoch": 0.5719392314566577, "grad_norm": 0.9309220910072327, "learning_rate": 1.4294757223711647e-05, "loss": 0.7243, "num_input_tokens_seen": 2233216, "step": 3840 }, { "epoch": 0.5726839439976169, "grad_norm": 1.5643774271011353, "learning_rate": 1.4313375037235627e-05, "loss": 0.7872, "num_input_tokens_seen": 2236192, "step": 3845 }, { "epoch": 0.5734286565385761, "grad_norm": 1.7107042074203491, "learning_rate": 1.4331992850759607e-05, "loss": 0.6564, "num_input_tokens_seen": 2239296, "step": 3850 }, { "epoch": 0.5741733690795353, "grad_norm": 1.0569114685058594, "learning_rate": 1.4350610664283587e-05, "loss": 0.7338, "num_input_tokens_seen": 2242272, "step": 3855 }, { "epoch": 0.5749180816204945, "grad_norm": 1.1332069635391235, "learning_rate": 1.4369228477807567e-05, "loss": 0.769, "num_input_tokens_seen": 2245024, "step": 3860 }, { "epoch": 0.5756627941614537, "grad_norm": 1.150734782218933, "learning_rate": 1.4387846291331546e-05, "loss": 0.8196, "num_input_tokens_seen": 2247744, "step": 3865 }, { "epoch": 0.5764075067024129, "grad_norm": 1.0878506898880005, "learning_rate": 1.4406464104855526e-05, "loss": 0.6518, "num_input_tokens_seen": 2250624, "step": 3870 }, { "epoch": 0.577152219243372, "grad_norm": 0.9058593511581421, "learning_rate": 1.4425081918379506e-05, "loss": 0.7724, "num_input_tokens_seen": 2253440, "step": 3875 }, { "epoch": 0.5778969317843312, "grad_norm": 0.7786282896995544, "learning_rate": 1.4443699731903488e-05, "loss": 0.6543, "num_input_tokens_seen": 2256288, "step": 3880 }, { "epoch": 0.5786416443252904, "grad_norm": 0.8904737234115601, "learning_rate": 1.4462317545427465e-05, "loss": 0.6941, "num_input_tokens_seen": 2259232, "step": 3885 }, { "epoch": 0.5793863568662496, "grad_norm": 1.2440816164016724, "learning_rate": 1.4480935358951444e-05, "loss": 0.6612, "num_input_tokens_seen": 2262400, "step": 3890 }, { "epoch": 0.5801310694072088, "grad_norm": 1.205078125, "learning_rate": 1.4499553172475424e-05, "loss": 0.7638, "num_input_tokens_seen": 2265056, "step": 3895 }, { "epoch": 0.580875781948168, "grad_norm": 0.9477605223655701, "learning_rate": 1.4518170985999404e-05, "loss": 0.6453, "num_input_tokens_seen": 2268096, "step": 3900 }, { "epoch": 0.5816204944891272, "grad_norm": 0.8558230400085449, "learning_rate": 1.4536788799523384e-05, "loss": 0.6911, "num_input_tokens_seen": 2271136, "step": 3905 }, { "epoch": 0.5823652070300864, "grad_norm": 0.8946242928504944, "learning_rate": 1.4555406613047364e-05, "loss": 0.6575, "num_input_tokens_seen": 2274048, "step": 3910 }, { "epoch": 0.5831099195710456, "grad_norm": 0.9900463223457336, "learning_rate": 1.4574024426571346e-05, "loss": 0.8135, "num_input_tokens_seen": 2276736, "step": 3915 }, { "epoch": 0.5838546321120047, "grad_norm": 1.1009212732315063, "learning_rate": 1.4592642240095326e-05, "loss": 0.8137, "num_input_tokens_seen": 2279584, "step": 3920 }, { "epoch": 0.5845993446529639, "grad_norm": 0.8364351391792297, "learning_rate": 1.4611260053619302e-05, "loss": 0.682, "num_input_tokens_seen": 2282592, "step": 3925 }, { "epoch": 0.5853440571939231, "grad_norm": 0.8529859185218811, "learning_rate": 1.4629877867143282e-05, "loss": 0.6647, "num_input_tokens_seen": 2285824, "step": 3930 }, { "epoch": 0.5860887697348823, "grad_norm": 2.768551826477051, "learning_rate": 1.4648495680667262e-05, "loss": 0.9127, "num_input_tokens_seen": 2288672, "step": 3935 }, { "epoch": 0.5868334822758415, "grad_norm": 1.0167548656463623, "learning_rate": 1.4667113494191242e-05, "loss": 0.6596, "num_input_tokens_seen": 2291520, "step": 3940 }, { "epoch": 0.5875781948168007, "grad_norm": 1.0788307189941406, "learning_rate": 1.4685731307715222e-05, "loss": 0.7789, "num_input_tokens_seen": 2294048, "step": 3945 }, { "epoch": 0.5883229073577599, "grad_norm": 1.0277012586593628, "learning_rate": 1.4704349121239202e-05, "loss": 0.7894, "num_input_tokens_seen": 2296832, "step": 3950 }, { "epoch": 0.5890676198987191, "grad_norm": 0.9133859276771545, "learning_rate": 1.4722966934763183e-05, "loss": 0.7004, "num_input_tokens_seen": 2299808, "step": 3955 }, { "epoch": 0.5898123324396782, "grad_norm": 1.0593572854995728, "learning_rate": 1.4741584748287163e-05, "loss": 0.8015, "num_input_tokens_seen": 2302592, "step": 3960 }, { "epoch": 0.5905570449806374, "grad_norm": 1.0678741931915283, "learning_rate": 1.4760202561811143e-05, "loss": 0.7467, "num_input_tokens_seen": 2305344, "step": 3965 }, { "epoch": 0.5913017575215966, "grad_norm": 0.7942363023757935, "learning_rate": 1.477882037533512e-05, "loss": 0.7083, "num_input_tokens_seen": 2308096, "step": 3970 }, { "epoch": 0.5920464700625558, "grad_norm": 1.2089866399765015, "learning_rate": 1.47974381888591e-05, "loss": 0.8016, "num_input_tokens_seen": 2311104, "step": 3975 }, { "epoch": 0.592791182603515, "grad_norm": 1.8391236066818237, "learning_rate": 1.481605600238308e-05, "loss": 0.7715, "num_input_tokens_seen": 2314080, "step": 3980 }, { "epoch": 0.5935358951444742, "grad_norm": 0.9647204279899597, "learning_rate": 1.483467381590706e-05, "loss": 0.6486, "num_input_tokens_seen": 2317120, "step": 3985 }, { "epoch": 0.5942806076854334, "grad_norm": 0.986487865447998, "learning_rate": 1.485329162943104e-05, "loss": 0.6162, "num_input_tokens_seen": 2319968, "step": 3990 }, { "epoch": 0.5950253202263927, "grad_norm": 0.7825473546981812, "learning_rate": 1.4871909442955021e-05, "loss": 0.7954, "num_input_tokens_seen": 2322944, "step": 3995 }, { "epoch": 0.5957700327673519, "grad_norm": 0.8422633409500122, "learning_rate": 1.4890527256479001e-05, "loss": 0.8054, "num_input_tokens_seen": 2325920, "step": 4000 }, { "epoch": 0.596514745308311, "grad_norm": 0.782781720161438, "learning_rate": 1.4909145070002981e-05, "loss": 0.7201, "num_input_tokens_seen": 2328768, "step": 4005 }, { "epoch": 0.5972594578492701, "grad_norm": 1.921257734298706, "learning_rate": 1.4927762883526961e-05, "loss": 0.8335, "num_input_tokens_seen": 2331616, "step": 4010 }, { "epoch": 0.5980041703902294, "grad_norm": 0.9212729334831238, "learning_rate": 1.4946380697050938e-05, "loss": 0.7481, "num_input_tokens_seen": 2334592, "step": 4015 }, { "epoch": 0.5987488829311886, "grad_norm": 0.7512657046318054, "learning_rate": 1.4964998510574917e-05, "loss": 0.8371, "num_input_tokens_seen": 2337408, "step": 4020 }, { "epoch": 0.5994935954721478, "grad_norm": 1.0520752668380737, "learning_rate": 1.4983616324098897e-05, "loss": 0.6605, "num_input_tokens_seen": 2340320, "step": 4025 }, { "epoch": 0.600238308013107, "grad_norm": 1.1305584907531738, "learning_rate": 1.5002234137622877e-05, "loss": 0.7136, "num_input_tokens_seen": 2343136, "step": 4030 }, { "epoch": 0.6009830205540662, "grad_norm": 0.7461051940917969, "learning_rate": 1.5020851951146859e-05, "loss": 0.8271, "num_input_tokens_seen": 2346208, "step": 4035 }, { "epoch": 0.6017277330950254, "grad_norm": 0.9754770398139954, "learning_rate": 1.5039469764670839e-05, "loss": 0.7286, "num_input_tokens_seen": 2348864, "step": 4040 }, { "epoch": 0.6024724456359845, "grad_norm": 1.0906108617782593, "learning_rate": 1.5058087578194819e-05, "loss": 0.7383, "num_input_tokens_seen": 2351520, "step": 4045 }, { "epoch": 0.6032171581769437, "grad_norm": 1.21021568775177, "learning_rate": 1.5076705391718799e-05, "loss": 0.6347, "num_input_tokens_seen": 2354592, "step": 4050 }, { "epoch": 0.6039618707179029, "grad_norm": 1.2582364082336426, "learning_rate": 1.5095323205242775e-05, "loss": 0.6762, "num_input_tokens_seen": 2357376, "step": 4055 }, { "epoch": 0.6047065832588621, "grad_norm": 1.0637602806091309, "learning_rate": 1.5113941018766755e-05, "loss": 0.8749, "num_input_tokens_seen": 2360128, "step": 4060 }, { "epoch": 0.6054512957998213, "grad_norm": 0.9084541201591492, "learning_rate": 1.5132558832290735e-05, "loss": 0.6599, "num_input_tokens_seen": 2363136, "step": 4065 }, { "epoch": 0.6061960083407805, "grad_norm": 1.4667794704437256, "learning_rate": 1.5151176645814715e-05, "loss": 0.6892, "num_input_tokens_seen": 2365984, "step": 4070 }, { "epoch": 0.6069407208817397, "grad_norm": 1.2063342332839966, "learning_rate": 1.5169794459338697e-05, "loss": 0.7507, "num_input_tokens_seen": 2368640, "step": 4075 }, { "epoch": 0.6076854334226989, "grad_norm": 0.9629943370819092, "learning_rate": 1.5188412272862677e-05, "loss": 0.6375, "num_input_tokens_seen": 2371424, "step": 4080 }, { "epoch": 0.6084301459636581, "grad_norm": 0.8267983794212341, "learning_rate": 1.5207030086386656e-05, "loss": 0.7175, "num_input_tokens_seen": 2374240, "step": 4085 }, { "epoch": 0.6091748585046172, "grad_norm": 1.4784531593322754, "learning_rate": 1.5225647899910636e-05, "loss": 0.832, "num_input_tokens_seen": 2377376, "step": 4090 }, { "epoch": 0.6099195710455764, "grad_norm": 0.7543299794197083, "learning_rate": 1.5244265713434616e-05, "loss": 0.7129, "num_input_tokens_seen": 2380128, "step": 4095 }, { "epoch": 0.6106642835865356, "grad_norm": 1.4315153360366821, "learning_rate": 1.5262883526958593e-05, "loss": 0.741, "num_input_tokens_seen": 2382944, "step": 4100 }, { "epoch": 0.6114089961274948, "grad_norm": 1.0306097269058228, "learning_rate": 1.5281501340482574e-05, "loss": 0.7031, "num_input_tokens_seen": 2385824, "step": 4105 }, { "epoch": 0.612153708668454, "grad_norm": 1.306687355041504, "learning_rate": 1.5300119154006553e-05, "loss": 0.7656, "num_input_tokens_seen": 2388736, "step": 4110 }, { "epoch": 0.6128984212094132, "grad_norm": 0.9980324506759644, "learning_rate": 1.5318736967530534e-05, "loss": 0.7872, "num_input_tokens_seen": 2391584, "step": 4115 }, { "epoch": 0.6136431337503724, "grad_norm": 0.783931314945221, "learning_rate": 1.5337354781054513e-05, "loss": 0.7711, "num_input_tokens_seen": 2394368, "step": 4120 }, { "epoch": 0.6143878462913316, "grad_norm": 1.236297369003296, "learning_rate": 1.5355972594578494e-05, "loss": 0.7453, "num_input_tokens_seen": 2397568, "step": 4125 }, { "epoch": 0.6151325588322908, "grad_norm": 0.9730316996574402, "learning_rate": 1.5374590408102472e-05, "loss": 0.7818, "num_input_tokens_seen": 2400416, "step": 4130 }, { "epoch": 0.6158772713732499, "grad_norm": 1.1148568391799927, "learning_rate": 1.5393208221626454e-05, "loss": 0.8207, "num_input_tokens_seen": 2403360, "step": 4135 }, { "epoch": 0.6166219839142091, "grad_norm": 1.0377554893493652, "learning_rate": 1.5411826035150436e-05, "loss": 0.6467, "num_input_tokens_seen": 2406112, "step": 4140 }, { "epoch": 0.6173666964551683, "grad_norm": 1.2391294240951538, "learning_rate": 1.543044384867441e-05, "loss": 0.7603, "num_input_tokens_seen": 2409120, "step": 4145 }, { "epoch": 0.6181114089961275, "grad_norm": 0.978664755821228, "learning_rate": 1.5449061662198392e-05, "loss": 0.6342, "num_input_tokens_seen": 2412000, "step": 4150 }, { "epoch": 0.6188561215370867, "grad_norm": 0.9309139251708984, "learning_rate": 1.546767947572237e-05, "loss": 0.6131, "num_input_tokens_seen": 2414624, "step": 4155 }, { "epoch": 0.6196008340780459, "grad_norm": 0.7581933736801147, "learning_rate": 1.5486297289246352e-05, "loss": 0.858, "num_input_tokens_seen": 2417280, "step": 4160 }, { "epoch": 0.6203455466190051, "grad_norm": 1.329095482826233, "learning_rate": 1.550491510277033e-05, "loss": 0.8095, "num_input_tokens_seen": 2420128, "step": 4165 }, { "epoch": 0.6210902591599643, "grad_norm": 1.1628692150115967, "learning_rate": 1.5523532916294312e-05, "loss": 0.7074, "num_input_tokens_seen": 2423136, "step": 4170 }, { "epoch": 0.6218349717009234, "grad_norm": 0.6995810270309448, "learning_rate": 1.554215072981829e-05, "loss": 0.731, "num_input_tokens_seen": 2426176, "step": 4175 }, { "epoch": 0.6225796842418826, "grad_norm": 0.659412145614624, "learning_rate": 1.5560768543342272e-05, "loss": 0.665, "num_input_tokens_seen": 2429216, "step": 4180 }, { "epoch": 0.6233243967828418, "grad_norm": 1.054045557975769, "learning_rate": 1.557938635686625e-05, "loss": 0.8238, "num_input_tokens_seen": 2432416, "step": 4185 }, { "epoch": 0.624069109323801, "grad_norm": 0.8872559666633606, "learning_rate": 1.5598004170390228e-05, "loss": 0.7284, "num_input_tokens_seen": 2435264, "step": 4190 }, { "epoch": 0.6248138218647602, "grad_norm": 1.3007398843765259, "learning_rate": 1.561662198391421e-05, "loss": 0.767, "num_input_tokens_seen": 2438336, "step": 4195 }, { "epoch": 0.6255585344057194, "grad_norm": 0.9868432879447937, "learning_rate": 1.5635239797438188e-05, "loss": 0.7544, "num_input_tokens_seen": 2441376, "step": 4200 }, { "epoch": 0.6263032469466786, "grad_norm": 0.9726581573486328, "learning_rate": 1.565385761096217e-05, "loss": 0.7297, "num_input_tokens_seen": 2444544, "step": 4205 }, { "epoch": 0.6270479594876378, "grad_norm": 1.8012444972991943, "learning_rate": 1.5672475424486148e-05, "loss": 0.8426, "num_input_tokens_seen": 2447328, "step": 4210 }, { "epoch": 0.627792672028597, "grad_norm": 1.1977384090423584, "learning_rate": 1.569109323801013e-05, "loss": 0.6607, "num_input_tokens_seen": 2450464, "step": 4215 }, { "epoch": 0.6285373845695561, "grad_norm": 0.8115970492362976, "learning_rate": 1.570971105153411e-05, "loss": 0.6975, "num_input_tokens_seen": 2453280, "step": 4220 }, { "epoch": 0.6292820971105153, "grad_norm": 1.8218704462051392, "learning_rate": 1.572832886505809e-05, "loss": 0.8227, "num_input_tokens_seen": 2456256, "step": 4225 }, { "epoch": 0.6300268096514745, "grad_norm": 0.9160654544830322, "learning_rate": 1.5746946678582068e-05, "loss": 0.7264, "num_input_tokens_seen": 2459136, "step": 4230 }, { "epoch": 0.6307715221924337, "grad_norm": 0.9487511515617371, "learning_rate": 1.5765564492106046e-05, "loss": 0.5716, "num_input_tokens_seen": 2461792, "step": 4235 }, { "epoch": 0.6315162347333929, "grad_norm": 2.2700796127319336, "learning_rate": 1.5784182305630027e-05, "loss": 0.8196, "num_input_tokens_seen": 2464768, "step": 4240 }, { "epoch": 0.6322609472743521, "grad_norm": 2.01389217376709, "learning_rate": 1.5802800119154006e-05, "loss": 0.7065, "num_input_tokens_seen": 2467680, "step": 4245 }, { "epoch": 0.6330056598153113, "grad_norm": 1.2490431070327759, "learning_rate": 1.5821417932677987e-05, "loss": 0.7214, "num_input_tokens_seen": 2470464, "step": 4250 }, { "epoch": 0.6337503723562705, "grad_norm": 0.7197795510292053, "learning_rate": 1.5840035746201966e-05, "loss": 0.9141, "num_input_tokens_seen": 2473376, "step": 4255 }, { "epoch": 0.6344950848972297, "grad_norm": 0.9947896599769592, "learning_rate": 1.5858653559725947e-05, "loss": 0.6309, "num_input_tokens_seen": 2476640, "step": 4260 }, { "epoch": 0.6352397974381888, "grad_norm": 1.6522548198699951, "learning_rate": 1.587727137324993e-05, "loss": 0.877, "num_input_tokens_seen": 2479968, "step": 4265 }, { "epoch": 0.635984509979148, "grad_norm": 1.0097204446792603, "learning_rate": 1.5895889186773907e-05, "loss": 0.6487, "num_input_tokens_seen": 2482816, "step": 4270 }, { "epoch": 0.6367292225201072, "grad_norm": 1.6890532970428467, "learning_rate": 1.5914507000297885e-05, "loss": 0.6905, "num_input_tokens_seen": 2485568, "step": 4275 }, { "epoch": 0.6374739350610664, "grad_norm": 1.116841435432434, "learning_rate": 1.5933124813821863e-05, "loss": 0.6904, "num_input_tokens_seen": 2488480, "step": 4280 }, { "epoch": 0.6382186476020256, "grad_norm": 0.8468011021614075, "learning_rate": 1.5951742627345845e-05, "loss": 0.5735, "num_input_tokens_seen": 2491360, "step": 4285 }, { "epoch": 0.6389633601429848, "grad_norm": 1.1045362949371338, "learning_rate": 1.5970360440869823e-05, "loss": 0.704, "num_input_tokens_seen": 2494272, "step": 4290 }, { "epoch": 0.639708072683944, "grad_norm": 0.8849327564239502, "learning_rate": 1.5988978254393805e-05, "loss": 0.7851, "num_input_tokens_seen": 2496928, "step": 4295 }, { "epoch": 0.6404527852249032, "grad_norm": 2.3654732704162598, "learning_rate": 1.6007596067917787e-05, "loss": 0.7878, "num_input_tokens_seen": 2499680, "step": 4300 }, { "epoch": 0.6411974977658623, "grad_norm": 1.054079294204712, "learning_rate": 1.6026213881441765e-05, "loss": 0.7194, "num_input_tokens_seen": 2502400, "step": 4305 }, { "epoch": 0.6419422103068215, "grad_norm": 2.0124268531799316, "learning_rate": 1.6044831694965746e-05, "loss": 0.7352, "num_input_tokens_seen": 2505184, "step": 4310 }, { "epoch": 0.6426869228477807, "grad_norm": 0.899603009223938, "learning_rate": 1.606344950848972e-05, "loss": 0.8736, "num_input_tokens_seen": 2508160, "step": 4315 }, { "epoch": 0.6434316353887399, "grad_norm": 0.8540710210800171, "learning_rate": 1.6082067322013703e-05, "loss": 0.7384, "num_input_tokens_seen": 2510944, "step": 4320 }, { "epoch": 0.6441763479296991, "grad_norm": 1.2380996942520142, "learning_rate": 1.610068513553768e-05, "loss": 0.8001, "num_input_tokens_seen": 2513856, "step": 4325 }, { "epoch": 0.6449210604706583, "grad_norm": 1.214143991470337, "learning_rate": 1.6119302949061663e-05, "loss": 0.7804, "num_input_tokens_seen": 2516512, "step": 4330 }, { "epoch": 0.6456657730116175, "grad_norm": 0.9236889481544495, "learning_rate": 1.6137920762585644e-05, "loss": 0.7381, "num_input_tokens_seen": 2519200, "step": 4335 }, { "epoch": 0.6464104855525767, "grad_norm": 1.523733139038086, "learning_rate": 1.6156538576109623e-05, "loss": 0.8597, "num_input_tokens_seen": 2522016, "step": 4340 }, { "epoch": 0.6471551980935359, "grad_norm": 1.1811411380767822, "learning_rate": 1.6175156389633604e-05, "loss": 0.7597, "num_input_tokens_seen": 2524768, "step": 4345 }, { "epoch": 0.647899910634495, "grad_norm": 1.1761482954025269, "learning_rate": 1.6193774203157582e-05, "loss": 0.773, "num_input_tokens_seen": 2527328, "step": 4350 }, { "epoch": 0.6486446231754542, "grad_norm": 0.9438775181770325, "learning_rate": 1.6212392016681564e-05, "loss": 0.6566, "num_input_tokens_seen": 2530304, "step": 4355 }, { "epoch": 0.6493893357164134, "grad_norm": 1.0359644889831543, "learning_rate": 1.623100983020554e-05, "loss": 0.7077, "num_input_tokens_seen": 2533536, "step": 4360 }, { "epoch": 0.6501340482573726, "grad_norm": 1.058864951133728, "learning_rate": 1.624962764372952e-05, "loss": 0.7105, "num_input_tokens_seen": 2536544, "step": 4365 }, { "epoch": 0.6508787607983318, "grad_norm": 0.8640514016151428, "learning_rate": 1.62682454572535e-05, "loss": 0.7089, "num_input_tokens_seen": 2539456, "step": 4370 }, { "epoch": 0.651623473339291, "grad_norm": 1.1220128536224365, "learning_rate": 1.628686327077748e-05, "loss": 0.6639, "num_input_tokens_seen": 2542528, "step": 4375 }, { "epoch": 0.6523681858802503, "grad_norm": 0.9449430704116821, "learning_rate": 1.6305481084301462e-05, "loss": 0.7068, "num_input_tokens_seen": 2545664, "step": 4380 }, { "epoch": 0.6531128984212095, "grad_norm": 1.1347644329071045, "learning_rate": 1.632409889782544e-05, "loss": 0.7697, "num_input_tokens_seen": 2548544, "step": 4385 }, { "epoch": 0.6538576109621687, "grad_norm": 0.7528411746025085, "learning_rate": 1.6342716711349422e-05, "loss": 0.7529, "num_input_tokens_seen": 2551392, "step": 4390 }, { "epoch": 0.6546023235031277, "grad_norm": 1.7199598550796509, "learning_rate": 1.63613345248734e-05, "loss": 0.8222, "num_input_tokens_seen": 2554336, "step": 4395 }, { "epoch": 0.655347036044087, "grad_norm": 0.9828110933303833, "learning_rate": 1.637995233839738e-05, "loss": 0.8338, "num_input_tokens_seen": 2557056, "step": 4400 }, { "epoch": 0.6560917485850462, "grad_norm": 1.1502844095230103, "learning_rate": 1.6398570151921357e-05, "loss": 0.6637, "num_input_tokens_seen": 2559904, "step": 4405 }, { "epoch": 0.6568364611260054, "grad_norm": 0.9794354438781738, "learning_rate": 1.6417187965445338e-05, "loss": 0.8153, "num_input_tokens_seen": 2562752, "step": 4410 }, { "epoch": 0.6575811736669646, "grad_norm": 1.0617181062698364, "learning_rate": 1.643580577896932e-05, "loss": 0.7459, "num_input_tokens_seen": 2565536, "step": 4415 }, { "epoch": 0.6583258862079238, "grad_norm": 1.5891205072402954, "learning_rate": 1.6454423592493298e-05, "loss": 0.6687, "num_input_tokens_seen": 2568512, "step": 4420 }, { "epoch": 0.659070598748883, "grad_norm": 0.7307397127151489, "learning_rate": 1.647304140601728e-05, "loss": 0.8629, "num_input_tokens_seen": 2571392, "step": 4425 }, { "epoch": 0.6598153112898422, "grad_norm": 0.855374276638031, "learning_rate": 1.6491659219541258e-05, "loss": 0.7711, "num_input_tokens_seen": 2574368, "step": 4430 }, { "epoch": 0.6605600238308013, "grad_norm": 1.5938329696655273, "learning_rate": 1.651027703306524e-05, "loss": 0.7754, "num_input_tokens_seen": 2577440, "step": 4435 }, { "epoch": 0.6613047363717605, "grad_norm": 0.8235436677932739, "learning_rate": 1.6528894846589218e-05, "loss": 0.6615, "num_input_tokens_seen": 2580608, "step": 4440 }, { "epoch": 0.6620494489127197, "grad_norm": 1.335121989250183, "learning_rate": 1.6547512660113196e-05, "loss": 0.7141, "num_input_tokens_seen": 2583232, "step": 4445 }, { "epoch": 0.6627941614536789, "grad_norm": 1.039319634437561, "learning_rate": 1.6566130473637174e-05, "loss": 0.8452, "num_input_tokens_seen": 2586208, "step": 4450 }, { "epoch": 0.6635388739946381, "grad_norm": 0.8239016532897949, "learning_rate": 1.6584748287161156e-05, "loss": 0.7204, "num_input_tokens_seen": 2589120, "step": 4455 }, { "epoch": 0.6642835865355973, "grad_norm": 0.9572461247444153, "learning_rate": 1.6603366100685137e-05, "loss": 0.695, "num_input_tokens_seen": 2591872, "step": 4460 }, { "epoch": 0.6650282990765565, "grad_norm": 1.1265547275543213, "learning_rate": 1.6621983914209116e-05, "loss": 0.7508, "num_input_tokens_seen": 2594944, "step": 4465 }, { "epoch": 0.6657730116175157, "grad_norm": 0.8678054809570312, "learning_rate": 1.6640601727733097e-05, "loss": 0.6393, "num_input_tokens_seen": 2597760, "step": 4470 }, { "epoch": 0.6665177241584749, "grad_norm": 1.2051528692245483, "learning_rate": 1.6659219541257075e-05, "loss": 0.7488, "num_input_tokens_seen": 2600672, "step": 4475 }, { "epoch": 0.667262436699434, "grad_norm": 0.722379744052887, "learning_rate": 1.6677837354781057e-05, "loss": 0.6438, "num_input_tokens_seen": 2603456, "step": 4480 }, { "epoch": 0.6680071492403932, "grad_norm": 3.1602530479431152, "learning_rate": 1.6696455168305035e-05, "loss": 0.8989, "num_input_tokens_seen": 2606496, "step": 4485 }, { "epoch": 0.6687518617813524, "grad_norm": 0.9488964676856995, "learning_rate": 1.6715072981829014e-05, "loss": 0.6332, "num_input_tokens_seen": 2609216, "step": 4490 }, { "epoch": 0.6694965743223116, "grad_norm": 0.8448203206062317, "learning_rate": 1.6733690795352995e-05, "loss": 0.7545, "num_input_tokens_seen": 2611936, "step": 4495 }, { "epoch": 0.6702412868632708, "grad_norm": 1.548484206199646, "learning_rate": 1.6752308608876973e-05, "loss": 0.6984, "num_input_tokens_seen": 2614848, "step": 4500 }, { "epoch": 0.67098599940423, "grad_norm": 0.8816007375717163, "learning_rate": 1.6770926422400955e-05, "loss": 0.8302, "num_input_tokens_seen": 2617856, "step": 4505 }, { "epoch": 0.6717307119451892, "grad_norm": 0.8535122871398926, "learning_rate": 1.6789544235924933e-05, "loss": 0.7225, "num_input_tokens_seen": 2620960, "step": 4510 }, { "epoch": 0.6724754244861484, "grad_norm": 1.0654058456420898, "learning_rate": 1.6808162049448915e-05, "loss": 0.7801, "num_input_tokens_seen": 2623936, "step": 4515 }, { "epoch": 0.6732201370271076, "grad_norm": 1.0967421531677246, "learning_rate": 1.6826779862972893e-05, "loss": 0.7483, "num_input_tokens_seen": 2626816, "step": 4520 }, { "epoch": 0.6739648495680667, "grad_norm": 0.8715873956680298, "learning_rate": 1.6845397676496875e-05, "loss": 0.6995, "num_input_tokens_seen": 2629504, "step": 4525 }, { "epoch": 0.6747095621090259, "grad_norm": 0.7969054579734802, "learning_rate": 1.6864015490020853e-05, "loss": 0.628, "num_input_tokens_seen": 2632032, "step": 4530 }, { "epoch": 0.6754542746499851, "grad_norm": 1.524053692817688, "learning_rate": 1.688263330354483e-05, "loss": 0.852, "num_input_tokens_seen": 2635104, "step": 4535 }, { "epoch": 0.6761989871909443, "grad_norm": 0.7611150145530701, "learning_rate": 1.6901251117068813e-05, "loss": 0.8046, "num_input_tokens_seen": 2637888, "step": 4540 }, { "epoch": 0.6769436997319035, "grad_norm": 0.6599342823028564, "learning_rate": 1.691986893059279e-05, "loss": 0.6756, "num_input_tokens_seen": 2640608, "step": 4545 }, { "epoch": 0.6776884122728627, "grad_norm": 0.6560043096542358, "learning_rate": 1.6938486744116773e-05, "loss": 0.6173, "num_input_tokens_seen": 2643616, "step": 4550 }, { "epoch": 0.6784331248138219, "grad_norm": 0.9711167812347412, "learning_rate": 1.695710455764075e-05, "loss": 0.6661, "num_input_tokens_seen": 2646464, "step": 4555 }, { "epoch": 0.6791778373547811, "grad_norm": 1.0012891292572021, "learning_rate": 1.6975722371164733e-05, "loss": 0.7316, "num_input_tokens_seen": 2649280, "step": 4560 }, { "epoch": 0.6799225498957402, "grad_norm": 0.9748907089233398, "learning_rate": 1.699434018468871e-05, "loss": 0.6944, "num_input_tokens_seen": 2652288, "step": 4565 }, { "epoch": 0.6806672624366994, "grad_norm": 0.7926095724105835, "learning_rate": 1.7012957998212692e-05, "loss": 0.7434, "num_input_tokens_seen": 2655520, "step": 4570 }, { "epoch": 0.6814119749776586, "grad_norm": 0.8060621619224548, "learning_rate": 1.703157581173667e-05, "loss": 0.7165, "num_input_tokens_seen": 2658336, "step": 4575 }, { "epoch": 0.6821566875186178, "grad_norm": 1.5620323419570923, "learning_rate": 1.705019362526065e-05, "loss": 0.861, "num_input_tokens_seen": 2661568, "step": 4580 }, { "epoch": 0.682901400059577, "grad_norm": 0.8737844228744507, "learning_rate": 1.706881143878463e-05, "loss": 0.8737, "num_input_tokens_seen": 2664576, "step": 4585 }, { "epoch": 0.6836461126005362, "grad_norm": 0.7758747935295105, "learning_rate": 1.708742925230861e-05, "loss": 0.7617, "num_input_tokens_seen": 2667584, "step": 4590 }, { "epoch": 0.6843908251414954, "grad_norm": 1.0488361120224, "learning_rate": 1.710604706583259e-05, "loss": 0.7347, "num_input_tokens_seen": 2670272, "step": 4595 }, { "epoch": 0.6851355376824546, "grad_norm": 0.8520858883857727, "learning_rate": 1.712466487935657e-05, "loss": 0.7148, "num_input_tokens_seen": 2673312, "step": 4600 }, { "epoch": 0.6858802502234138, "grad_norm": 0.8194794058799744, "learning_rate": 1.714328269288055e-05, "loss": 0.6794, "num_input_tokens_seen": 2676512, "step": 4605 }, { "epoch": 0.6866249627643729, "grad_norm": 1.2477805614471436, "learning_rate": 1.716190050640453e-05, "loss": 0.677, "num_input_tokens_seen": 2679488, "step": 4610 }, { "epoch": 0.6873696753053321, "grad_norm": 0.8944991827011108, "learning_rate": 1.718051831992851e-05, "loss": 0.6713, "num_input_tokens_seen": 2682208, "step": 4615 }, { "epoch": 0.6881143878462913, "grad_norm": 1.177215576171875, "learning_rate": 1.7199136133452488e-05, "loss": 0.67, "num_input_tokens_seen": 2684992, "step": 4620 }, { "epoch": 0.6888591003872505, "grad_norm": 0.9961646199226379, "learning_rate": 1.7217753946976467e-05, "loss": 0.7968, "num_input_tokens_seen": 2687840, "step": 4625 }, { "epoch": 0.6896038129282097, "grad_norm": 1.1201213598251343, "learning_rate": 1.7236371760500448e-05, "loss": 0.7995, "num_input_tokens_seen": 2690592, "step": 4630 }, { "epoch": 0.6903485254691689, "grad_norm": 1.172155737876892, "learning_rate": 1.7254989574024426e-05, "loss": 0.6672, "num_input_tokens_seen": 2693472, "step": 4635 }, { "epoch": 0.6910932380101281, "grad_norm": 1.5727664232254028, "learning_rate": 1.7273607387548408e-05, "loss": 0.6573, "num_input_tokens_seen": 2696256, "step": 4640 }, { "epoch": 0.6918379505510873, "grad_norm": 1.2705111503601074, "learning_rate": 1.7292225201072386e-05, "loss": 0.8329, "num_input_tokens_seen": 2699104, "step": 4645 }, { "epoch": 0.6925826630920465, "grad_norm": 1.0501744747161865, "learning_rate": 1.7310843014596368e-05, "loss": 0.7243, "num_input_tokens_seen": 2702240, "step": 4650 }, { "epoch": 0.6933273756330056, "grad_norm": 0.9258216619491577, "learning_rate": 1.7329460828120346e-05, "loss": 0.6145, "num_input_tokens_seen": 2704864, "step": 4655 }, { "epoch": 0.6940720881739648, "grad_norm": 1.1578415632247925, "learning_rate": 1.7348078641644328e-05, "loss": 0.7556, "num_input_tokens_seen": 2707552, "step": 4660 }, { "epoch": 0.694816800714924, "grad_norm": 0.7144506573677063, "learning_rate": 1.7366696455168306e-05, "loss": 0.7198, "num_input_tokens_seen": 2710592, "step": 4665 }, { "epoch": 0.6955615132558832, "grad_norm": 0.9658413529396057, "learning_rate": 1.7385314268692284e-05, "loss": 0.8641, "num_input_tokens_seen": 2713504, "step": 4670 }, { "epoch": 0.6963062257968424, "grad_norm": 1.0961179733276367, "learning_rate": 1.7403932082216266e-05, "loss": 0.699, "num_input_tokens_seen": 2716448, "step": 4675 }, { "epoch": 0.6970509383378016, "grad_norm": 1.6990935802459717, "learning_rate": 1.7422549895740244e-05, "loss": 0.72, "num_input_tokens_seen": 2719296, "step": 4680 }, { "epoch": 0.6977956508787608, "grad_norm": 0.792478084564209, "learning_rate": 1.7441167709264226e-05, "loss": 0.754, "num_input_tokens_seen": 2722112, "step": 4685 }, { "epoch": 0.69854036341972, "grad_norm": 1.244199275970459, "learning_rate": 1.7459785522788204e-05, "loss": 0.7402, "num_input_tokens_seen": 2725056, "step": 4690 }, { "epoch": 0.6992850759606791, "grad_norm": 0.9637672901153564, "learning_rate": 1.7478403336312185e-05, "loss": 0.6455, "num_input_tokens_seen": 2727936, "step": 4695 }, { "epoch": 0.7000297885016383, "grad_norm": 0.7123041749000549, "learning_rate": 1.7497021149836164e-05, "loss": 0.838, "num_input_tokens_seen": 2730656, "step": 4700 }, { "epoch": 0.7007745010425975, "grad_norm": 1.1167466640472412, "learning_rate": 1.7515638963360142e-05, "loss": 0.661, "num_input_tokens_seen": 2733888, "step": 4705 }, { "epoch": 0.7015192135835567, "grad_norm": 1.1779786348342896, "learning_rate": 1.7534256776884124e-05, "loss": 0.5691, "num_input_tokens_seen": 2736640, "step": 4710 }, { "epoch": 0.7022639261245159, "grad_norm": 0.9368203282356262, "learning_rate": 1.7552874590408102e-05, "loss": 0.687, "num_input_tokens_seen": 2739360, "step": 4715 }, { "epoch": 0.7030086386654751, "grad_norm": 0.804231584072113, "learning_rate": 1.7571492403932083e-05, "loss": 0.7872, "num_input_tokens_seen": 2742080, "step": 4720 }, { "epoch": 0.7037533512064343, "grad_norm": 1.138701319694519, "learning_rate": 1.759011021745606e-05, "loss": 0.8303, "num_input_tokens_seen": 2744736, "step": 4725 }, { "epoch": 0.7044980637473935, "grad_norm": 0.724285900592804, "learning_rate": 1.7608728030980043e-05, "loss": 0.7837, "num_input_tokens_seen": 2747712, "step": 4730 }, { "epoch": 0.7052427762883527, "grad_norm": 1.0601332187652588, "learning_rate": 1.762734584450402e-05, "loss": 0.7693, "num_input_tokens_seen": 2750688, "step": 4735 }, { "epoch": 0.7059874888293118, "grad_norm": 1.0833158493041992, "learning_rate": 1.7645963658028003e-05, "loss": 0.8147, "num_input_tokens_seen": 2753568, "step": 4740 }, { "epoch": 0.706732201370271, "grad_norm": 0.8026087284088135, "learning_rate": 1.7664581471551985e-05, "loss": 0.7876, "num_input_tokens_seen": 2756416, "step": 4745 }, { "epoch": 0.7074769139112302, "grad_norm": 0.9807589650154114, "learning_rate": 1.768319928507596e-05, "loss": 0.757, "num_input_tokens_seen": 2759488, "step": 4750 }, { "epoch": 0.7082216264521894, "grad_norm": 1.0778698921203613, "learning_rate": 1.770181709859994e-05, "loss": 0.6965, "num_input_tokens_seen": 2762432, "step": 4755 }, { "epoch": 0.7089663389931486, "grad_norm": 0.7117688059806824, "learning_rate": 1.772043491212392e-05, "loss": 0.8255, "num_input_tokens_seen": 2765504, "step": 4760 }, { "epoch": 0.7097110515341079, "grad_norm": 1.2347626686096191, "learning_rate": 1.77390527256479e-05, "loss": 0.7287, "num_input_tokens_seen": 2768416, "step": 4765 }, { "epoch": 0.710455764075067, "grad_norm": 0.61350417137146, "learning_rate": 1.775767053917188e-05, "loss": 0.5871, "num_input_tokens_seen": 2771456, "step": 4770 }, { "epoch": 0.7112004766160263, "grad_norm": 0.8348529934883118, "learning_rate": 1.777628835269586e-05, "loss": 0.7522, "num_input_tokens_seen": 2774720, "step": 4775 }, { "epoch": 0.7119451891569855, "grad_norm": 0.8004449009895325, "learning_rate": 1.779490616621984e-05, "loss": 0.7585, "num_input_tokens_seen": 2777600, "step": 4780 }, { "epoch": 0.7126899016979446, "grad_norm": 1.0189279317855835, "learning_rate": 1.781352397974382e-05, "loss": 0.8206, "num_input_tokens_seen": 2780608, "step": 4785 }, { "epoch": 0.7134346142389038, "grad_norm": 0.883749783039093, "learning_rate": 1.78321417932678e-05, "loss": 0.7804, "num_input_tokens_seen": 2783328, "step": 4790 }, { "epoch": 0.714179326779863, "grad_norm": 1.304351806640625, "learning_rate": 1.7850759606791777e-05, "loss": 0.7258, "num_input_tokens_seen": 2786336, "step": 4795 }, { "epoch": 0.7149240393208222, "grad_norm": 1.2475749254226685, "learning_rate": 1.786937742031576e-05, "loss": 0.6027, "num_input_tokens_seen": 2789024, "step": 4800 }, { "epoch": 0.7156687518617814, "grad_norm": 0.7574827671051025, "learning_rate": 1.7887995233839737e-05, "loss": 0.6605, "num_input_tokens_seen": 2792064, "step": 4805 }, { "epoch": 0.7164134644027406, "grad_norm": 1.084133267402649, "learning_rate": 1.790661304736372e-05, "loss": 0.7905, "num_input_tokens_seen": 2794848, "step": 4810 }, { "epoch": 0.7171581769436998, "grad_norm": 1.2530180215835571, "learning_rate": 1.7925230860887697e-05, "loss": 0.7029, "num_input_tokens_seen": 2797728, "step": 4815 }, { "epoch": 0.717902889484659, "grad_norm": 0.5518854260444641, "learning_rate": 1.794384867441168e-05, "loss": 0.6862, "num_input_tokens_seen": 2800832, "step": 4820 }, { "epoch": 0.7186476020256181, "grad_norm": 0.9170163869857788, "learning_rate": 1.796246648793566e-05, "loss": 0.7559, "num_input_tokens_seen": 2803712, "step": 4825 }, { "epoch": 0.7193923145665773, "grad_norm": 0.977379560470581, "learning_rate": 1.798108430145964e-05, "loss": 0.7618, "num_input_tokens_seen": 2806528, "step": 4830 }, { "epoch": 0.7201370271075365, "grad_norm": 1.0659383535385132, "learning_rate": 1.7999702114983617e-05, "loss": 0.8566, "num_input_tokens_seen": 2809216, "step": 4835 }, { "epoch": 0.7208817396484957, "grad_norm": 0.9553453922271729, "learning_rate": 1.8018319928507595e-05, "loss": 0.6173, "num_input_tokens_seen": 2812224, "step": 4840 }, { "epoch": 0.7216264521894549, "grad_norm": 0.8265039920806885, "learning_rate": 1.8036937742031576e-05, "loss": 0.7849, "num_input_tokens_seen": 2815008, "step": 4845 }, { "epoch": 0.7223711647304141, "grad_norm": 0.8245757818222046, "learning_rate": 1.8055555555555555e-05, "loss": 0.7758, "num_input_tokens_seen": 2817632, "step": 4850 }, { "epoch": 0.7231158772713733, "grad_norm": 0.9654885530471802, "learning_rate": 1.8074173369079536e-05, "loss": 0.7769, "num_input_tokens_seen": 2820352, "step": 4855 }, { "epoch": 0.7238605898123325, "grad_norm": 1.4545817375183105, "learning_rate": 1.8092791182603515e-05, "loss": 0.7212, "num_input_tokens_seen": 2823328, "step": 4860 }, { "epoch": 0.7246053023532917, "grad_norm": 0.9878905415534973, "learning_rate": 1.8111408996127496e-05, "loss": 0.8131, "num_input_tokens_seen": 2826368, "step": 4865 }, { "epoch": 0.7253500148942508, "grad_norm": 2.8329973220825195, "learning_rate": 1.8130026809651478e-05, "loss": 0.7896, "num_input_tokens_seen": 2829504, "step": 4870 }, { "epoch": 0.72609472743521, "grad_norm": 0.9904627799987793, "learning_rate": 1.8148644623175456e-05, "loss": 0.7919, "num_input_tokens_seen": 2832384, "step": 4875 }, { "epoch": 0.7268394399761692, "grad_norm": 0.9353175163269043, "learning_rate": 1.8167262436699434e-05, "loss": 0.8615, "num_input_tokens_seen": 2834976, "step": 4880 }, { "epoch": 0.7275841525171284, "grad_norm": 0.885760486125946, "learning_rate": 1.8185880250223413e-05, "loss": 0.8293, "num_input_tokens_seen": 2837952, "step": 4885 }, { "epoch": 0.7283288650580876, "grad_norm": 0.9273067116737366, "learning_rate": 1.8204498063747394e-05, "loss": 0.7476, "num_input_tokens_seen": 2840736, "step": 4890 }, { "epoch": 0.7290735775990468, "grad_norm": 0.733432412147522, "learning_rate": 1.8223115877271372e-05, "loss": 0.6347, "num_input_tokens_seen": 2843552, "step": 4895 }, { "epoch": 0.729818290140006, "grad_norm": 0.7074889540672302, "learning_rate": 1.8241733690795354e-05, "loss": 0.5923, "num_input_tokens_seen": 2846592, "step": 4900 }, { "epoch": 0.7305630026809652, "grad_norm": 1.2694340944290161, "learning_rate": 1.8260351504319336e-05, "loss": 0.7193, "num_input_tokens_seen": 2849376, "step": 4905 }, { "epoch": 0.7313077152219244, "grad_norm": 1.4275898933410645, "learning_rate": 1.8278969317843314e-05, "loss": 0.9052, "num_input_tokens_seen": 2852032, "step": 4910 }, { "epoch": 0.7320524277628835, "grad_norm": 2.525458335876465, "learning_rate": 1.8297587131367295e-05, "loss": 0.8377, "num_input_tokens_seen": 2855360, "step": 4915 }, { "epoch": 0.7327971403038427, "grad_norm": 0.8099566698074341, "learning_rate": 1.831620494489127e-05, "loss": 0.6005, "num_input_tokens_seen": 2858080, "step": 4920 }, { "epoch": 0.7335418528448019, "grad_norm": 0.8939014673233032, "learning_rate": 1.8334822758415252e-05, "loss": 0.8513, "num_input_tokens_seen": 2861024, "step": 4925 }, { "epoch": 0.7342865653857611, "grad_norm": 1.0271828174591064, "learning_rate": 1.835344057193923e-05, "loss": 0.7466, "num_input_tokens_seen": 2864128, "step": 4930 }, { "epoch": 0.7350312779267203, "grad_norm": 1.057223916053772, "learning_rate": 1.8372058385463212e-05, "loss": 0.6751, "num_input_tokens_seen": 2866912, "step": 4935 }, { "epoch": 0.7357759904676795, "grad_norm": 1.0442454814910889, "learning_rate": 1.8390676198987193e-05, "loss": 0.6584, "num_input_tokens_seen": 2869824, "step": 4940 }, { "epoch": 0.7365207030086387, "grad_norm": 1.5494019985198975, "learning_rate": 1.840929401251117e-05, "loss": 0.8603, "num_input_tokens_seen": 2872576, "step": 4945 }, { "epoch": 0.7372654155495979, "grad_norm": 1.1076806783676147, "learning_rate": 1.8427911826035153e-05, "loss": 0.8155, "num_input_tokens_seen": 2875552, "step": 4950 }, { "epoch": 0.738010128090557, "grad_norm": 0.8040313124656677, "learning_rate": 1.844652963955913e-05, "loss": 0.7013, "num_input_tokens_seen": 2878560, "step": 4955 }, { "epoch": 0.7387548406315162, "grad_norm": 1.1824244260787964, "learning_rate": 1.8465147453083113e-05, "loss": 0.6396, "num_input_tokens_seen": 2881344, "step": 4960 }, { "epoch": 0.7394995531724754, "grad_norm": 0.6896454095840454, "learning_rate": 1.8483765266607088e-05, "loss": 0.8594, "num_input_tokens_seen": 2884416, "step": 4965 }, { "epoch": 0.7402442657134346, "grad_norm": 1.1430799961090088, "learning_rate": 1.850238308013107e-05, "loss": 0.6786, "num_input_tokens_seen": 2887264, "step": 4970 }, { "epoch": 0.7409889782543938, "grad_norm": 1.183379888534546, "learning_rate": 1.8521000893655048e-05, "loss": 0.9047, "num_input_tokens_seen": 2890016, "step": 4975 }, { "epoch": 0.741733690795353, "grad_norm": 0.8105623722076416, "learning_rate": 1.853961870717903e-05, "loss": 0.6451, "num_input_tokens_seen": 2892832, "step": 4980 }, { "epoch": 0.7424784033363122, "grad_norm": 1.4519586563110352, "learning_rate": 1.855823652070301e-05, "loss": 0.8948, "num_input_tokens_seen": 2895904, "step": 4985 }, { "epoch": 0.7432231158772714, "grad_norm": 1.121055245399475, "learning_rate": 1.857685433422699e-05, "loss": 0.8571, "num_input_tokens_seen": 2898848, "step": 4990 }, { "epoch": 0.7439678284182306, "grad_norm": 0.7392324805259705, "learning_rate": 1.859547214775097e-05, "loss": 0.7046, "num_input_tokens_seen": 2901696, "step": 4995 }, { "epoch": 0.7447125409591897, "grad_norm": 1.179587960243225, "learning_rate": 1.861408996127495e-05, "loss": 0.8819, "num_input_tokens_seen": 2904224, "step": 5000 }, { "epoch": 0.7454572535001489, "grad_norm": 1.0136088132858276, "learning_rate": 1.863270777479893e-05, "loss": 0.7803, "num_input_tokens_seen": 2906848, "step": 5005 }, { "epoch": 0.7462019660411081, "grad_norm": 2.0267844200134277, "learning_rate": 1.8651325588322906e-05, "loss": 0.7798, "num_input_tokens_seen": 2909568, "step": 5010 }, { "epoch": 0.7469466785820673, "grad_norm": 1.0562494993209839, "learning_rate": 1.8669943401846887e-05, "loss": 0.792, "num_input_tokens_seen": 2912352, "step": 5015 }, { "epoch": 0.7476913911230265, "grad_norm": 1.3591195344924927, "learning_rate": 1.868856121537087e-05, "loss": 0.663, "num_input_tokens_seen": 2914720, "step": 5020 }, { "epoch": 0.7484361036639857, "grad_norm": 0.9201484322547913, "learning_rate": 1.8707179028894847e-05, "loss": 0.7802, "num_input_tokens_seen": 2917536, "step": 5025 }, { "epoch": 0.7491808162049449, "grad_norm": 0.9147063493728638, "learning_rate": 1.872579684241883e-05, "loss": 0.8008, "num_input_tokens_seen": 2920448, "step": 5030 }, { "epoch": 0.7499255287459041, "grad_norm": 1.3171766996383667, "learning_rate": 1.8744414655942807e-05, "loss": 0.799, "num_input_tokens_seen": 2923712, "step": 5035 }, { "epoch": 0.7506702412868632, "grad_norm": 0.787266731262207, "learning_rate": 1.876303246946679e-05, "loss": 0.6912, "num_input_tokens_seen": 2926528, "step": 5040 }, { "epoch": 0.7514149538278224, "grad_norm": 0.8815605044364929, "learning_rate": 1.8781650282990767e-05, "loss": 0.6901, "num_input_tokens_seen": 2929280, "step": 5045 }, { "epoch": 0.7521596663687816, "grad_norm": 0.9342383146286011, "learning_rate": 1.8800268096514745e-05, "loss": 0.8498, "num_input_tokens_seen": 2932288, "step": 5050 }, { "epoch": 0.7529043789097408, "grad_norm": 1.0170035362243652, "learning_rate": 1.8818885910038723e-05, "loss": 0.7521, "num_input_tokens_seen": 2934976, "step": 5055 }, { "epoch": 0.7536490914507, "grad_norm": 0.8957746028900146, "learning_rate": 1.8837503723562705e-05, "loss": 0.8019, "num_input_tokens_seen": 2937856, "step": 5060 }, { "epoch": 0.7543938039916592, "grad_norm": 0.7553759813308716, "learning_rate": 1.8856121537086686e-05, "loss": 0.6478, "num_input_tokens_seen": 2940384, "step": 5065 }, { "epoch": 0.7551385165326184, "grad_norm": 0.6602668166160583, "learning_rate": 1.8874739350610665e-05, "loss": 0.713, "num_input_tokens_seen": 2943232, "step": 5070 }, { "epoch": 0.7558832290735776, "grad_norm": 1.9141215085983276, "learning_rate": 1.8893357164134646e-05, "loss": 0.8958, "num_input_tokens_seen": 2946208, "step": 5075 }, { "epoch": 0.7566279416145368, "grad_norm": 1.1847620010375977, "learning_rate": 1.8911974977658625e-05, "loss": 0.6598, "num_input_tokens_seen": 2949280, "step": 5080 }, { "epoch": 0.7573726541554959, "grad_norm": 1.0175971984863281, "learning_rate": 1.8930592791182606e-05, "loss": 0.8224, "num_input_tokens_seen": 2951808, "step": 5085 }, { "epoch": 0.7581173666964551, "grad_norm": 1.323228359222412, "learning_rate": 1.8949210604706584e-05, "loss": 0.6704, "num_input_tokens_seen": 2954560, "step": 5090 }, { "epoch": 0.7588620792374143, "grad_norm": 0.867445707321167, "learning_rate": 1.8967828418230563e-05, "loss": 0.7857, "num_input_tokens_seen": 2957376, "step": 5095 }, { "epoch": 0.7596067917783735, "grad_norm": 0.8723859190940857, "learning_rate": 1.8986446231754544e-05, "loss": 0.6063, "num_input_tokens_seen": 2960352, "step": 5100 }, { "epoch": 0.7603515043193327, "grad_norm": 0.8764175176620483, "learning_rate": 1.9005064045278523e-05, "loss": 0.8021, "num_input_tokens_seen": 2963328, "step": 5105 }, { "epoch": 0.7610962168602919, "grad_norm": 0.6127743721008301, "learning_rate": 1.9023681858802504e-05, "loss": 0.6904, "num_input_tokens_seen": 2966208, "step": 5110 }, { "epoch": 0.7618409294012511, "grad_norm": 0.937519907951355, "learning_rate": 1.9042299672326482e-05, "loss": 0.7384, "num_input_tokens_seen": 2969120, "step": 5115 }, { "epoch": 0.7625856419422103, "grad_norm": 1.0166327953338623, "learning_rate": 1.9060917485850464e-05, "loss": 0.7435, "num_input_tokens_seen": 2971776, "step": 5120 }, { "epoch": 0.7633303544831695, "grad_norm": 1.1288949251174927, "learning_rate": 1.9079535299374442e-05, "loss": 0.7678, "num_input_tokens_seen": 2974560, "step": 5125 }, { "epoch": 0.7640750670241286, "grad_norm": 0.838300347328186, "learning_rate": 1.9098153112898424e-05, "loss": 0.7006, "num_input_tokens_seen": 2977376, "step": 5130 }, { "epoch": 0.7648197795650878, "grad_norm": 0.8217558860778809, "learning_rate": 1.9116770926422402e-05, "loss": 0.6843, "num_input_tokens_seen": 2980192, "step": 5135 }, { "epoch": 0.765564492106047, "grad_norm": 1.4878672361373901, "learning_rate": 1.913538873994638e-05, "loss": 0.8396, "num_input_tokens_seen": 2982848, "step": 5140 }, { "epoch": 0.7663092046470062, "grad_norm": 0.6860877275466919, "learning_rate": 1.9154006553470362e-05, "loss": 0.6782, "num_input_tokens_seen": 2985344, "step": 5145 }, { "epoch": 0.7670539171879655, "grad_norm": 0.6514309048652649, "learning_rate": 1.917262436699434e-05, "loss": 0.792, "num_input_tokens_seen": 2988128, "step": 5150 }, { "epoch": 0.7677986297289247, "grad_norm": 0.6134268641471863, "learning_rate": 1.9191242180518322e-05, "loss": 0.6925, "num_input_tokens_seen": 2991040, "step": 5155 }, { "epoch": 0.7685433422698839, "grad_norm": 1.2343363761901855, "learning_rate": 1.92098599940423e-05, "loss": 0.6949, "num_input_tokens_seen": 2994112, "step": 5160 }, { "epoch": 0.7692880548108431, "grad_norm": 1.1552643775939941, "learning_rate": 1.922847780756628e-05, "loss": 0.8329, "num_input_tokens_seen": 2996960, "step": 5165 }, { "epoch": 0.7700327673518021, "grad_norm": 0.6446408629417419, "learning_rate": 1.924709562109026e-05, "loss": 0.722, "num_input_tokens_seen": 2999680, "step": 5170 }, { "epoch": 0.7707774798927614, "grad_norm": 1.8672552108764648, "learning_rate": 1.926571343461424e-05, "loss": 0.7988, "num_input_tokens_seen": 3002336, "step": 5175 }, { "epoch": 0.7715221924337206, "grad_norm": 0.7773911356925964, "learning_rate": 1.928433124813822e-05, "loss": 0.7055, "num_input_tokens_seen": 3005216, "step": 5180 }, { "epoch": 0.7722669049746798, "grad_norm": 1.248471736907959, "learning_rate": 1.9302949061662198e-05, "loss": 0.7314, "num_input_tokens_seen": 3008032, "step": 5185 }, { "epoch": 0.773011617515639, "grad_norm": 0.8157533407211304, "learning_rate": 1.932156687518618e-05, "loss": 0.7235, "num_input_tokens_seen": 3010848, "step": 5190 }, { "epoch": 0.7737563300565982, "grad_norm": 1.4044054746627808, "learning_rate": 1.9340184688710158e-05, "loss": 0.7465, "num_input_tokens_seen": 3013568, "step": 5195 }, { "epoch": 0.7745010425975574, "grad_norm": 1.1153078079223633, "learning_rate": 1.935880250223414e-05, "loss": 0.6956, "num_input_tokens_seen": 3016512, "step": 5200 }, { "epoch": 0.7752457551385166, "grad_norm": 1.3257050514221191, "learning_rate": 1.9377420315758118e-05, "loss": 0.8414, "num_input_tokens_seen": 3019424, "step": 5205 }, { "epoch": 0.7759904676794758, "grad_norm": 0.9345287680625916, "learning_rate": 1.93960381292821e-05, "loss": 0.813, "num_input_tokens_seen": 3022368, "step": 5210 }, { "epoch": 0.7767351802204349, "grad_norm": 1.2269099950790405, "learning_rate": 1.9414655942806077e-05, "loss": 0.8242, "num_input_tokens_seen": 3025120, "step": 5215 }, { "epoch": 0.7774798927613941, "grad_norm": 0.936810314655304, "learning_rate": 1.943327375633006e-05, "loss": 0.6649, "num_input_tokens_seen": 3028032, "step": 5220 }, { "epoch": 0.7782246053023533, "grad_norm": 0.8655732274055481, "learning_rate": 1.9451891569854037e-05, "loss": 0.6711, "num_input_tokens_seen": 3030624, "step": 5225 }, { "epoch": 0.7789693178433125, "grad_norm": 1.3695474863052368, "learning_rate": 1.9470509383378016e-05, "loss": 0.7952, "num_input_tokens_seen": 3033472, "step": 5230 }, { "epoch": 0.7797140303842717, "grad_norm": 1.3363656997680664, "learning_rate": 1.9489127196901997e-05, "loss": 0.7479, "num_input_tokens_seen": 3036384, "step": 5235 }, { "epoch": 0.7804587429252309, "grad_norm": 1.2266234159469604, "learning_rate": 1.9507745010425975e-05, "loss": 0.7091, "num_input_tokens_seen": 3039200, "step": 5240 }, { "epoch": 0.7812034554661901, "grad_norm": 0.9528170228004456, "learning_rate": 1.9526362823949957e-05, "loss": 0.764, "num_input_tokens_seen": 3042208, "step": 5245 }, { "epoch": 0.7819481680071493, "grad_norm": 1.9203678369522095, "learning_rate": 1.9544980637473935e-05, "loss": 0.6872, "num_input_tokens_seen": 3044992, "step": 5250 }, { "epoch": 0.7826928805481085, "grad_norm": 1.280224323272705, "learning_rate": 1.9563598450997917e-05, "loss": 0.7022, "num_input_tokens_seen": 3048288, "step": 5255 }, { "epoch": 0.7834375930890676, "grad_norm": 0.6809734106063843, "learning_rate": 1.9582216264521895e-05, "loss": 0.7668, "num_input_tokens_seen": 3051072, "step": 5260 }, { "epoch": 0.7841823056300268, "grad_norm": 0.8640082478523254, "learning_rate": 1.9600834078045877e-05, "loss": 0.6518, "num_input_tokens_seen": 3054016, "step": 5265 }, { "epoch": 0.784927018170986, "grad_norm": 1.0790574550628662, "learning_rate": 1.9619451891569855e-05, "loss": 0.7387, "num_input_tokens_seen": 3057088, "step": 5270 }, { "epoch": 0.7856717307119452, "grad_norm": 0.935377836227417, "learning_rate": 1.9638069705093833e-05, "loss": 0.7093, "num_input_tokens_seen": 3060096, "step": 5275 }, { "epoch": 0.7864164432529044, "grad_norm": 0.8971146941184998, "learning_rate": 1.9656687518617815e-05, "loss": 0.7983, "num_input_tokens_seen": 3063040, "step": 5280 }, { "epoch": 0.7871611557938636, "grad_norm": 0.7480303645133972, "learning_rate": 1.9675305332141793e-05, "loss": 0.7389, "num_input_tokens_seen": 3065824, "step": 5285 }, { "epoch": 0.7879058683348228, "grad_norm": 1.0782907009124756, "learning_rate": 1.9693923145665775e-05, "loss": 0.7572, "num_input_tokens_seen": 3068672, "step": 5290 }, { "epoch": 0.788650580875782, "grad_norm": 0.7668389678001404, "learning_rate": 1.9712540959189753e-05, "loss": 0.7079, "num_input_tokens_seen": 3071616, "step": 5295 }, { "epoch": 0.7893952934167411, "grad_norm": 0.7496082186698914, "learning_rate": 1.9731158772713735e-05, "loss": 0.7033, "num_input_tokens_seen": 3074432, "step": 5300 }, { "epoch": 0.7901400059577003, "grad_norm": 1.8950011730194092, "learning_rate": 1.9749776586237713e-05, "loss": 0.7171, "num_input_tokens_seen": 3077472, "step": 5305 }, { "epoch": 0.7908847184986595, "grad_norm": 0.8117872476577759, "learning_rate": 1.976839439976169e-05, "loss": 0.7474, "num_input_tokens_seen": 3080544, "step": 5310 }, { "epoch": 0.7916294310396187, "grad_norm": 1.1002696752548218, "learning_rate": 1.9787012213285673e-05, "loss": 0.7985, "num_input_tokens_seen": 3083584, "step": 5315 }, { "epoch": 0.7923741435805779, "grad_norm": 0.8522593975067139, "learning_rate": 1.980563002680965e-05, "loss": 0.7894, "num_input_tokens_seen": 3086208, "step": 5320 }, { "epoch": 0.7931188561215371, "grad_norm": 0.9042148590087891, "learning_rate": 1.9824247840333632e-05, "loss": 0.6849, "num_input_tokens_seen": 3089152, "step": 5325 }, { "epoch": 0.7938635686624963, "grad_norm": 2.291433095932007, "learning_rate": 1.984286565385761e-05, "loss": 0.8021, "num_input_tokens_seen": 3091968, "step": 5330 }, { "epoch": 0.7946082812034555, "grad_norm": 0.9141678810119629, "learning_rate": 1.9861483467381592e-05, "loss": 0.7591, "num_input_tokens_seen": 3095072, "step": 5335 }, { "epoch": 0.7953529937444147, "grad_norm": 0.6105952262878418, "learning_rate": 1.988010128090557e-05, "loss": 0.6643, "num_input_tokens_seen": 3097920, "step": 5340 }, { "epoch": 0.7960977062853738, "grad_norm": 1.0130480527877808, "learning_rate": 1.9898719094429552e-05, "loss": 0.6429, "num_input_tokens_seen": 3101056, "step": 5345 }, { "epoch": 0.796842418826333, "grad_norm": 1.062715768814087, "learning_rate": 1.9917336907953534e-05, "loss": 0.6043, "num_input_tokens_seen": 3103904, "step": 5350 }, { "epoch": 0.7975871313672922, "grad_norm": 1.3033955097198486, "learning_rate": 1.993595472147751e-05, "loss": 0.8402, "num_input_tokens_seen": 3106912, "step": 5355 }, { "epoch": 0.7983318439082514, "grad_norm": 0.8861692547798157, "learning_rate": 1.995457253500149e-05, "loss": 0.7415, "num_input_tokens_seen": 3109408, "step": 5360 }, { "epoch": 0.7990765564492106, "grad_norm": 1.046314001083374, "learning_rate": 1.997319034852547e-05, "loss": 0.711, "num_input_tokens_seen": 3112352, "step": 5365 }, { "epoch": 0.7998212689901698, "grad_norm": 0.8677878379821777, "learning_rate": 1.999180816204945e-05, "loss": 0.7945, "num_input_tokens_seen": 3115328, "step": 5370 }, { "epoch": 0.800565981531129, "grad_norm": 0.9339365363121033, "learning_rate": 2.001042597557343e-05, "loss": 0.7942, "num_input_tokens_seen": 3118112, "step": 5375 }, { "epoch": 0.8013106940720882, "grad_norm": 0.6308919191360474, "learning_rate": 2.002904378909741e-05, "loss": 0.7297, "num_input_tokens_seen": 3121280, "step": 5380 }, { "epoch": 0.8020554066130474, "grad_norm": 0.7070090174674988, "learning_rate": 2.0047661602621388e-05, "loss": 0.6758, "num_input_tokens_seen": 3124352, "step": 5385 }, { "epoch": 0.8028001191540065, "grad_norm": 0.7551192045211792, "learning_rate": 2.006627941614537e-05, "loss": 0.5804, "num_input_tokens_seen": 3127296, "step": 5390 }, { "epoch": 0.8035448316949657, "grad_norm": 0.7536697387695312, "learning_rate": 2.008489722966935e-05, "loss": 0.7575, "num_input_tokens_seen": 3130336, "step": 5395 }, { "epoch": 0.8042895442359249, "grad_norm": 0.5967417359352112, "learning_rate": 2.0103515043193326e-05, "loss": 0.6963, "num_input_tokens_seen": 3133408, "step": 5400 }, { "epoch": 0.8050342567768841, "grad_norm": 0.7384681105613708, "learning_rate": 2.0122132856717308e-05, "loss": 0.6904, "num_input_tokens_seen": 3136352, "step": 5405 }, { "epoch": 0.8057789693178433, "grad_norm": 1.328580379486084, "learning_rate": 2.0140750670241286e-05, "loss": 0.7126, "num_input_tokens_seen": 3139232, "step": 5410 }, { "epoch": 0.8065236818588025, "grad_norm": 1.1082572937011719, "learning_rate": 2.0159368483765268e-05, "loss": 0.6382, "num_input_tokens_seen": 3142080, "step": 5415 }, { "epoch": 0.8072683943997617, "grad_norm": 1.1724473237991333, "learning_rate": 2.0177986297289246e-05, "loss": 0.8182, "num_input_tokens_seen": 3144864, "step": 5420 }, { "epoch": 0.8080131069407209, "grad_norm": 0.815814733505249, "learning_rate": 2.0196604110813228e-05, "loss": 0.6975, "num_input_tokens_seen": 3147712, "step": 5425 }, { "epoch": 0.80875781948168, "grad_norm": 2.271585464477539, "learning_rate": 2.021522192433721e-05, "loss": 0.7591, "num_input_tokens_seen": 3150688, "step": 5430 }, { "epoch": 0.8095025320226392, "grad_norm": 1.6204464435577393, "learning_rate": 2.0233839737861187e-05, "loss": 0.7205, "num_input_tokens_seen": 3153760, "step": 5435 }, { "epoch": 0.8102472445635984, "grad_norm": 0.9167062044143677, "learning_rate": 2.0252457551385166e-05, "loss": 0.7009, "num_input_tokens_seen": 3156896, "step": 5440 }, { "epoch": 0.8109919571045576, "grad_norm": 1.4267003536224365, "learning_rate": 2.0271075364909144e-05, "loss": 0.82, "num_input_tokens_seen": 3159648, "step": 5445 }, { "epoch": 0.8117366696455168, "grad_norm": 1.1168702840805054, "learning_rate": 2.0289693178433126e-05, "loss": 0.7476, "num_input_tokens_seen": 3162432, "step": 5450 }, { "epoch": 0.812481382186476, "grad_norm": 0.765704870223999, "learning_rate": 2.0308310991957104e-05, "loss": 0.6731, "num_input_tokens_seen": 3165088, "step": 5455 }, { "epoch": 0.8132260947274352, "grad_norm": 2.391406536102295, "learning_rate": 2.0326928805481085e-05, "loss": 0.7261, "num_input_tokens_seen": 3167936, "step": 5460 }, { "epoch": 0.8139708072683944, "grad_norm": 1.8349345922470093, "learning_rate": 2.0345546619005064e-05, "loss": 0.8081, "num_input_tokens_seen": 3170848, "step": 5465 }, { "epoch": 0.8147155198093536, "grad_norm": 0.6811018586158752, "learning_rate": 2.0364164432529045e-05, "loss": 0.7023, "num_input_tokens_seen": 3173760, "step": 5470 }, { "epoch": 0.8154602323503127, "grad_norm": 2.2036197185516357, "learning_rate": 2.0382782246053027e-05, "loss": 0.7844, "num_input_tokens_seen": 3176288, "step": 5475 }, { "epoch": 0.8162049448912719, "grad_norm": 0.9882214069366455, "learning_rate": 2.0401400059577005e-05, "loss": 0.8119, "num_input_tokens_seen": 3179392, "step": 5480 }, { "epoch": 0.8169496574322311, "grad_norm": 1.2166911363601685, "learning_rate": 2.0420017873100983e-05, "loss": 0.6511, "num_input_tokens_seen": 3182080, "step": 5485 }, { "epoch": 0.8176943699731903, "grad_norm": 0.9874687790870667, "learning_rate": 2.043863568662496e-05, "loss": 0.6659, "num_input_tokens_seen": 3184480, "step": 5490 }, { "epoch": 0.8184390825141495, "grad_norm": 0.7939738631248474, "learning_rate": 2.0457253500148943e-05, "loss": 0.6902, "num_input_tokens_seen": 3187424, "step": 5495 }, { "epoch": 0.8191837950551087, "grad_norm": 0.9984259009361267, "learning_rate": 2.047587131367292e-05, "loss": 0.6292, "num_input_tokens_seen": 3190304, "step": 5500 }, { "epoch": 0.819928507596068, "grad_norm": 1.0236331224441528, "learning_rate": 2.0494489127196903e-05, "loss": 0.7466, "num_input_tokens_seen": 3193280, "step": 5505 }, { "epoch": 0.8206732201370271, "grad_norm": 0.8515987396240234, "learning_rate": 2.0513106940720885e-05, "loss": 0.6823, "num_input_tokens_seen": 3196160, "step": 5510 }, { "epoch": 0.8214179326779864, "grad_norm": 0.8657733201980591, "learning_rate": 2.0531724754244863e-05, "loss": 0.7921, "num_input_tokens_seen": 3199072, "step": 5515 }, { "epoch": 0.8221626452189454, "grad_norm": 1.0391792058944702, "learning_rate": 2.0550342567768845e-05, "loss": 0.7261, "num_input_tokens_seen": 3201856, "step": 5520 }, { "epoch": 0.8229073577599046, "grad_norm": 0.999819815158844, "learning_rate": 2.0568960381292823e-05, "loss": 0.8429, "num_input_tokens_seen": 3204672, "step": 5525 }, { "epoch": 0.8236520703008638, "grad_norm": 1.0899251699447632, "learning_rate": 2.05875781948168e-05, "loss": 0.7548, "num_input_tokens_seen": 3207392, "step": 5530 }, { "epoch": 0.824396782841823, "grad_norm": 1.1160403490066528, "learning_rate": 2.060619600834078e-05, "loss": 0.7555, "num_input_tokens_seen": 3210208, "step": 5535 }, { "epoch": 0.8251414953827823, "grad_norm": 0.9915401935577393, "learning_rate": 2.062481382186476e-05, "loss": 0.734, "num_input_tokens_seen": 3212832, "step": 5540 }, { "epoch": 0.8258862079237415, "grad_norm": 0.8807947039604187, "learning_rate": 2.0643431635388742e-05, "loss": 0.7092, "num_input_tokens_seen": 3215360, "step": 5545 }, { "epoch": 0.8266309204647007, "grad_norm": 0.8777509331703186, "learning_rate": 2.066204944891272e-05, "loss": 0.6915, "num_input_tokens_seen": 3218208, "step": 5550 }, { "epoch": 0.8273756330056599, "grad_norm": 0.9280523657798767, "learning_rate": 2.0680667262436702e-05, "loss": 0.768, "num_input_tokens_seen": 3221248, "step": 5555 }, { "epoch": 0.828120345546619, "grad_norm": 1.3565312623977661, "learning_rate": 2.069928507596068e-05, "loss": 0.6899, "num_input_tokens_seen": 3224416, "step": 5560 }, { "epoch": 0.8288650580875782, "grad_norm": 0.7299878001213074, "learning_rate": 2.0717902889484662e-05, "loss": 0.6425, "num_input_tokens_seen": 3227104, "step": 5565 }, { "epoch": 0.8296097706285374, "grad_norm": 0.7380046248435974, "learning_rate": 2.0736520703008637e-05, "loss": 0.7423, "num_input_tokens_seen": 3229856, "step": 5570 }, { "epoch": 0.8303544831694966, "grad_norm": 1.817415714263916, "learning_rate": 2.075513851653262e-05, "loss": 0.8198, "num_input_tokens_seen": 3232960, "step": 5575 }, { "epoch": 0.8310991957104558, "grad_norm": 0.816281259059906, "learning_rate": 2.0773756330056597e-05, "loss": 0.6938, "num_input_tokens_seen": 3235840, "step": 5580 }, { "epoch": 0.831843908251415, "grad_norm": 1.0589195489883423, "learning_rate": 2.079237414358058e-05, "loss": 0.7535, "num_input_tokens_seen": 3238496, "step": 5585 }, { "epoch": 0.8325886207923742, "grad_norm": 0.5852819681167603, "learning_rate": 2.081099195710456e-05, "loss": 0.5314, "num_input_tokens_seen": 3241088, "step": 5590 }, { "epoch": 0.8333333333333334, "grad_norm": 0.923941969871521, "learning_rate": 2.082960977062854e-05, "loss": 0.7599, "num_input_tokens_seen": 3244032, "step": 5595 }, { "epoch": 0.8340780458742926, "grad_norm": 1.5015379190444946, "learning_rate": 2.084822758415252e-05, "loss": 0.6761, "num_input_tokens_seen": 3247040, "step": 5600 }, { "epoch": 0.8348227584152517, "grad_norm": 0.9120040535926819, "learning_rate": 2.0866845397676498e-05, "loss": 0.8254, "num_input_tokens_seen": 3250080, "step": 5605 }, { "epoch": 0.8355674709562109, "grad_norm": 1.452939748764038, "learning_rate": 2.088546321120048e-05, "loss": 0.8538, "num_input_tokens_seen": 3252992, "step": 5610 }, { "epoch": 0.8363121834971701, "grad_norm": 0.6785905361175537, "learning_rate": 2.0904081024724455e-05, "loss": 0.7422, "num_input_tokens_seen": 3256032, "step": 5615 }, { "epoch": 0.8370568960381293, "grad_norm": 0.8811051249504089, "learning_rate": 2.0922698838248436e-05, "loss": 0.781, "num_input_tokens_seen": 3258656, "step": 5620 }, { "epoch": 0.8378016085790885, "grad_norm": 0.9036545753479004, "learning_rate": 2.0941316651772418e-05, "loss": 0.6405, "num_input_tokens_seen": 3261472, "step": 5625 }, { "epoch": 0.8385463211200477, "grad_norm": 1.2438772916793823, "learning_rate": 2.0959934465296396e-05, "loss": 0.7589, "num_input_tokens_seen": 3264576, "step": 5630 }, { "epoch": 0.8392910336610069, "grad_norm": 1.0132428407669067, "learning_rate": 2.0978552278820378e-05, "loss": 0.6798, "num_input_tokens_seen": 3267488, "step": 5635 }, { "epoch": 0.8400357462019661, "grad_norm": 1.6523507833480835, "learning_rate": 2.0997170092344356e-05, "loss": 0.6709, "num_input_tokens_seen": 3270432, "step": 5640 }, { "epoch": 0.8407804587429253, "grad_norm": 1.0640618801116943, "learning_rate": 2.1015787905868338e-05, "loss": 0.69, "num_input_tokens_seen": 3273216, "step": 5645 }, { "epoch": 0.8415251712838844, "grad_norm": 0.9308984875679016, "learning_rate": 2.1034405719392316e-05, "loss": 0.626, "num_input_tokens_seen": 3276448, "step": 5650 }, { "epoch": 0.8422698838248436, "grad_norm": 1.330808401107788, "learning_rate": 2.1053023532916297e-05, "loss": 0.7884, "num_input_tokens_seen": 3279296, "step": 5655 }, { "epoch": 0.8430145963658028, "grad_norm": 2.0441670417785645, "learning_rate": 2.1071641346440272e-05, "loss": 0.8053, "num_input_tokens_seen": 3282112, "step": 5660 }, { "epoch": 0.843759308906762, "grad_norm": 0.6984859108924866, "learning_rate": 2.1090259159964254e-05, "loss": 0.7602, "num_input_tokens_seen": 3285344, "step": 5665 }, { "epoch": 0.8445040214477212, "grad_norm": 0.9884188175201416, "learning_rate": 2.1108876973488236e-05, "loss": 0.836, "num_input_tokens_seen": 3288384, "step": 5670 }, { "epoch": 0.8452487339886804, "grad_norm": 1.2453994750976562, "learning_rate": 2.1127494787012214e-05, "loss": 0.667, "num_input_tokens_seen": 3291360, "step": 5675 }, { "epoch": 0.8459934465296396, "grad_norm": 0.7807565331459045, "learning_rate": 2.1146112600536195e-05, "loss": 0.7243, "num_input_tokens_seen": 3294496, "step": 5680 }, { "epoch": 0.8467381590705988, "grad_norm": 1.8553359508514404, "learning_rate": 2.1164730414060174e-05, "loss": 0.8909, "num_input_tokens_seen": 3297216, "step": 5685 }, { "epoch": 0.8474828716115579, "grad_norm": 0.8910384178161621, "learning_rate": 2.1183348227584155e-05, "loss": 0.7047, "num_input_tokens_seen": 3300000, "step": 5690 }, { "epoch": 0.8482275841525171, "grad_norm": 0.9726409316062927, "learning_rate": 2.1201966041108133e-05, "loss": 0.6649, "num_input_tokens_seen": 3302560, "step": 5695 }, { "epoch": 0.8489722966934763, "grad_norm": 0.5523024797439575, "learning_rate": 2.1220583854632112e-05, "loss": 0.6687, "num_input_tokens_seen": 3305312, "step": 5700 }, { "epoch": 0.8497170092344355, "grad_norm": 0.8805645704269409, "learning_rate": 2.1239201668156093e-05, "loss": 0.7585, "num_input_tokens_seen": 3308192, "step": 5705 }, { "epoch": 0.8504617217753947, "grad_norm": 0.8517377972602844, "learning_rate": 2.125781948168007e-05, "loss": 0.9111, "num_input_tokens_seen": 3311296, "step": 5710 }, { "epoch": 0.8512064343163539, "grad_norm": 0.4981464743614197, "learning_rate": 2.1276437295204053e-05, "loss": 0.7251, "num_input_tokens_seen": 3313984, "step": 5715 }, { "epoch": 0.8519511468573131, "grad_norm": 1.2439910173416138, "learning_rate": 2.129505510872803e-05, "loss": 0.7337, "num_input_tokens_seen": 3316864, "step": 5720 }, { "epoch": 0.8526958593982723, "grad_norm": 2.428814649581909, "learning_rate": 2.1313672922252013e-05, "loss": 0.9046, "num_input_tokens_seen": 3319840, "step": 5725 }, { "epoch": 0.8534405719392315, "grad_norm": 2.6912999153137207, "learning_rate": 2.133229073577599e-05, "loss": 0.8105, "num_input_tokens_seen": 3322720, "step": 5730 }, { "epoch": 0.8541852844801906, "grad_norm": 1.0631948709487915, "learning_rate": 2.1350908549299973e-05, "loss": 0.7823, "num_input_tokens_seen": 3325536, "step": 5735 }, { "epoch": 0.8549299970211498, "grad_norm": 0.9334118366241455, "learning_rate": 2.136952636282395e-05, "loss": 0.7302, "num_input_tokens_seen": 3328544, "step": 5740 }, { "epoch": 0.855674709562109, "grad_norm": 0.7226126790046692, "learning_rate": 2.138814417634793e-05, "loss": 0.614, "num_input_tokens_seen": 3331520, "step": 5745 }, { "epoch": 0.8564194221030682, "grad_norm": 1.1258221864700317, "learning_rate": 2.140676198987191e-05, "loss": 0.6561, "num_input_tokens_seen": 3334464, "step": 5750 }, { "epoch": 0.8571641346440274, "grad_norm": 0.9832072854042053, "learning_rate": 2.142537980339589e-05, "loss": 0.8213, "num_input_tokens_seen": 3337248, "step": 5755 }, { "epoch": 0.8579088471849866, "grad_norm": 0.7573320865631104, "learning_rate": 2.144399761691987e-05, "loss": 0.5993, "num_input_tokens_seen": 3340128, "step": 5760 }, { "epoch": 0.8586535597259458, "grad_norm": 0.6175960898399353, "learning_rate": 2.146261543044385e-05, "loss": 0.8698, "num_input_tokens_seen": 3342784, "step": 5765 }, { "epoch": 0.859398272266905, "grad_norm": 0.9286507964134216, "learning_rate": 2.148123324396783e-05, "loss": 0.7748, "num_input_tokens_seen": 3345504, "step": 5770 }, { "epoch": 0.8601429848078642, "grad_norm": 0.8562994003295898, "learning_rate": 2.149985105749181e-05, "loss": 0.8415, "num_input_tokens_seen": 3348416, "step": 5775 }, { "epoch": 0.8608876973488233, "grad_norm": 0.641556978225708, "learning_rate": 2.151846887101579e-05, "loss": 0.7714, "num_input_tokens_seen": 3351296, "step": 5780 }, { "epoch": 0.8616324098897825, "grad_norm": 0.9706088304519653, "learning_rate": 2.153708668453977e-05, "loss": 0.768, "num_input_tokens_seen": 3354368, "step": 5785 }, { "epoch": 0.8623771224307417, "grad_norm": 1.2196253538131714, "learning_rate": 2.1555704498063747e-05, "loss": 0.6915, "num_input_tokens_seen": 3357120, "step": 5790 }, { "epoch": 0.8631218349717009, "grad_norm": 0.9127417206764221, "learning_rate": 2.157432231158773e-05, "loss": 0.7123, "num_input_tokens_seen": 3360192, "step": 5795 }, { "epoch": 0.8638665475126601, "grad_norm": 0.8127618432044983, "learning_rate": 2.1592940125111707e-05, "loss": 0.6362, "num_input_tokens_seen": 3362944, "step": 5800 }, { "epoch": 0.8646112600536193, "grad_norm": 0.8341195583343506, "learning_rate": 2.161155793863569e-05, "loss": 0.7321, "num_input_tokens_seen": 3365920, "step": 5805 }, { "epoch": 0.8653559725945785, "grad_norm": 1.3230547904968262, "learning_rate": 2.1630175752159667e-05, "loss": 0.7737, "num_input_tokens_seen": 3368832, "step": 5810 }, { "epoch": 0.8661006851355377, "grad_norm": 0.7498741149902344, "learning_rate": 2.164879356568365e-05, "loss": 0.751, "num_input_tokens_seen": 3371808, "step": 5815 }, { "epoch": 0.8668453976764968, "grad_norm": 1.8572065830230713, "learning_rate": 2.1667411379207627e-05, "loss": 0.7108, "num_input_tokens_seen": 3374976, "step": 5820 }, { "epoch": 0.867590110217456, "grad_norm": 1.0012519359588623, "learning_rate": 2.1686029192731608e-05, "loss": 0.7596, "num_input_tokens_seen": 3378112, "step": 5825 }, { "epoch": 0.8683348227584152, "grad_norm": 0.5301274657249451, "learning_rate": 2.1704647006255586e-05, "loss": 0.8288, "num_input_tokens_seen": 3381024, "step": 5830 }, { "epoch": 0.8690795352993744, "grad_norm": 0.8800262808799744, "learning_rate": 2.1723264819779565e-05, "loss": 0.6289, "num_input_tokens_seen": 3383808, "step": 5835 }, { "epoch": 0.8698242478403336, "grad_norm": 0.7950550317764282, "learning_rate": 2.1741882633303546e-05, "loss": 0.7407, "num_input_tokens_seen": 3386848, "step": 5840 }, { "epoch": 0.8705689603812928, "grad_norm": 0.8072235584259033, "learning_rate": 2.1760500446827525e-05, "loss": 0.6661, "num_input_tokens_seen": 3389760, "step": 5845 }, { "epoch": 0.871313672922252, "grad_norm": 1.0110154151916504, "learning_rate": 2.1779118260351506e-05, "loss": 0.6146, "num_input_tokens_seen": 3392704, "step": 5850 }, { "epoch": 0.8720583854632112, "grad_norm": 0.8792695999145508, "learning_rate": 2.1797736073875484e-05, "loss": 0.6536, "num_input_tokens_seen": 3395744, "step": 5855 }, { "epoch": 0.8728030980041704, "grad_norm": 0.750478208065033, "learning_rate": 2.1816353887399466e-05, "loss": 0.6233, "num_input_tokens_seen": 3398592, "step": 5860 }, { "epoch": 0.8735478105451295, "grad_norm": 2.0372886657714844, "learning_rate": 2.1834971700923444e-05, "loss": 0.7502, "num_input_tokens_seen": 3401216, "step": 5865 }, { "epoch": 0.8742925230860887, "grad_norm": 0.9263136982917786, "learning_rate": 2.1853589514447426e-05, "loss": 0.6548, "num_input_tokens_seen": 3404256, "step": 5870 }, { "epoch": 0.8750372356270479, "grad_norm": 0.7487751841545105, "learning_rate": 2.1872207327971404e-05, "loss": 0.7465, "num_input_tokens_seen": 3407232, "step": 5875 }, { "epoch": 0.8757819481680071, "grad_norm": 1.6730602979660034, "learning_rate": 2.1890825141495382e-05, "loss": 0.9968, "num_input_tokens_seen": 3409984, "step": 5880 }, { "epoch": 0.8765266607089663, "grad_norm": 0.9902485013008118, "learning_rate": 2.1909442955019364e-05, "loss": 0.669, "num_input_tokens_seen": 3412768, "step": 5885 }, { "epoch": 0.8772713732499255, "grad_norm": 1.67047119140625, "learning_rate": 2.1928060768543342e-05, "loss": 0.81, "num_input_tokens_seen": 3415456, "step": 5890 }, { "epoch": 0.8780160857908847, "grad_norm": 1.019727349281311, "learning_rate": 2.1946678582067324e-05, "loss": 0.8094, "num_input_tokens_seen": 3418368, "step": 5895 }, { "epoch": 0.878760798331844, "grad_norm": 0.9959003329277039, "learning_rate": 2.1965296395591302e-05, "loss": 0.7429, "num_input_tokens_seen": 3421248, "step": 5900 }, { "epoch": 0.8795055108728032, "grad_norm": 0.8914616107940674, "learning_rate": 2.1983914209115284e-05, "loss": 0.5955, "num_input_tokens_seen": 3424544, "step": 5905 }, { "epoch": 0.8802502234137622, "grad_norm": 0.964705228805542, "learning_rate": 2.2002532022639262e-05, "loss": 0.7917, "num_input_tokens_seen": 3427584, "step": 5910 }, { "epoch": 0.8809949359547214, "grad_norm": 0.7959586381912231, "learning_rate": 2.202114983616324e-05, "loss": 0.7135, "num_input_tokens_seen": 3430400, "step": 5915 }, { "epoch": 0.8817396484956807, "grad_norm": 1.635144591331482, "learning_rate": 2.2039767649687222e-05, "loss": 0.6856, "num_input_tokens_seen": 3433248, "step": 5920 }, { "epoch": 0.8824843610366399, "grad_norm": 1.0157220363616943, "learning_rate": 2.20583854632112e-05, "loss": 0.7713, "num_input_tokens_seen": 3436032, "step": 5925 }, { "epoch": 0.883229073577599, "grad_norm": 3.0441551208496094, "learning_rate": 2.207700327673518e-05, "loss": 0.8694, "num_input_tokens_seen": 3438912, "step": 5930 }, { "epoch": 0.8839737861185583, "grad_norm": 0.9680255055427551, "learning_rate": 2.209562109025916e-05, "loss": 0.7942, "num_input_tokens_seen": 3441760, "step": 5935 }, { "epoch": 0.8847184986595175, "grad_norm": 1.0748474597930908, "learning_rate": 2.211423890378314e-05, "loss": 0.724, "num_input_tokens_seen": 3444448, "step": 5940 }, { "epoch": 0.8854632112004767, "grad_norm": 0.9057283997535706, "learning_rate": 2.213285671730712e-05, "loss": 0.6949, "num_input_tokens_seen": 3447104, "step": 5945 }, { "epoch": 0.8862079237414358, "grad_norm": 0.8827692866325378, "learning_rate": 2.21514745308311e-05, "loss": 0.6386, "num_input_tokens_seen": 3449856, "step": 5950 }, { "epoch": 0.886952636282395, "grad_norm": 0.8437610268592834, "learning_rate": 2.2170092344355083e-05, "loss": 0.6463, "num_input_tokens_seen": 3452544, "step": 5955 }, { "epoch": 0.8876973488233542, "grad_norm": 1.2365542650222778, "learning_rate": 2.2188710157879058e-05, "loss": 0.8082, "num_input_tokens_seen": 3455136, "step": 5960 }, { "epoch": 0.8884420613643134, "grad_norm": 1.364600419998169, "learning_rate": 2.220732797140304e-05, "loss": 0.7328, "num_input_tokens_seen": 3458272, "step": 5965 }, { "epoch": 0.8891867739052726, "grad_norm": 0.8340849280357361, "learning_rate": 2.2225945784927018e-05, "loss": 0.6606, "num_input_tokens_seen": 3461504, "step": 5970 }, { "epoch": 0.8899314864462318, "grad_norm": 0.6834797263145447, "learning_rate": 2.2244563598451e-05, "loss": 0.7626, "num_input_tokens_seen": 3464448, "step": 5975 }, { "epoch": 0.890676198987191, "grad_norm": 0.917208731174469, "learning_rate": 2.2263181411974977e-05, "loss": 0.766, "num_input_tokens_seen": 3467264, "step": 5980 }, { "epoch": 0.8914209115281502, "grad_norm": 0.8235246539115906, "learning_rate": 2.228179922549896e-05, "loss": 0.7477, "num_input_tokens_seen": 3470080, "step": 5985 }, { "epoch": 0.8921656240691094, "grad_norm": 0.809131920337677, "learning_rate": 2.2300417039022937e-05, "loss": 0.7967, "num_input_tokens_seen": 3473088, "step": 5990 }, { "epoch": 0.8929103366100685, "grad_norm": 0.8623836636543274, "learning_rate": 2.231903485254692e-05, "loss": 0.6454, "num_input_tokens_seen": 3476032, "step": 5995 }, { "epoch": 0.8936550491510277, "grad_norm": 1.4778982400894165, "learning_rate": 2.23376526660709e-05, "loss": 0.7917, "num_input_tokens_seen": 3478560, "step": 6000 }, { "epoch": 0.8943997616919869, "grad_norm": 1.3276277780532837, "learning_rate": 2.2356270479594875e-05, "loss": 0.7707, "num_input_tokens_seen": 3482048, "step": 6005 }, { "epoch": 0.8951444742329461, "grad_norm": 1.93425452709198, "learning_rate": 2.2374888293118857e-05, "loss": 0.8664, "num_input_tokens_seen": 3485184, "step": 6010 }, { "epoch": 0.8958891867739053, "grad_norm": 1.0437672138214111, "learning_rate": 2.2393506106642835e-05, "loss": 0.7823, "num_input_tokens_seen": 3488000, "step": 6015 }, { "epoch": 0.8966338993148645, "grad_norm": 1.5505383014678955, "learning_rate": 2.2412123920166817e-05, "loss": 0.7663, "num_input_tokens_seen": 3490816, "step": 6020 }, { "epoch": 0.8973786118558237, "grad_norm": 1.6969045400619507, "learning_rate": 2.2430741733690795e-05, "loss": 0.7432, "num_input_tokens_seen": 3493728, "step": 6025 }, { "epoch": 0.8981233243967829, "grad_norm": 0.8594477772712708, "learning_rate": 2.2449359547214777e-05, "loss": 0.7179, "num_input_tokens_seen": 3496384, "step": 6030 }, { "epoch": 0.898868036937742, "grad_norm": 1.0445504188537598, "learning_rate": 2.246797736073876e-05, "loss": 0.7663, "num_input_tokens_seen": 3499104, "step": 6035 }, { "epoch": 0.8996127494787012, "grad_norm": 0.7922555804252625, "learning_rate": 2.2486595174262737e-05, "loss": 0.7389, "num_input_tokens_seen": 3501888, "step": 6040 }, { "epoch": 0.9003574620196604, "grad_norm": 0.7965902090072632, "learning_rate": 2.2505212987786715e-05, "loss": 0.7676, "num_input_tokens_seen": 3504608, "step": 6045 }, { "epoch": 0.9011021745606196, "grad_norm": 0.8833857774734497, "learning_rate": 2.2523830801310693e-05, "loss": 0.6698, "num_input_tokens_seen": 3507232, "step": 6050 }, { "epoch": 0.9018468871015788, "grad_norm": 0.8597462773323059, "learning_rate": 2.2542448614834675e-05, "loss": 0.732, "num_input_tokens_seen": 3510080, "step": 6055 }, { "epoch": 0.902591599642538, "grad_norm": 1.0815290212631226, "learning_rate": 2.2561066428358653e-05, "loss": 0.6273, "num_input_tokens_seen": 3513280, "step": 6060 }, { "epoch": 0.9033363121834972, "grad_norm": 0.875655472278595, "learning_rate": 2.2579684241882634e-05, "loss": 0.8199, "num_input_tokens_seen": 3516128, "step": 6065 }, { "epoch": 0.9040810247244564, "grad_norm": 0.9319368004798889, "learning_rate": 2.2598302055406613e-05, "loss": 0.5958, "num_input_tokens_seen": 3518944, "step": 6070 }, { "epoch": 0.9048257372654156, "grad_norm": 0.8523111343383789, "learning_rate": 2.2616919868930594e-05, "loss": 0.6546, "num_input_tokens_seen": 3521824, "step": 6075 }, { "epoch": 0.9055704498063747, "grad_norm": 0.7866729497909546, "learning_rate": 2.2635537682454576e-05, "loss": 0.7276, "num_input_tokens_seen": 3524640, "step": 6080 }, { "epoch": 0.9063151623473339, "grad_norm": 1.1345893144607544, "learning_rate": 2.2654155495978554e-05, "loss": 0.7017, "num_input_tokens_seen": 3527648, "step": 6085 }, { "epoch": 0.9070598748882931, "grad_norm": 0.8114946484565735, "learning_rate": 2.2672773309502532e-05, "loss": 0.6946, "num_input_tokens_seen": 3530720, "step": 6090 }, { "epoch": 0.9078045874292523, "grad_norm": 0.9556549787521362, "learning_rate": 2.269139112302651e-05, "loss": 0.6939, "num_input_tokens_seen": 3533600, "step": 6095 }, { "epoch": 0.9085492999702115, "grad_norm": 0.9648672938346863, "learning_rate": 2.2710008936550492e-05, "loss": 0.6711, "num_input_tokens_seen": 3536320, "step": 6100 }, { "epoch": 0.9092940125111707, "grad_norm": 0.8925691843032837, "learning_rate": 2.272862675007447e-05, "loss": 0.6126, "num_input_tokens_seen": 3539104, "step": 6105 }, { "epoch": 0.9100387250521299, "grad_norm": 0.9837997555732727, "learning_rate": 2.2747244563598452e-05, "loss": 0.7758, "num_input_tokens_seen": 3542272, "step": 6110 }, { "epoch": 0.9107834375930891, "grad_norm": 1.4750312566757202, "learning_rate": 2.2765862377122434e-05, "loss": 0.9238, "num_input_tokens_seen": 3545120, "step": 6115 }, { "epoch": 0.9115281501340483, "grad_norm": 0.9970677495002747, "learning_rate": 2.2784480190646412e-05, "loss": 0.6559, "num_input_tokens_seen": 3548128, "step": 6120 }, { "epoch": 0.9122728626750074, "grad_norm": 0.9605469107627869, "learning_rate": 2.2803098004170394e-05, "loss": 0.7697, "num_input_tokens_seen": 3551712, "step": 6125 }, { "epoch": 0.9130175752159666, "grad_norm": 0.8264591693878174, "learning_rate": 2.2821715817694372e-05, "loss": 0.7875, "num_input_tokens_seen": 3554784, "step": 6130 }, { "epoch": 0.9137622877569258, "grad_norm": 0.6345618963241577, "learning_rate": 2.284033363121835e-05, "loss": 0.76, "num_input_tokens_seen": 3557888, "step": 6135 }, { "epoch": 0.914507000297885, "grad_norm": 0.8240998387336731, "learning_rate": 2.2858951444742328e-05, "loss": 0.7788, "num_input_tokens_seen": 3561248, "step": 6140 }, { "epoch": 0.9152517128388442, "grad_norm": 0.8250702023506165, "learning_rate": 2.287756925826631e-05, "loss": 0.8625, "num_input_tokens_seen": 3564256, "step": 6145 }, { "epoch": 0.9159964253798034, "grad_norm": 0.7253707647323608, "learning_rate": 2.289618707179029e-05, "loss": 0.7382, "num_input_tokens_seen": 3567424, "step": 6150 }, { "epoch": 0.9167411379207626, "grad_norm": 0.8899668455123901, "learning_rate": 2.291480488531427e-05, "loss": 0.736, "num_input_tokens_seen": 3570432, "step": 6155 }, { "epoch": 0.9174858504617218, "grad_norm": 1.5159481763839722, "learning_rate": 2.293342269883825e-05, "loss": 0.7849, "num_input_tokens_seen": 3573216, "step": 6160 }, { "epoch": 0.9182305630026809, "grad_norm": 0.46056464314460754, "learning_rate": 2.295204051236223e-05, "loss": 0.8447, "num_input_tokens_seen": 3575872, "step": 6165 }, { "epoch": 0.9189752755436401, "grad_norm": 0.9166273474693298, "learning_rate": 2.297065832588621e-05, "loss": 0.8035, "num_input_tokens_seen": 3578848, "step": 6170 }, { "epoch": 0.9197199880845993, "grad_norm": 2.1584792137145996, "learning_rate": 2.2989276139410186e-05, "loss": 0.7857, "num_input_tokens_seen": 3581952, "step": 6175 }, { "epoch": 0.9204647006255585, "grad_norm": 1.4362560510635376, "learning_rate": 2.3007893952934168e-05, "loss": 0.6295, "num_input_tokens_seen": 3584960, "step": 6180 }, { "epoch": 0.9212094131665177, "grad_norm": 0.8008808493614197, "learning_rate": 2.3026511766458146e-05, "loss": 0.6543, "num_input_tokens_seen": 3588192, "step": 6185 }, { "epoch": 0.9219541257074769, "grad_norm": 0.7357810139656067, "learning_rate": 2.3045129579982128e-05, "loss": 0.7865, "num_input_tokens_seen": 3591072, "step": 6190 }, { "epoch": 0.9226988382484361, "grad_norm": 0.730498194694519, "learning_rate": 2.306374739350611e-05, "loss": 0.7911, "num_input_tokens_seen": 3594048, "step": 6195 }, { "epoch": 0.9234435507893953, "grad_norm": 0.864989697933197, "learning_rate": 2.3082365207030087e-05, "loss": 0.811, "num_input_tokens_seen": 3596960, "step": 6200 }, { "epoch": 0.9241882633303545, "grad_norm": 0.8883414268493652, "learning_rate": 2.310098302055407e-05, "loss": 0.7806, "num_input_tokens_seen": 3600224, "step": 6205 }, { "epoch": 0.9249329758713136, "grad_norm": 0.7695617079734802, "learning_rate": 2.3119600834078047e-05, "loss": 0.7544, "num_input_tokens_seen": 3603360, "step": 6210 }, { "epoch": 0.9256776884122728, "grad_norm": 1.2710700035095215, "learning_rate": 2.313821864760203e-05, "loss": 0.7036, "num_input_tokens_seen": 3606144, "step": 6215 }, { "epoch": 0.926422400953232, "grad_norm": 1.1462000608444214, "learning_rate": 2.3156836461126004e-05, "loss": 0.723, "num_input_tokens_seen": 3609280, "step": 6220 }, { "epoch": 0.9271671134941912, "grad_norm": 0.9851747751235962, "learning_rate": 2.3175454274649985e-05, "loss": 0.7415, "num_input_tokens_seen": 3612320, "step": 6225 }, { "epoch": 0.9279118260351504, "grad_norm": 0.8484352827072144, "learning_rate": 2.3194072088173967e-05, "loss": 0.7849, "num_input_tokens_seen": 3615296, "step": 6230 }, { "epoch": 0.9286565385761096, "grad_norm": 4.026581287384033, "learning_rate": 2.3212689901697945e-05, "loss": 0.7665, "num_input_tokens_seen": 3618336, "step": 6235 }, { "epoch": 0.9294012511170688, "grad_norm": 0.8653831481933594, "learning_rate": 2.3231307715221927e-05, "loss": 0.8178, "num_input_tokens_seen": 3621280, "step": 6240 }, { "epoch": 0.930145963658028, "grad_norm": 0.9783188104629517, "learning_rate": 2.3249925528745905e-05, "loss": 0.7405, "num_input_tokens_seen": 3624352, "step": 6245 }, { "epoch": 0.9308906761989872, "grad_norm": 1.0310806035995483, "learning_rate": 2.3268543342269887e-05, "loss": 0.6619, "num_input_tokens_seen": 3627488, "step": 6250 }, { "epoch": 0.9316353887399463, "grad_norm": 0.9444814324378967, "learning_rate": 2.3287161155793865e-05, "loss": 0.821, "num_input_tokens_seen": 3630400, "step": 6255 }, { "epoch": 0.9323801012809055, "grad_norm": 0.8044673800468445, "learning_rate": 2.3305778969317847e-05, "loss": 0.7243, "num_input_tokens_seen": 3632928, "step": 6260 }, { "epoch": 0.9331248138218647, "grad_norm": 0.785372257232666, "learning_rate": 2.332439678284182e-05, "loss": 0.6787, "num_input_tokens_seen": 3635840, "step": 6265 }, { "epoch": 0.9338695263628239, "grad_norm": 0.6569969654083252, "learning_rate": 2.3343014596365803e-05, "loss": 0.6669, "num_input_tokens_seen": 3638656, "step": 6270 }, { "epoch": 0.9346142389037831, "grad_norm": 1.1015491485595703, "learning_rate": 2.3361632409889785e-05, "loss": 0.7295, "num_input_tokens_seen": 3641472, "step": 6275 }, { "epoch": 0.9353589514447423, "grad_norm": 0.7537623643875122, "learning_rate": 2.3380250223413763e-05, "loss": 0.6794, "num_input_tokens_seen": 3644192, "step": 6280 }, { "epoch": 0.9361036639857016, "grad_norm": 0.6698867678642273, "learning_rate": 2.3398868036937744e-05, "loss": 0.699, "num_input_tokens_seen": 3646976, "step": 6285 }, { "epoch": 0.9368483765266608, "grad_norm": 0.9461511969566345, "learning_rate": 2.3417485850461723e-05, "loss": 0.5308, "num_input_tokens_seen": 3649856, "step": 6290 }, { "epoch": 0.9375930890676198, "grad_norm": 0.726538896560669, "learning_rate": 2.3436103663985704e-05, "loss": 0.8218, "num_input_tokens_seen": 3652736, "step": 6295 }, { "epoch": 0.938337801608579, "grad_norm": 1.0108641386032104, "learning_rate": 2.3454721477509683e-05, "loss": 0.6949, "num_input_tokens_seen": 3655648, "step": 6300 }, { "epoch": 0.9390825141495382, "grad_norm": 1.8820862770080566, "learning_rate": 2.347333929103366e-05, "loss": 0.7078, "num_input_tokens_seen": 3658432, "step": 6305 }, { "epoch": 0.9398272266904975, "grad_norm": 1.029455542564392, "learning_rate": 2.3491957104557642e-05, "loss": 0.7196, "num_input_tokens_seen": 3661280, "step": 6310 }, { "epoch": 0.9405719392314567, "grad_norm": 0.8073568940162659, "learning_rate": 2.351057491808162e-05, "loss": 0.8428, "num_input_tokens_seen": 3664576, "step": 6315 }, { "epoch": 0.9413166517724159, "grad_norm": 1.0416557788848877, "learning_rate": 2.3529192731605602e-05, "loss": 0.8625, "num_input_tokens_seen": 3667264, "step": 6320 }, { "epoch": 0.9420613643133751, "grad_norm": 1.0542337894439697, "learning_rate": 2.354781054512958e-05, "loss": 0.6901, "num_input_tokens_seen": 3670144, "step": 6325 }, { "epoch": 0.9428060768543343, "grad_norm": 1.2451198101043701, "learning_rate": 2.3566428358653562e-05, "loss": 0.7888, "num_input_tokens_seen": 3672992, "step": 6330 }, { "epoch": 0.9435507893952935, "grad_norm": 0.5120468735694885, "learning_rate": 2.358504617217754e-05, "loss": 0.6549, "num_input_tokens_seen": 3675840, "step": 6335 }, { "epoch": 0.9442955019362526, "grad_norm": 1.5952061414718628, "learning_rate": 2.3603663985701522e-05, "loss": 0.7426, "num_input_tokens_seen": 3678944, "step": 6340 }, { "epoch": 0.9450402144772118, "grad_norm": 0.915005624294281, "learning_rate": 2.36222817992255e-05, "loss": 0.6596, "num_input_tokens_seen": 3681728, "step": 6345 }, { "epoch": 0.945784927018171, "grad_norm": 0.8962762951850891, "learning_rate": 2.364089961274948e-05, "loss": 0.7504, "num_input_tokens_seen": 3684672, "step": 6350 }, { "epoch": 0.9465296395591302, "grad_norm": 1.0362931489944458, "learning_rate": 2.365951742627346e-05, "loss": 0.7378, "num_input_tokens_seen": 3687296, "step": 6355 }, { "epoch": 0.9472743521000894, "grad_norm": 0.8897198438644409, "learning_rate": 2.3678135239797438e-05, "loss": 0.672, "num_input_tokens_seen": 3689856, "step": 6360 }, { "epoch": 0.9480190646410486, "grad_norm": 0.7585018277168274, "learning_rate": 2.369675305332142e-05, "loss": 0.7126, "num_input_tokens_seen": 3692704, "step": 6365 }, { "epoch": 0.9487637771820078, "grad_norm": 0.9835681319236755, "learning_rate": 2.3715370866845398e-05, "loss": 0.6897, "num_input_tokens_seen": 3695424, "step": 6370 }, { "epoch": 0.949508489722967, "grad_norm": 0.9765822887420654, "learning_rate": 2.373398868036938e-05, "loss": 0.8076, "num_input_tokens_seen": 3698592, "step": 6375 }, { "epoch": 0.9502532022639262, "grad_norm": 0.8306041359901428, "learning_rate": 2.3752606493893358e-05, "loss": 0.6996, "num_input_tokens_seen": 3701568, "step": 6380 }, { "epoch": 0.9509979148048853, "grad_norm": 1.0160813331604004, "learning_rate": 2.377122430741734e-05, "loss": 0.6593, "num_input_tokens_seen": 3704672, "step": 6385 }, { "epoch": 0.9517426273458445, "grad_norm": 1.444205641746521, "learning_rate": 2.3789842120941318e-05, "loss": 0.8094, "num_input_tokens_seen": 3707488, "step": 6390 }, { "epoch": 0.9524873398868037, "grad_norm": 1.2925093173980713, "learning_rate": 2.3808459934465296e-05, "loss": 0.7923, "num_input_tokens_seen": 3710528, "step": 6395 }, { "epoch": 0.9532320524277629, "grad_norm": 0.8095593452453613, "learning_rate": 2.3827077747989278e-05, "loss": 0.8139, "num_input_tokens_seen": 3713664, "step": 6400 }, { "epoch": 0.9539767649687221, "grad_norm": 1.1490336656570435, "learning_rate": 2.3845695561513256e-05, "loss": 0.7612, "num_input_tokens_seen": 3716704, "step": 6405 }, { "epoch": 0.9547214775096813, "grad_norm": 1.0362900495529175, "learning_rate": 2.3864313375037238e-05, "loss": 0.725, "num_input_tokens_seen": 3719456, "step": 6410 }, { "epoch": 0.9554661900506405, "grad_norm": 0.6269125938415527, "learning_rate": 2.3882931188561216e-05, "loss": 0.6968, "num_input_tokens_seen": 3722400, "step": 6415 }, { "epoch": 0.9562109025915997, "grad_norm": 0.7129384279251099, "learning_rate": 2.3901549002085197e-05, "loss": 0.7042, "num_input_tokens_seen": 3725024, "step": 6420 }, { "epoch": 0.9569556151325588, "grad_norm": 0.7557113766670227, "learning_rate": 2.3920166815609176e-05, "loss": 0.8877, "num_input_tokens_seen": 3727808, "step": 6425 }, { "epoch": 0.957700327673518, "grad_norm": 1.8137881755828857, "learning_rate": 2.3938784629133157e-05, "loss": 0.8189, "num_input_tokens_seen": 3730432, "step": 6430 }, { "epoch": 0.9584450402144772, "grad_norm": 1.1162168979644775, "learning_rate": 2.3957402442657135e-05, "loss": 0.7159, "num_input_tokens_seen": 3733088, "step": 6435 }, { "epoch": 0.9591897527554364, "grad_norm": 1.218261480331421, "learning_rate": 2.3976020256181114e-05, "loss": 0.7818, "num_input_tokens_seen": 3735840, "step": 6440 }, { "epoch": 0.9599344652963956, "grad_norm": 0.8277303576469421, "learning_rate": 2.3994638069705095e-05, "loss": 0.699, "num_input_tokens_seen": 3738464, "step": 6445 }, { "epoch": 0.9606791778373548, "grad_norm": 0.6961784362792969, "learning_rate": 2.4013255883229074e-05, "loss": 0.7867, "num_input_tokens_seen": 3741280, "step": 6450 }, { "epoch": 0.961423890378314, "grad_norm": 0.7305643558502197, "learning_rate": 2.4031873696753055e-05, "loss": 0.8166, "num_input_tokens_seen": 3744256, "step": 6455 }, { "epoch": 0.9621686029192732, "grad_norm": 0.624472439289093, "learning_rate": 2.4050491510277033e-05, "loss": 0.5931, "num_input_tokens_seen": 3747456, "step": 6460 }, { "epoch": 0.9629133154602324, "grad_norm": 0.802893340587616, "learning_rate": 2.4069109323801015e-05, "loss": 0.7025, "num_input_tokens_seen": 3750240, "step": 6465 }, { "epoch": 0.9636580280011915, "grad_norm": 1.5450060367584229, "learning_rate": 2.4087727137324993e-05, "loss": 0.737, "num_input_tokens_seen": 3752960, "step": 6470 }, { "epoch": 0.9644027405421507, "grad_norm": 1.0729165077209473, "learning_rate": 2.4106344950848975e-05, "loss": 0.6873, "num_input_tokens_seen": 3755840, "step": 6475 }, { "epoch": 0.9651474530831099, "grad_norm": 1.2245051860809326, "learning_rate": 2.4124962764372953e-05, "loss": 0.6137, "num_input_tokens_seen": 3758560, "step": 6480 }, { "epoch": 0.9658921656240691, "grad_norm": 0.9495630264282227, "learning_rate": 2.414358057789693e-05, "loss": 0.8322, "num_input_tokens_seen": 3761376, "step": 6485 }, { "epoch": 0.9666368781650283, "grad_norm": 0.9825810790061951, "learning_rate": 2.4162198391420913e-05, "loss": 0.7802, "num_input_tokens_seen": 3764128, "step": 6490 }, { "epoch": 0.9673815907059875, "grad_norm": 0.700385570526123, "learning_rate": 2.418081620494489e-05, "loss": 0.7745, "num_input_tokens_seen": 3768128, "step": 6495 }, { "epoch": 0.9681263032469467, "grad_norm": 0.8068217635154724, "learning_rate": 2.4199434018468873e-05, "loss": 0.8009, "num_input_tokens_seen": 3770976, "step": 6500 }, { "epoch": 0.9688710157879059, "grad_norm": 1.6972684860229492, "learning_rate": 2.421805183199285e-05, "loss": 0.7623, "num_input_tokens_seen": 3773888, "step": 6505 }, { "epoch": 0.9696157283288651, "grad_norm": 0.9122498035430908, "learning_rate": 2.4236669645516833e-05, "loss": 0.802, "num_input_tokens_seen": 3777024, "step": 6510 }, { "epoch": 0.9703604408698242, "grad_norm": 0.8593528270721436, "learning_rate": 2.425528745904081e-05, "loss": 0.6555, "num_input_tokens_seen": 3779872, "step": 6515 }, { "epoch": 0.9711051534107834, "grad_norm": 0.7681351900100708, "learning_rate": 2.4273905272564793e-05, "loss": 0.7349, "num_input_tokens_seen": 3782656, "step": 6520 }, { "epoch": 0.9718498659517426, "grad_norm": 0.8670471906661987, "learning_rate": 2.429252308608877e-05, "loss": 0.7286, "num_input_tokens_seen": 3785504, "step": 6525 }, { "epoch": 0.9725945784927018, "grad_norm": 1.1689728498458862, "learning_rate": 2.431114089961275e-05, "loss": 0.7024, "num_input_tokens_seen": 3788160, "step": 6530 }, { "epoch": 0.973339291033661, "grad_norm": 1.340386152267456, "learning_rate": 2.432975871313673e-05, "loss": 0.635, "num_input_tokens_seen": 3791424, "step": 6535 }, { "epoch": 0.9740840035746202, "grad_norm": 0.6103370785713196, "learning_rate": 2.434837652666071e-05, "loss": 0.7621, "num_input_tokens_seen": 3794400, "step": 6540 }, { "epoch": 0.9748287161155794, "grad_norm": 0.7642139196395874, "learning_rate": 2.436699434018469e-05, "loss": 0.7644, "num_input_tokens_seen": 3797408, "step": 6545 }, { "epoch": 0.9755734286565386, "grad_norm": 0.7920448184013367, "learning_rate": 2.438561215370867e-05, "loss": 0.7484, "num_input_tokens_seen": 3800352, "step": 6550 }, { "epoch": 0.9763181411974977, "grad_norm": 0.6574881076812744, "learning_rate": 2.440422996723265e-05, "loss": 0.738, "num_input_tokens_seen": 3803328, "step": 6555 }, { "epoch": 0.9770628537384569, "grad_norm": 0.7829082608222961, "learning_rate": 2.4422847780756632e-05, "loss": 0.7503, "num_input_tokens_seen": 3806176, "step": 6560 }, { "epoch": 0.9778075662794161, "grad_norm": 0.9264868497848511, "learning_rate": 2.4441465594280607e-05, "loss": 0.8037, "num_input_tokens_seen": 3809088, "step": 6565 }, { "epoch": 0.9785522788203753, "grad_norm": 0.688225269317627, "learning_rate": 2.446008340780459e-05, "loss": 0.701, "num_input_tokens_seen": 3811680, "step": 6570 }, { "epoch": 0.9792969913613345, "grad_norm": 1.0081348419189453, "learning_rate": 2.4478701221328567e-05, "loss": 0.7547, "num_input_tokens_seen": 3814592, "step": 6575 }, { "epoch": 0.9800417039022937, "grad_norm": 1.2804707288742065, "learning_rate": 2.4497319034852548e-05, "loss": 0.8197, "num_input_tokens_seen": 3817728, "step": 6580 }, { "epoch": 0.9807864164432529, "grad_norm": 0.626983106136322, "learning_rate": 2.4515936848376527e-05, "loss": 0.6836, "num_input_tokens_seen": 3820608, "step": 6585 }, { "epoch": 0.9815311289842121, "grad_norm": 0.874258279800415, "learning_rate": 2.4534554661900508e-05, "loss": 0.8707, "num_input_tokens_seen": 3823648, "step": 6590 }, { "epoch": 0.9822758415251713, "grad_norm": 0.7427605390548706, "learning_rate": 2.4553172475424486e-05, "loss": 0.7793, "num_input_tokens_seen": 3826624, "step": 6595 }, { "epoch": 0.9830205540661304, "grad_norm": 0.7036868929862976, "learning_rate": 2.4571790288948468e-05, "loss": 0.6429, "num_input_tokens_seen": 3829632, "step": 6600 }, { "epoch": 0.9837652666070896, "grad_norm": 1.4482345581054688, "learning_rate": 2.459040810247245e-05, "loss": 0.7863, "num_input_tokens_seen": 3832384, "step": 6605 }, { "epoch": 0.9845099791480488, "grad_norm": 0.9768532514572144, "learning_rate": 2.4609025915996424e-05, "loss": 0.8271, "num_input_tokens_seen": 3835488, "step": 6610 }, { "epoch": 0.985254691689008, "grad_norm": 0.8895449042320251, "learning_rate": 2.4627643729520406e-05, "loss": 0.6888, "num_input_tokens_seen": 3838560, "step": 6615 }, { "epoch": 0.9859994042299672, "grad_norm": 1.0895912647247314, "learning_rate": 2.4646261543044384e-05, "loss": 0.7681, "num_input_tokens_seen": 3841376, "step": 6620 }, { "epoch": 0.9867441167709264, "grad_norm": 1.199975848197937, "learning_rate": 2.4664879356568366e-05, "loss": 0.688, "num_input_tokens_seen": 3844192, "step": 6625 }, { "epoch": 0.9874888293118856, "grad_norm": 0.5773093104362488, "learning_rate": 2.4683497170092344e-05, "loss": 0.6378, "num_input_tokens_seen": 3847424, "step": 6630 }, { "epoch": 0.9882335418528448, "grad_norm": 1.2102851867675781, "learning_rate": 2.4702114983616326e-05, "loss": 0.6246, "num_input_tokens_seen": 3850048, "step": 6635 }, { "epoch": 0.988978254393804, "grad_norm": 0.900021493434906, "learning_rate": 2.4720732797140307e-05, "loss": 0.6194, "num_input_tokens_seen": 3852736, "step": 6640 }, { "epoch": 0.9897229669347631, "grad_norm": 0.8826397657394409, "learning_rate": 2.4739350610664286e-05, "loss": 0.6739, "num_input_tokens_seen": 3855456, "step": 6645 }, { "epoch": 0.9904676794757223, "grad_norm": 0.9970483779907227, "learning_rate": 2.4757968424188267e-05, "loss": 0.6595, "num_input_tokens_seen": 3858080, "step": 6650 }, { "epoch": 0.9912123920166815, "grad_norm": 0.9039361476898193, "learning_rate": 2.4776586237712242e-05, "loss": 0.862, "num_input_tokens_seen": 3860864, "step": 6655 }, { "epoch": 0.9919571045576407, "grad_norm": 0.7146745324134827, "learning_rate": 2.4795204051236224e-05, "loss": 0.6896, "num_input_tokens_seen": 3863840, "step": 6660 }, { "epoch": 0.9927018170986, "grad_norm": 0.8459193110466003, "learning_rate": 2.4813821864760202e-05, "loss": 0.6874, "num_input_tokens_seen": 3866720, "step": 6665 }, { "epoch": 0.9934465296395592, "grad_norm": 1.056667447090149, "learning_rate": 2.4832439678284184e-05, "loss": 0.6734, "num_input_tokens_seen": 3869504, "step": 6670 }, { "epoch": 0.9941912421805184, "grad_norm": 0.8969321250915527, "learning_rate": 2.4851057491808162e-05, "loss": 0.7263, "num_input_tokens_seen": 3872608, "step": 6675 }, { "epoch": 0.9949359547214776, "grad_norm": 0.9630788564682007, "learning_rate": 2.4869675305332143e-05, "loss": 0.8154, "num_input_tokens_seen": 3875584, "step": 6680 }, { "epoch": 0.9956806672624366, "grad_norm": 0.7393785715103149, "learning_rate": 2.4888293118856125e-05, "loss": 0.7927, "num_input_tokens_seen": 3878336, "step": 6685 }, { "epoch": 0.9964253798033958, "grad_norm": 0.9890385866165161, "learning_rate": 2.4906910932380103e-05, "loss": 0.7501, "num_input_tokens_seen": 3881600, "step": 6690 }, { "epoch": 0.997170092344355, "grad_norm": 1.3065303564071655, "learning_rate": 2.492552874590408e-05, "loss": 0.7801, "num_input_tokens_seen": 3884160, "step": 6695 }, { "epoch": 0.9979148048853143, "grad_norm": 1.0725781917572021, "learning_rate": 2.494414655942806e-05, "loss": 0.7339, "num_input_tokens_seen": 3886944, "step": 6700 }, { "epoch": 0.9986595174262735, "grad_norm": 1.120996117591858, "learning_rate": 2.496276437295204e-05, "loss": 0.7871, "num_input_tokens_seen": 3889856, "step": 6705 }, { "epoch": 0.9994042299672327, "grad_norm": 1.8177130222320557, "learning_rate": 2.498138218647602e-05, "loss": 0.7328, "num_input_tokens_seen": 3892704, "step": 6710 }, { "epoch": 1.0, "eval_loss": 0.7198248505592346, "eval_runtime": 46.9498, "eval_samples_per_second": 63.557, "eval_steps_per_second": 15.889, "num_input_tokens_seen": 3894688, "step": 6714 }, { "epoch": 1.0001489425081918, "grad_norm": 0.844726026058197, "learning_rate": 2.5e-05, "loss": 0.642, "num_input_tokens_seen": 3895200, "step": 6715 }, { "epoch": 1.000893655049151, "grad_norm": 0.7317543625831604, "learning_rate": 2.5018617813523983e-05, "loss": 0.7242, "num_input_tokens_seen": 3897984, "step": 6720 }, { "epoch": 1.0016383675901102, "grad_norm": 0.5564881563186646, "learning_rate": 2.503723562704796e-05, "loss": 0.8026, "num_input_tokens_seen": 3901024, "step": 6725 }, { "epoch": 1.0023830801310694, "grad_norm": 1.0662776231765747, "learning_rate": 2.5055853440571943e-05, "loss": 0.6805, "num_input_tokens_seen": 3904032, "step": 6730 }, { "epoch": 1.0031277926720286, "grad_norm": 0.906423032283783, "learning_rate": 2.507447125409592e-05, "loss": 0.7724, "num_input_tokens_seen": 3906912, "step": 6735 }, { "epoch": 1.0038725052129878, "grad_norm": 1.0549273490905762, "learning_rate": 2.5093089067619903e-05, "loss": 0.688, "num_input_tokens_seen": 3909952, "step": 6740 }, { "epoch": 1.004617217753947, "grad_norm": 1.2983109951019287, "learning_rate": 2.511170688114388e-05, "loss": 0.7209, "num_input_tokens_seen": 3912800, "step": 6745 }, { "epoch": 1.0053619302949062, "grad_norm": 0.6584098935127258, "learning_rate": 2.5130324694667862e-05, "loss": 0.6956, "num_input_tokens_seen": 3915584, "step": 6750 }, { "epoch": 1.0061066428358654, "grad_norm": 0.6388137936592102, "learning_rate": 2.514894250819184e-05, "loss": 0.8195, "num_input_tokens_seen": 3918432, "step": 6755 }, { "epoch": 1.0068513553768246, "grad_norm": 1.1371053457260132, "learning_rate": 2.5167560321715815e-05, "loss": 0.6589, "num_input_tokens_seen": 3921216, "step": 6760 }, { "epoch": 1.0075960679177838, "grad_norm": 0.8811500668525696, "learning_rate": 2.5186178135239797e-05, "loss": 0.6489, "num_input_tokens_seen": 3924256, "step": 6765 }, { "epoch": 1.008340780458743, "grad_norm": 1.3009247779846191, "learning_rate": 2.5204795948763775e-05, "loss": 0.7136, "num_input_tokens_seen": 3926976, "step": 6770 }, { "epoch": 1.0090854929997022, "grad_norm": 1.0917015075683594, "learning_rate": 2.5223413762287757e-05, "loss": 0.6816, "num_input_tokens_seen": 3929792, "step": 6775 }, { "epoch": 1.0098302055406614, "grad_norm": 1.4427670240402222, "learning_rate": 2.5242031575811735e-05, "loss": 0.7573, "num_input_tokens_seen": 3932768, "step": 6780 }, { "epoch": 1.0105749180816206, "grad_norm": 0.9385707974433899, "learning_rate": 2.5260649389335717e-05, "loss": 0.785, "num_input_tokens_seen": 3935456, "step": 6785 }, { "epoch": 1.0113196306225798, "grad_norm": 1.1999932527542114, "learning_rate": 2.5279267202859695e-05, "loss": 0.6263, "num_input_tokens_seen": 3938208, "step": 6790 }, { "epoch": 1.0120643431635388, "grad_norm": 0.7753531336784363, "learning_rate": 2.5297885016383677e-05, "loss": 0.8055, "num_input_tokens_seen": 3941088, "step": 6795 }, { "epoch": 1.012809055704498, "grad_norm": 0.8664447069168091, "learning_rate": 2.5316502829907658e-05, "loss": 0.7318, "num_input_tokens_seen": 3943904, "step": 6800 }, { "epoch": 1.0135537682454572, "grad_norm": 1.1256427764892578, "learning_rate": 2.5335120643431636e-05, "loss": 0.8581, "num_input_tokens_seen": 3946752, "step": 6805 }, { "epoch": 1.0142984807864164, "grad_norm": 0.7162502408027649, "learning_rate": 2.5353738456955618e-05, "loss": 0.6924, "num_input_tokens_seen": 3949632, "step": 6810 }, { "epoch": 1.0150431933273756, "grad_norm": 0.7495654225349426, "learning_rate": 2.5372356270479596e-05, "loss": 0.8258, "num_input_tokens_seen": 3952864, "step": 6815 }, { "epoch": 1.0157879058683348, "grad_norm": 0.9089675545692444, "learning_rate": 2.5390974084003578e-05, "loss": 0.7131, "num_input_tokens_seen": 3956032, "step": 6820 }, { "epoch": 1.016532618409294, "grad_norm": 0.7017732858657837, "learning_rate": 2.5409591897527556e-05, "loss": 0.7434, "num_input_tokens_seen": 3958944, "step": 6825 }, { "epoch": 1.0172773309502532, "grad_norm": 0.8980140089988708, "learning_rate": 2.5428209711051538e-05, "loss": 0.7602, "num_input_tokens_seen": 3961984, "step": 6830 }, { "epoch": 1.0180220434912124, "grad_norm": 1.0021334886550903, "learning_rate": 2.5446827524575516e-05, "loss": 0.7609, "num_input_tokens_seen": 3964768, "step": 6835 }, { "epoch": 1.0187667560321716, "grad_norm": 0.7565438747406006, "learning_rate": 2.5465445338099498e-05, "loss": 0.6797, "num_input_tokens_seen": 3967584, "step": 6840 }, { "epoch": 1.0195114685731308, "grad_norm": 1.2272131443023682, "learning_rate": 2.5484063151623473e-05, "loss": 0.7184, "num_input_tokens_seen": 3970336, "step": 6845 }, { "epoch": 1.02025618111409, "grad_norm": 0.7809627652168274, "learning_rate": 2.550268096514745e-05, "loss": 0.675, "num_input_tokens_seen": 3972896, "step": 6850 }, { "epoch": 1.0210008936550492, "grad_norm": 0.6246674656867981, "learning_rate": 2.5521298778671432e-05, "loss": 0.8019, "num_input_tokens_seen": 3975808, "step": 6855 }, { "epoch": 1.0217456061960084, "grad_norm": 0.6930403709411621, "learning_rate": 2.553991659219541e-05, "loss": 0.7143, "num_input_tokens_seen": 3978688, "step": 6860 }, { "epoch": 1.0224903187369676, "grad_norm": 0.8364614248275757, "learning_rate": 2.5558534405719392e-05, "loss": 0.7031, "num_input_tokens_seen": 3981408, "step": 6865 }, { "epoch": 1.0232350312779268, "grad_norm": 0.9923933744430542, "learning_rate": 2.557715221924337e-05, "loss": 0.7277, "num_input_tokens_seen": 3984384, "step": 6870 }, { "epoch": 1.023979743818886, "grad_norm": 0.7641106843948364, "learning_rate": 2.5595770032767352e-05, "loss": 0.6708, "num_input_tokens_seen": 3987040, "step": 6875 }, { "epoch": 1.024724456359845, "grad_norm": 0.6540545225143433, "learning_rate": 2.5614387846291334e-05, "loss": 0.7264, "num_input_tokens_seen": 3990016, "step": 6880 }, { "epoch": 1.0254691689008042, "grad_norm": 1.0570400953292847, "learning_rate": 2.5633005659815312e-05, "loss": 0.6952, "num_input_tokens_seen": 3993248, "step": 6885 }, { "epoch": 1.0262138814417634, "grad_norm": 1.0117523670196533, "learning_rate": 2.5651623473339294e-05, "loss": 0.7188, "num_input_tokens_seen": 3996256, "step": 6890 }, { "epoch": 1.0269585939827226, "grad_norm": 1.2411518096923828, "learning_rate": 2.5670241286863272e-05, "loss": 0.7151, "num_input_tokens_seen": 3999360, "step": 6895 }, { "epoch": 1.0277033065236818, "grad_norm": 1.2947218418121338, "learning_rate": 2.5688859100387253e-05, "loss": 0.722, "num_input_tokens_seen": 4002400, "step": 6900 }, { "epoch": 1.028448019064641, "grad_norm": 1.1460785865783691, "learning_rate": 2.570747691391123e-05, "loss": 0.7625, "num_input_tokens_seen": 4005664, "step": 6905 }, { "epoch": 1.0291927316056002, "grad_norm": 0.9544150233268738, "learning_rate": 2.5726094727435213e-05, "loss": 0.7015, "num_input_tokens_seen": 4008736, "step": 6910 }, { "epoch": 1.0299374441465594, "grad_norm": 0.8700282573699951, "learning_rate": 2.574471254095919e-05, "loss": 0.7035, "num_input_tokens_seen": 4011616, "step": 6915 }, { "epoch": 1.0306821566875186, "grad_norm": 1.117033839225769, "learning_rate": 2.5763330354483173e-05, "loss": 0.7047, "num_input_tokens_seen": 4014272, "step": 6920 }, { "epoch": 1.0314268692284778, "grad_norm": 0.8892152905464172, "learning_rate": 2.578194816800715e-05, "loss": 0.6084, "num_input_tokens_seen": 4017248, "step": 6925 }, { "epoch": 1.032171581769437, "grad_norm": 0.9972197413444519, "learning_rate": 2.5800565981531133e-05, "loss": 0.756, "num_input_tokens_seen": 4019680, "step": 6930 }, { "epoch": 1.0329162943103962, "grad_norm": 0.8504201769828796, "learning_rate": 2.5819183795055108e-05, "loss": 0.7027, "num_input_tokens_seen": 4022336, "step": 6935 }, { "epoch": 1.0336610068513554, "grad_norm": 0.8922770619392395, "learning_rate": 2.5837801608579086e-05, "loss": 0.6238, "num_input_tokens_seen": 4025216, "step": 6940 }, { "epoch": 1.0344057193923146, "grad_norm": 1.7414772510528564, "learning_rate": 2.5856419422103068e-05, "loss": 0.8465, "num_input_tokens_seen": 4028064, "step": 6945 }, { "epoch": 1.0351504319332738, "grad_norm": 0.8047934770584106, "learning_rate": 2.587503723562705e-05, "loss": 0.5464, "num_input_tokens_seen": 4030912, "step": 6950 }, { "epoch": 1.035895144474233, "grad_norm": 0.6658483743667603, "learning_rate": 2.5893655049151027e-05, "loss": 0.6052, "num_input_tokens_seen": 4033952, "step": 6955 }, { "epoch": 1.0366398570151922, "grad_norm": 0.6781772375106812, "learning_rate": 2.591227286267501e-05, "loss": 0.6693, "num_input_tokens_seen": 4036544, "step": 6960 }, { "epoch": 1.0373845695561514, "grad_norm": 1.310895562171936, "learning_rate": 2.5930890676198987e-05, "loss": 0.6401, "num_input_tokens_seen": 4039296, "step": 6965 }, { "epoch": 1.0381292820971104, "grad_norm": 0.6101534962654114, "learning_rate": 2.594950848972297e-05, "loss": 0.6816, "num_input_tokens_seen": 4042400, "step": 6970 }, { "epoch": 1.0388739946380696, "grad_norm": 1.3581922054290771, "learning_rate": 2.5968126303246947e-05, "loss": 0.8191, "num_input_tokens_seen": 4045152, "step": 6975 }, { "epoch": 1.0396187071790288, "grad_norm": 1.0592827796936035, "learning_rate": 2.598674411677093e-05, "loss": 0.7581, "num_input_tokens_seen": 4048032, "step": 6980 }, { "epoch": 1.040363419719988, "grad_norm": 0.9179674983024597, "learning_rate": 2.6005361930294907e-05, "loss": 0.7682, "num_input_tokens_seen": 4050752, "step": 6985 }, { "epoch": 1.0411081322609472, "grad_norm": 0.7526964545249939, "learning_rate": 2.602397974381889e-05, "loss": 0.7304, "num_input_tokens_seen": 4053600, "step": 6990 }, { "epoch": 1.0418528448019064, "grad_norm": 0.6860525608062744, "learning_rate": 2.6042597557342867e-05, "loss": 0.6905, "num_input_tokens_seen": 4056608, "step": 6995 }, { "epoch": 1.0425975573428656, "grad_norm": 1.104139804840088, "learning_rate": 2.606121537086685e-05, "loss": 0.6756, "num_input_tokens_seen": 4059616, "step": 7000 }, { "epoch": 1.0433422698838248, "grad_norm": 0.7303552031517029, "learning_rate": 2.6079833184390827e-05, "loss": 0.6982, "num_input_tokens_seen": 4062688, "step": 7005 }, { "epoch": 1.044086982424784, "grad_norm": 1.2335530519485474, "learning_rate": 2.609845099791481e-05, "loss": 0.6676, "num_input_tokens_seen": 4065408, "step": 7010 }, { "epoch": 1.0448316949657432, "grad_norm": 0.9288187026977539, "learning_rate": 2.611706881143879e-05, "loss": 0.7566, "num_input_tokens_seen": 4068288, "step": 7015 }, { "epoch": 1.0455764075067024, "grad_norm": 1.0579841136932373, "learning_rate": 2.613568662496276e-05, "loss": 0.6859, "num_input_tokens_seen": 4070944, "step": 7020 }, { "epoch": 1.0463211200476616, "grad_norm": 1.0759884119033813, "learning_rate": 2.6154304438486743e-05, "loss": 0.7845, "num_input_tokens_seen": 4073696, "step": 7025 }, { "epoch": 1.0470658325886208, "grad_norm": 0.8192229270935059, "learning_rate": 2.6172922252010725e-05, "loss": 0.6505, "num_input_tokens_seen": 4076576, "step": 7030 }, { "epoch": 1.04781054512958, "grad_norm": 1.4710869789123535, "learning_rate": 2.6191540065534703e-05, "loss": 0.6079, "num_input_tokens_seen": 4079136, "step": 7035 }, { "epoch": 1.0485552576705393, "grad_norm": 0.8333053588867188, "learning_rate": 2.6210157879058685e-05, "loss": 0.7989, "num_input_tokens_seen": 4082208, "step": 7040 }, { "epoch": 1.0492999702114985, "grad_norm": 0.958477795124054, "learning_rate": 2.6228775692582663e-05, "loss": 0.8925, "num_input_tokens_seen": 4085120, "step": 7045 }, { "epoch": 1.0500446827524577, "grad_norm": 0.6531829833984375, "learning_rate": 2.6247393506106644e-05, "loss": 0.6069, "num_input_tokens_seen": 4087712, "step": 7050 }, { "epoch": 1.0507893952934166, "grad_norm": 0.9432560205459595, "learning_rate": 2.6266011319630623e-05, "loss": 0.7637, "num_input_tokens_seen": 4090464, "step": 7055 }, { "epoch": 1.0515341078343758, "grad_norm": 0.7140146493911743, "learning_rate": 2.6284629133154604e-05, "loss": 0.7282, "num_input_tokens_seen": 4093184, "step": 7060 }, { "epoch": 1.052278820375335, "grad_norm": 0.7034874558448792, "learning_rate": 2.6303246946678582e-05, "loss": 0.6935, "num_input_tokens_seen": 4095968, "step": 7065 }, { "epoch": 1.0530235329162942, "grad_norm": 1.458244800567627, "learning_rate": 2.6321864760202564e-05, "loss": 0.7664, "num_input_tokens_seen": 4098912, "step": 7070 }, { "epoch": 1.0537682454572534, "grad_norm": 0.6398018002510071, "learning_rate": 2.6340482573726542e-05, "loss": 0.6456, "num_input_tokens_seen": 4101696, "step": 7075 }, { "epoch": 1.0545129579982127, "grad_norm": 0.9394655227661133, "learning_rate": 2.6359100387250524e-05, "loss": 0.6401, "num_input_tokens_seen": 4104736, "step": 7080 }, { "epoch": 1.0552576705391719, "grad_norm": 0.7626346945762634, "learning_rate": 2.6377718200774502e-05, "loss": 0.7211, "num_input_tokens_seen": 4107776, "step": 7085 }, { "epoch": 1.056002383080131, "grad_norm": 1.2718104124069214, "learning_rate": 2.6396336014298484e-05, "loss": 0.6524, "num_input_tokens_seen": 4110592, "step": 7090 }, { "epoch": 1.0567470956210903, "grad_norm": 1.1824461221694946, "learning_rate": 2.6414953827822465e-05, "loss": 0.7607, "num_input_tokens_seen": 4113568, "step": 7095 }, { "epoch": 1.0574918081620495, "grad_norm": 0.8538609147071838, "learning_rate": 2.6433571641346444e-05, "loss": 0.7692, "num_input_tokens_seen": 4116608, "step": 7100 }, { "epoch": 1.0582365207030087, "grad_norm": 0.8403412699699402, "learning_rate": 2.645218945487042e-05, "loss": 0.6979, "num_input_tokens_seen": 4119712, "step": 7105 }, { "epoch": 1.0589812332439679, "grad_norm": 0.7696439623832703, "learning_rate": 2.64708072683944e-05, "loss": 0.6906, "num_input_tokens_seen": 4122336, "step": 7110 }, { "epoch": 1.059725945784927, "grad_norm": 1.1281399726867676, "learning_rate": 2.648942508191838e-05, "loss": 0.6662, "num_input_tokens_seen": 4125088, "step": 7115 }, { "epoch": 1.0604706583258863, "grad_norm": 1.0328750610351562, "learning_rate": 2.650804289544236e-05, "loss": 0.6964, "num_input_tokens_seen": 4128448, "step": 7120 }, { "epoch": 1.0612153708668455, "grad_norm": 0.9428989887237549, "learning_rate": 2.6526660708966338e-05, "loss": 0.7069, "num_input_tokens_seen": 4131520, "step": 7125 }, { "epoch": 1.0619600834078047, "grad_norm": 0.9294420480728149, "learning_rate": 2.654527852249032e-05, "loss": 0.7869, "num_input_tokens_seen": 4134944, "step": 7130 }, { "epoch": 1.0627047959487639, "grad_norm": 0.7153878808021545, "learning_rate": 2.6563896336014298e-05, "loss": 0.749, "num_input_tokens_seen": 4138016, "step": 7135 }, { "epoch": 1.063449508489723, "grad_norm": 0.8763701915740967, "learning_rate": 2.658251414953828e-05, "loss": 0.7659, "num_input_tokens_seen": 4140896, "step": 7140 }, { "epoch": 1.064194221030682, "grad_norm": 0.8516419529914856, "learning_rate": 2.6601131963062258e-05, "loss": 0.7162, "num_input_tokens_seen": 4143584, "step": 7145 }, { "epoch": 1.0649389335716413, "grad_norm": 1.1870859861373901, "learning_rate": 2.661974977658624e-05, "loss": 0.7065, "num_input_tokens_seen": 4146528, "step": 7150 }, { "epoch": 1.0656836461126005, "grad_norm": 0.834107518196106, "learning_rate": 2.6638367590110218e-05, "loss": 0.7127, "num_input_tokens_seen": 4149536, "step": 7155 }, { "epoch": 1.0664283586535597, "grad_norm": 0.73896723985672, "learning_rate": 2.66569854036342e-05, "loss": 0.7599, "num_input_tokens_seen": 4152384, "step": 7160 }, { "epoch": 1.0671730711945189, "grad_norm": 0.8299586772918701, "learning_rate": 2.667560321715818e-05, "loss": 0.5884, "num_input_tokens_seen": 4155200, "step": 7165 }, { "epoch": 1.067917783735478, "grad_norm": 0.7563808560371399, "learning_rate": 2.669422103068216e-05, "loss": 0.6768, "num_input_tokens_seen": 4158112, "step": 7170 }, { "epoch": 1.0686624962764373, "grad_norm": 0.9805957078933716, "learning_rate": 2.671283884420614e-05, "loss": 0.8034, "num_input_tokens_seen": 4160768, "step": 7175 }, { "epoch": 1.0694072088173965, "grad_norm": 0.8961006999015808, "learning_rate": 2.673145665773012e-05, "loss": 0.7177, "num_input_tokens_seen": 4163872, "step": 7180 }, { "epoch": 1.0701519213583557, "grad_norm": 0.9870815277099609, "learning_rate": 2.67500744712541e-05, "loss": 0.7481, "num_input_tokens_seen": 4166816, "step": 7185 }, { "epoch": 1.0708966338993149, "grad_norm": 1.59038507938385, "learning_rate": 2.676869228477808e-05, "loss": 0.7454, "num_input_tokens_seen": 4169568, "step": 7190 }, { "epoch": 1.071641346440274, "grad_norm": 0.9971291422843933, "learning_rate": 2.6787310098302054e-05, "loss": 0.7318, "num_input_tokens_seen": 4172480, "step": 7195 }, { "epoch": 1.0723860589812333, "grad_norm": 1.061975359916687, "learning_rate": 2.6805927911826035e-05, "loss": 0.8086, "num_input_tokens_seen": 4175744, "step": 7200 }, { "epoch": 1.0731307715221925, "grad_norm": 0.6944595575332642, "learning_rate": 2.6824545725350014e-05, "loss": 0.5908, "num_input_tokens_seen": 4178528, "step": 7205 }, { "epoch": 1.0738754840631517, "grad_norm": 0.897612988948822, "learning_rate": 2.6843163538873995e-05, "loss": 0.6988, "num_input_tokens_seen": 4181728, "step": 7210 }, { "epoch": 1.074620196604111, "grad_norm": 1.0961040258407593, "learning_rate": 2.6861781352397974e-05, "loss": 0.6848, "num_input_tokens_seen": 4184320, "step": 7215 }, { "epoch": 1.07536490914507, "grad_norm": 1.3661350011825562, "learning_rate": 2.6880399165921955e-05, "loss": 0.8244, "num_input_tokens_seen": 4187040, "step": 7220 }, { "epoch": 1.076109621686029, "grad_norm": 1.2718486785888672, "learning_rate": 2.6899016979445933e-05, "loss": 0.6351, "num_input_tokens_seen": 4190144, "step": 7225 }, { "epoch": 1.0768543342269883, "grad_norm": 0.8920322060585022, "learning_rate": 2.6917634792969915e-05, "loss": 0.7694, "num_input_tokens_seen": 4193152, "step": 7230 }, { "epoch": 1.0775990467679475, "grad_norm": 0.8520169854164124, "learning_rate": 2.6936252606493893e-05, "loss": 0.6715, "num_input_tokens_seen": 4196000, "step": 7235 }, { "epoch": 1.0783437593089067, "grad_norm": 0.7126653790473938, "learning_rate": 2.6954870420017875e-05, "loss": 0.7829, "num_input_tokens_seen": 4198880, "step": 7240 }, { "epoch": 1.079088471849866, "grad_norm": 1.030374526977539, "learning_rate": 2.6973488233541856e-05, "loss": 0.69, "num_input_tokens_seen": 4201664, "step": 7245 }, { "epoch": 1.079833184390825, "grad_norm": 0.8495602011680603, "learning_rate": 2.6992106047065835e-05, "loss": 0.761, "num_input_tokens_seen": 4204672, "step": 7250 }, { "epoch": 1.0805778969317843, "grad_norm": 1.0186512470245361, "learning_rate": 2.7010723860589816e-05, "loss": 0.8058, "num_input_tokens_seen": 4207552, "step": 7255 }, { "epoch": 1.0813226094727435, "grad_norm": 1.3130773305892944, "learning_rate": 2.7029341674113795e-05, "loss": 0.7389, "num_input_tokens_seen": 4210336, "step": 7260 }, { "epoch": 1.0820673220137027, "grad_norm": 1.1135194301605225, "learning_rate": 2.7047959487637776e-05, "loss": 0.7597, "num_input_tokens_seen": 4212992, "step": 7265 }, { "epoch": 1.082812034554662, "grad_norm": 0.9975517392158508, "learning_rate": 2.7066577301161754e-05, "loss": 0.6416, "num_input_tokens_seen": 4215712, "step": 7270 }, { "epoch": 1.083556747095621, "grad_norm": 1.0543055534362793, "learning_rate": 2.7085195114685736e-05, "loss": 0.6693, "num_input_tokens_seen": 4219072, "step": 7275 }, { "epoch": 1.0843014596365803, "grad_norm": 0.6500024795532227, "learning_rate": 2.710381292820971e-05, "loss": 0.7725, "num_input_tokens_seen": 4221856, "step": 7280 }, { "epoch": 1.0850461721775395, "grad_norm": 0.831123948097229, "learning_rate": 2.712243074173369e-05, "loss": 0.6245, "num_input_tokens_seen": 4224896, "step": 7285 }, { "epoch": 1.0857908847184987, "grad_norm": 0.7743117809295654, "learning_rate": 2.714104855525767e-05, "loss": 0.7641, "num_input_tokens_seen": 4227680, "step": 7290 }, { "epoch": 1.086535597259458, "grad_norm": 0.8011530041694641, "learning_rate": 2.715966636878165e-05, "loss": 0.688, "num_input_tokens_seen": 4230432, "step": 7295 }, { "epoch": 1.0872803098004171, "grad_norm": 1.0381929874420166, "learning_rate": 2.717828418230563e-05, "loss": 0.6838, "num_input_tokens_seen": 4233632, "step": 7300 }, { "epoch": 1.0880250223413763, "grad_norm": 1.206402063369751, "learning_rate": 2.719690199582961e-05, "loss": 0.7664, "num_input_tokens_seen": 4237024, "step": 7305 }, { "epoch": 1.0887697348823355, "grad_norm": 1.1504687070846558, "learning_rate": 2.721551980935359e-05, "loss": 0.9497, "num_input_tokens_seen": 4240320, "step": 7310 }, { "epoch": 1.0895144474232945, "grad_norm": 0.6395018100738525, "learning_rate": 2.723413762287757e-05, "loss": 0.7577, "num_input_tokens_seen": 4242976, "step": 7315 }, { "epoch": 1.0902591599642537, "grad_norm": 0.888832688331604, "learning_rate": 2.725275543640155e-05, "loss": 0.812, "num_input_tokens_seen": 4245792, "step": 7320 }, { "epoch": 1.091003872505213, "grad_norm": 1.0079768896102905, "learning_rate": 2.7271373249925532e-05, "loss": 0.7718, "num_input_tokens_seen": 4248608, "step": 7325 }, { "epoch": 1.0917485850461721, "grad_norm": 0.8496783971786499, "learning_rate": 2.728999106344951e-05, "loss": 0.6844, "num_input_tokens_seen": 4251616, "step": 7330 }, { "epoch": 1.0924932975871313, "grad_norm": 0.8117131590843201, "learning_rate": 2.7308608876973492e-05, "loss": 0.668, "num_input_tokens_seen": 4254720, "step": 7335 }, { "epoch": 1.0932380101280905, "grad_norm": 0.7925665974617004, "learning_rate": 2.732722669049747e-05, "loss": 0.6489, "num_input_tokens_seen": 4257568, "step": 7340 }, { "epoch": 1.0939827226690497, "grad_norm": 1.8298157453536987, "learning_rate": 2.734584450402145e-05, "loss": 0.7864, "num_input_tokens_seen": 4260384, "step": 7345 }, { "epoch": 1.094727435210009, "grad_norm": 1.11896550655365, "learning_rate": 2.736446231754543e-05, "loss": 0.7756, "num_input_tokens_seen": 4263616, "step": 7350 }, { "epoch": 1.0954721477509681, "grad_norm": 0.6904885768890381, "learning_rate": 2.738308013106941e-05, "loss": 0.6403, "num_input_tokens_seen": 4266368, "step": 7355 }, { "epoch": 1.0962168602919273, "grad_norm": 0.7370844483375549, "learning_rate": 2.740169794459339e-05, "loss": 0.7679, "num_input_tokens_seen": 4269120, "step": 7360 }, { "epoch": 1.0969615728328865, "grad_norm": 1.0121010541915894, "learning_rate": 2.7420315758117365e-05, "loss": 0.6917, "num_input_tokens_seen": 4272064, "step": 7365 }, { "epoch": 1.0977062853738457, "grad_norm": 0.8445209860801697, "learning_rate": 2.7438933571641346e-05, "loss": 0.7513, "num_input_tokens_seen": 4275072, "step": 7370 }, { "epoch": 1.098450997914805, "grad_norm": 1.2779982089996338, "learning_rate": 2.7457551385165324e-05, "loss": 0.7534, "num_input_tokens_seen": 4278240, "step": 7375 }, { "epoch": 1.0991957104557641, "grad_norm": 1.018578290939331, "learning_rate": 2.7476169198689306e-05, "loss": 0.7324, "num_input_tokens_seen": 4281184, "step": 7380 }, { "epoch": 1.0999404229967233, "grad_norm": 0.8786332011222839, "learning_rate": 2.7494787012213284e-05, "loss": 0.6482, "num_input_tokens_seen": 4284032, "step": 7385 }, { "epoch": 1.1006851355376825, "grad_norm": 0.937971293926239, "learning_rate": 2.7513404825737266e-05, "loss": 0.6445, "num_input_tokens_seen": 4286976, "step": 7390 }, { "epoch": 1.1014298480786415, "grad_norm": 0.8894299268722534, "learning_rate": 2.7532022639261244e-05, "loss": 0.6803, "num_input_tokens_seen": 4289920, "step": 7395 }, { "epoch": 1.1021745606196007, "grad_norm": 1.0183521509170532, "learning_rate": 2.7550640452785226e-05, "loss": 0.6643, "num_input_tokens_seen": 4292768, "step": 7400 }, { "epoch": 1.10291927316056, "grad_norm": 1.098241925239563, "learning_rate": 2.7569258266309207e-05, "loss": 0.7713, "num_input_tokens_seen": 4295840, "step": 7405 }, { "epoch": 1.1036639857015191, "grad_norm": 0.9871764183044434, "learning_rate": 2.7587876079833186e-05, "loss": 0.6575, "num_input_tokens_seen": 4298976, "step": 7410 }, { "epoch": 1.1044086982424783, "grad_norm": 1.101081371307373, "learning_rate": 2.7606493893357167e-05, "loss": 0.6866, "num_input_tokens_seen": 4302016, "step": 7415 }, { "epoch": 1.1051534107834375, "grad_norm": 1.8824421167373657, "learning_rate": 2.7625111706881145e-05, "loss": 0.6611, "num_input_tokens_seen": 4305216, "step": 7420 }, { "epoch": 1.1058981233243967, "grad_norm": 0.5840743184089661, "learning_rate": 2.7643729520405127e-05, "loss": 0.7074, "num_input_tokens_seen": 4308032, "step": 7425 }, { "epoch": 1.106642835865356, "grad_norm": 0.6115108132362366, "learning_rate": 2.7662347333929105e-05, "loss": 0.7498, "num_input_tokens_seen": 4310752, "step": 7430 }, { "epoch": 1.1073875484063151, "grad_norm": 0.9788126945495605, "learning_rate": 2.7680965147453087e-05, "loss": 0.6965, "num_input_tokens_seen": 4313344, "step": 7435 }, { "epoch": 1.1081322609472744, "grad_norm": 1.1444023847579956, "learning_rate": 2.7699582960977065e-05, "loss": 0.7145, "num_input_tokens_seen": 4316000, "step": 7440 }, { "epoch": 1.1088769734882336, "grad_norm": 1.1934845447540283, "learning_rate": 2.7718200774501047e-05, "loss": 0.8682, "num_input_tokens_seen": 4318688, "step": 7445 }, { "epoch": 1.1096216860291928, "grad_norm": 1.1951128244400024, "learning_rate": 2.7736818588025025e-05, "loss": 0.7071, "num_input_tokens_seen": 4321440, "step": 7450 }, { "epoch": 1.110366398570152, "grad_norm": 0.8274325132369995, "learning_rate": 2.7755436401549e-05, "loss": 0.6895, "num_input_tokens_seen": 4324224, "step": 7455 }, { "epoch": 1.1111111111111112, "grad_norm": 0.9168515205383301, "learning_rate": 2.777405421507298e-05, "loss": 0.7111, "num_input_tokens_seen": 4327008, "step": 7460 }, { "epoch": 1.1118558236520704, "grad_norm": 0.9370004534721375, "learning_rate": 2.779267202859696e-05, "loss": 0.6456, "num_input_tokens_seen": 4330112, "step": 7465 }, { "epoch": 1.1126005361930296, "grad_norm": 0.7702921628952026, "learning_rate": 2.781128984212094e-05, "loss": 0.5729, "num_input_tokens_seen": 4332992, "step": 7470 }, { "epoch": 1.1133452487339888, "grad_norm": 0.7795849442481995, "learning_rate": 2.782990765564492e-05, "loss": 0.8336, "num_input_tokens_seen": 4336128, "step": 7475 }, { "epoch": 1.114089961274948, "grad_norm": 0.9704838395118713, "learning_rate": 2.78485254691689e-05, "loss": 0.8469, "num_input_tokens_seen": 4339040, "step": 7480 }, { "epoch": 1.1148346738159072, "grad_norm": 1.7229372262954712, "learning_rate": 2.7867143282692883e-05, "loss": 0.7464, "num_input_tokens_seen": 4341920, "step": 7485 }, { "epoch": 1.1155793863568662, "grad_norm": 0.7659422159194946, "learning_rate": 2.788576109621686e-05, "loss": 0.7437, "num_input_tokens_seen": 4344960, "step": 7490 }, { "epoch": 1.1163240988978254, "grad_norm": 1.0871821641921997, "learning_rate": 2.7904378909740843e-05, "loss": 0.7656, "num_input_tokens_seen": 4347936, "step": 7495 }, { "epoch": 1.1170688114387846, "grad_norm": 0.9381280541419983, "learning_rate": 2.792299672326482e-05, "loss": 0.8067, "num_input_tokens_seen": 4350688, "step": 7500 }, { "epoch": 1.1178135239797438, "grad_norm": 1.189469575881958, "learning_rate": 2.7941614536788802e-05, "loss": 0.7884, "num_input_tokens_seen": 4353664, "step": 7505 }, { "epoch": 1.118558236520703, "grad_norm": 1.042815089225769, "learning_rate": 2.796023235031278e-05, "loss": 0.7019, "num_input_tokens_seen": 4356736, "step": 7510 }, { "epoch": 1.1193029490616622, "grad_norm": 1.1911628246307373, "learning_rate": 2.7978850163836762e-05, "loss": 0.631, "num_input_tokens_seen": 4359808, "step": 7515 }, { "epoch": 1.1200476616026214, "grad_norm": 0.952896773815155, "learning_rate": 2.799746797736074e-05, "loss": 0.755, "num_input_tokens_seen": 4362752, "step": 7520 }, { "epoch": 1.1207923741435806, "grad_norm": 0.9991500377655029, "learning_rate": 2.8016085790884722e-05, "loss": 0.7032, "num_input_tokens_seen": 4365632, "step": 7525 }, { "epoch": 1.1215370866845398, "grad_norm": 0.7498750686645508, "learning_rate": 2.80347036044087e-05, "loss": 0.7824, "num_input_tokens_seen": 4368512, "step": 7530 }, { "epoch": 1.122281799225499, "grad_norm": 1.1592961549758911, "learning_rate": 2.8053321417932682e-05, "loss": 0.7261, "num_input_tokens_seen": 4371584, "step": 7535 }, { "epoch": 1.1230265117664582, "grad_norm": 0.662933349609375, "learning_rate": 2.8071939231456657e-05, "loss": 0.6715, "num_input_tokens_seen": 4374336, "step": 7540 }, { "epoch": 1.1237712243074174, "grad_norm": 0.8035348653793335, "learning_rate": 2.8090557044980635e-05, "loss": 0.6279, "num_input_tokens_seen": 4377312, "step": 7545 }, { "epoch": 1.1245159368483766, "grad_norm": 0.792348325252533, "learning_rate": 2.8109174858504617e-05, "loss": 0.6929, "num_input_tokens_seen": 4380064, "step": 7550 }, { "epoch": 1.1252606493893358, "grad_norm": 1.0006194114685059, "learning_rate": 2.81277926720286e-05, "loss": 0.8584, "num_input_tokens_seen": 4382816, "step": 7555 }, { "epoch": 1.126005361930295, "grad_norm": 0.904651403427124, "learning_rate": 2.8146410485552577e-05, "loss": 0.7513, "num_input_tokens_seen": 4385568, "step": 7560 }, { "epoch": 1.1267500744712542, "grad_norm": 0.7547245025634766, "learning_rate": 2.8165028299076558e-05, "loss": 0.5817, "num_input_tokens_seen": 4388608, "step": 7565 }, { "epoch": 1.1274947870122132, "grad_norm": 1.2495418787002563, "learning_rate": 2.8183646112600536e-05, "loss": 0.7586, "num_input_tokens_seen": 4391744, "step": 7570 }, { "epoch": 1.1282394995531724, "grad_norm": 0.6273325085639954, "learning_rate": 2.8202263926124518e-05, "loss": 0.6947, "num_input_tokens_seen": 4394688, "step": 7575 }, { "epoch": 1.1289842120941316, "grad_norm": 1.070041298866272, "learning_rate": 2.8220881739648496e-05, "loss": 0.6866, "num_input_tokens_seen": 4397696, "step": 7580 }, { "epoch": 1.1297289246350908, "grad_norm": 1.4334360361099243, "learning_rate": 2.8239499553172478e-05, "loss": 0.7802, "num_input_tokens_seen": 4400608, "step": 7585 }, { "epoch": 1.13047363717605, "grad_norm": 0.8796997666358948, "learning_rate": 2.8258117366696456e-05, "loss": 0.6545, "num_input_tokens_seen": 4403552, "step": 7590 }, { "epoch": 1.1312183497170092, "grad_norm": 0.7016701698303223, "learning_rate": 2.8276735180220438e-05, "loss": 0.8134, "num_input_tokens_seen": 4406496, "step": 7595 }, { "epoch": 1.1319630622579684, "grad_norm": 0.8946820497512817, "learning_rate": 2.8295352993744416e-05, "loss": 0.698, "num_input_tokens_seen": 4409120, "step": 7600 }, { "epoch": 1.1327077747989276, "grad_norm": 0.8323245644569397, "learning_rate": 2.8313970807268398e-05, "loss": 0.7876, "num_input_tokens_seen": 4411744, "step": 7605 }, { "epoch": 1.1334524873398868, "grad_norm": 0.6717939376831055, "learning_rate": 2.8332588620792376e-05, "loss": 0.5532, "num_input_tokens_seen": 4414720, "step": 7610 }, { "epoch": 1.134197199880846, "grad_norm": 0.6846427917480469, "learning_rate": 2.8351206434316357e-05, "loss": 0.7561, "num_input_tokens_seen": 4417472, "step": 7615 }, { "epoch": 1.1349419124218052, "grad_norm": 0.9240310192108154, "learning_rate": 2.836982424784034e-05, "loss": 0.7343, "num_input_tokens_seen": 4420640, "step": 7620 }, { "epoch": 1.1356866249627644, "grad_norm": 1.3633363246917725, "learning_rate": 2.838844206136431e-05, "loss": 0.6733, "num_input_tokens_seen": 4423520, "step": 7625 }, { "epoch": 1.1364313375037236, "grad_norm": 0.8800905346870422, "learning_rate": 2.8407059874888292e-05, "loss": 0.6657, "num_input_tokens_seen": 4426496, "step": 7630 }, { "epoch": 1.1371760500446828, "grad_norm": 0.6934430003166199, "learning_rate": 2.8425677688412274e-05, "loss": 0.6925, "num_input_tokens_seen": 4429440, "step": 7635 }, { "epoch": 1.137920762585642, "grad_norm": 0.712548017501831, "learning_rate": 2.8444295501936252e-05, "loss": 0.6976, "num_input_tokens_seen": 4432320, "step": 7640 }, { "epoch": 1.1386654751266012, "grad_norm": 0.8137526512145996, "learning_rate": 2.8462913315460234e-05, "loss": 0.646, "num_input_tokens_seen": 4435584, "step": 7645 }, { "epoch": 1.1394101876675604, "grad_norm": 0.6886258721351624, "learning_rate": 2.8481531128984212e-05, "loss": 0.6926, "num_input_tokens_seen": 4438464, "step": 7650 }, { "epoch": 1.1401549002085196, "grad_norm": 0.6910961866378784, "learning_rate": 2.8500148942508193e-05, "loss": 0.8053, "num_input_tokens_seen": 4441120, "step": 7655 }, { "epoch": 1.1408996127494788, "grad_norm": 0.6412034630775452, "learning_rate": 2.8518766756032172e-05, "loss": 0.5882, "num_input_tokens_seen": 4443680, "step": 7660 }, { "epoch": 1.1416443252904378, "grad_norm": 0.8726643919944763, "learning_rate": 2.8537384569556153e-05, "loss": 0.5419, "num_input_tokens_seen": 4446528, "step": 7665 }, { "epoch": 1.142389037831397, "grad_norm": 0.8637129664421082, "learning_rate": 2.855600238308013e-05, "loss": 0.6988, "num_input_tokens_seen": 4449504, "step": 7670 }, { "epoch": 1.1431337503723562, "grad_norm": 1.0360543727874756, "learning_rate": 2.8574620196604113e-05, "loss": 0.6795, "num_input_tokens_seen": 4452352, "step": 7675 }, { "epoch": 1.1438784629133154, "grad_norm": 1.104764461517334, "learning_rate": 2.859323801012809e-05, "loss": 0.6607, "num_input_tokens_seen": 4455136, "step": 7680 }, { "epoch": 1.1446231754542746, "grad_norm": 1.325369954109192, "learning_rate": 2.8611855823652073e-05, "loss": 0.8426, "num_input_tokens_seen": 4458144, "step": 7685 }, { "epoch": 1.1453678879952338, "grad_norm": 1.0204452276229858, "learning_rate": 2.863047363717605e-05, "loss": 0.576, "num_input_tokens_seen": 4460800, "step": 7690 }, { "epoch": 1.146112600536193, "grad_norm": 0.9819490313529968, "learning_rate": 2.8649091450700033e-05, "loss": 0.6562, "num_input_tokens_seen": 4463904, "step": 7695 }, { "epoch": 1.1468573130771522, "grad_norm": 1.3573311567306519, "learning_rate": 2.8667709264224015e-05, "loss": 0.7326, "num_input_tokens_seen": 4466816, "step": 7700 }, { "epoch": 1.1476020256181114, "grad_norm": 1.637097716331482, "learning_rate": 2.8686327077747993e-05, "loss": 0.8738, "num_input_tokens_seen": 4469504, "step": 7705 }, { "epoch": 1.1483467381590706, "grad_norm": 2.0266594886779785, "learning_rate": 2.8704944891271968e-05, "loss": 0.7419, "num_input_tokens_seen": 4472736, "step": 7710 }, { "epoch": 1.1490914507000298, "grad_norm": 1.3142850399017334, "learning_rate": 2.872356270479595e-05, "loss": 0.8264, "num_input_tokens_seen": 4475552, "step": 7715 }, { "epoch": 1.149836163240989, "grad_norm": 0.9423490762710571, "learning_rate": 2.8742180518319927e-05, "loss": 0.613, "num_input_tokens_seen": 4478432, "step": 7720 }, { "epoch": 1.1505808757819482, "grad_norm": 1.3539966344833374, "learning_rate": 2.876079833184391e-05, "loss": 0.7861, "num_input_tokens_seen": 4481312, "step": 7725 }, { "epoch": 1.1513255883229074, "grad_norm": 1.2644274234771729, "learning_rate": 2.8779416145367887e-05, "loss": 0.8315, "num_input_tokens_seen": 4484608, "step": 7730 }, { "epoch": 1.1520703008638666, "grad_norm": 1.1377617120742798, "learning_rate": 2.879803395889187e-05, "loss": 0.8173, "num_input_tokens_seen": 4487808, "step": 7735 }, { "epoch": 1.1528150134048256, "grad_norm": 1.4001001119613647, "learning_rate": 2.8816651772415847e-05, "loss": 0.762, "num_input_tokens_seen": 4490688, "step": 7740 }, { "epoch": 1.1535597259457848, "grad_norm": 0.7683276534080505, "learning_rate": 2.883526958593983e-05, "loss": 0.7678, "num_input_tokens_seen": 4493920, "step": 7745 }, { "epoch": 1.154304438486744, "grad_norm": 0.7876514196395874, "learning_rate": 2.8853887399463807e-05, "loss": 0.7805, "num_input_tokens_seen": 4496768, "step": 7750 }, { "epoch": 1.1550491510277032, "grad_norm": 0.6287990808486938, "learning_rate": 2.887250521298779e-05, "loss": 0.6321, "num_input_tokens_seen": 4499840, "step": 7755 }, { "epoch": 1.1557938635686624, "grad_norm": 1.1024727821350098, "learning_rate": 2.8891123026511767e-05, "loss": 0.8435, "num_input_tokens_seen": 4502912, "step": 7760 }, { "epoch": 1.1565385761096216, "grad_norm": 0.6805821657180786, "learning_rate": 2.890974084003575e-05, "loss": 0.6987, "num_input_tokens_seen": 4505856, "step": 7765 }, { "epoch": 1.1572832886505808, "grad_norm": 0.6235100626945496, "learning_rate": 2.892835865355973e-05, "loss": 0.7344, "num_input_tokens_seen": 4508928, "step": 7770 }, { "epoch": 1.15802800119154, "grad_norm": 0.6992446184158325, "learning_rate": 2.894697646708371e-05, "loss": 0.628, "num_input_tokens_seen": 4512064, "step": 7775 }, { "epoch": 1.1587727137324992, "grad_norm": 1.289401650428772, "learning_rate": 2.896559428060769e-05, "loss": 0.6028, "num_input_tokens_seen": 4514880, "step": 7780 }, { "epoch": 1.1595174262734584, "grad_norm": 1.131568193435669, "learning_rate": 2.8984212094131668e-05, "loss": 0.6068, "num_input_tokens_seen": 4517632, "step": 7785 }, { "epoch": 1.1602621388144176, "grad_norm": 1.51252281665802, "learning_rate": 2.900282990765565e-05, "loss": 0.7193, "num_input_tokens_seen": 4520416, "step": 7790 }, { "epoch": 1.1610068513553768, "grad_norm": 0.6696089506149292, "learning_rate": 2.9021447721179628e-05, "loss": 0.6942, "num_input_tokens_seen": 4523296, "step": 7795 }, { "epoch": 1.161751563896336, "grad_norm": 0.5660209059715271, "learning_rate": 2.9040065534703603e-05, "loss": 0.6088, "num_input_tokens_seen": 4526880, "step": 7800 }, { "epoch": 1.1624962764372953, "grad_norm": 1.1292290687561035, "learning_rate": 2.9058683348227584e-05, "loss": 0.7716, "num_input_tokens_seen": 4529504, "step": 7805 }, { "epoch": 1.1632409889782545, "grad_norm": 1.088244915008545, "learning_rate": 2.9077301161751563e-05, "loss": 0.7938, "num_input_tokens_seen": 4532320, "step": 7810 }, { "epoch": 1.1639857015192137, "grad_norm": 1.2785612344741821, "learning_rate": 2.9095918975275544e-05, "loss": 0.6337, "num_input_tokens_seen": 4535104, "step": 7815 }, { "epoch": 1.1647304140601729, "grad_norm": 0.7202649116516113, "learning_rate": 2.9114536788799523e-05, "loss": 0.7806, "num_input_tokens_seen": 4537600, "step": 7820 }, { "epoch": 1.165475126601132, "grad_norm": 0.8799245357513428, "learning_rate": 2.9133154602323504e-05, "loss": 0.7866, "num_input_tokens_seen": 4540512, "step": 7825 }, { "epoch": 1.1662198391420913, "grad_norm": 0.7753821611404419, "learning_rate": 2.9151772415847482e-05, "loss": 0.6539, "num_input_tokens_seen": 4543328, "step": 7830 }, { "epoch": 1.1669645516830505, "grad_norm": 0.5793158411979675, "learning_rate": 2.9170390229371464e-05, "loss": 0.7414, "num_input_tokens_seen": 4546176, "step": 7835 }, { "epoch": 1.1677092642240094, "grad_norm": 0.7822144627571106, "learning_rate": 2.9189008042895442e-05, "loss": 0.7323, "num_input_tokens_seen": 4549344, "step": 7840 }, { "epoch": 1.1684539767649686, "grad_norm": 0.6091794371604919, "learning_rate": 2.9207625856419424e-05, "loss": 0.714, "num_input_tokens_seen": 4552160, "step": 7845 }, { "epoch": 1.1691986893059279, "grad_norm": 0.9784433841705322, "learning_rate": 2.9226243669943406e-05, "loss": 0.7734, "num_input_tokens_seen": 4554880, "step": 7850 }, { "epoch": 1.169943401846887, "grad_norm": 0.8667701482772827, "learning_rate": 2.9244861483467384e-05, "loss": 0.7226, "num_input_tokens_seen": 4557696, "step": 7855 }, { "epoch": 1.1706881143878463, "grad_norm": 1.7799527645111084, "learning_rate": 2.9263479296991365e-05, "loss": 0.6984, "num_input_tokens_seen": 4560224, "step": 7860 }, { "epoch": 1.1714328269288055, "grad_norm": 1.1631261110305786, "learning_rate": 2.9282097110515344e-05, "loss": 0.6279, "num_input_tokens_seen": 4563168, "step": 7865 }, { "epoch": 1.1721775394697647, "grad_norm": 0.7777840495109558, "learning_rate": 2.9300714924039325e-05, "loss": 0.643, "num_input_tokens_seen": 4565856, "step": 7870 }, { "epoch": 1.1729222520107239, "grad_norm": 0.9031128883361816, "learning_rate": 2.9319332737563303e-05, "loss": 0.6794, "num_input_tokens_seen": 4569216, "step": 7875 }, { "epoch": 1.173666964551683, "grad_norm": 0.75895094871521, "learning_rate": 2.9337950551087285e-05, "loss": 0.651, "num_input_tokens_seen": 4572064, "step": 7880 }, { "epoch": 1.1744116770926423, "grad_norm": 0.9728659391403198, "learning_rate": 2.935656836461126e-05, "loss": 0.6881, "num_input_tokens_seen": 4575136, "step": 7885 }, { "epoch": 1.1751563896336015, "grad_norm": 1.4988508224487305, "learning_rate": 2.9375186178135238e-05, "loss": 0.7701, "num_input_tokens_seen": 4577856, "step": 7890 }, { "epoch": 1.1759011021745607, "grad_norm": 0.9239181280136108, "learning_rate": 2.939380399165922e-05, "loss": 0.7222, "num_input_tokens_seen": 4580608, "step": 7895 }, { "epoch": 1.1766458147155199, "grad_norm": 0.6953989267349243, "learning_rate": 2.9412421805183198e-05, "loss": 0.8492, "num_input_tokens_seen": 4583200, "step": 7900 }, { "epoch": 1.177390527256479, "grad_norm": 0.704331636428833, "learning_rate": 2.943103961870718e-05, "loss": 0.5938, "num_input_tokens_seen": 4585952, "step": 7905 }, { "epoch": 1.1781352397974383, "grad_norm": 0.6270862817764282, "learning_rate": 2.9449657432231158e-05, "loss": 0.6713, "num_input_tokens_seen": 4589056, "step": 7910 }, { "epoch": 1.1788799523383973, "grad_norm": 1.0517059564590454, "learning_rate": 2.946827524575514e-05, "loss": 0.6921, "num_input_tokens_seen": 4592096, "step": 7915 }, { "epoch": 1.1796246648793565, "grad_norm": 0.8243216276168823, "learning_rate": 2.9486893059279118e-05, "loss": 0.7149, "num_input_tokens_seen": 4594784, "step": 7920 }, { "epoch": 1.1803693774203157, "grad_norm": 0.8982364535331726, "learning_rate": 2.95055108728031e-05, "loss": 0.6605, "num_input_tokens_seen": 4597856, "step": 7925 }, { "epoch": 1.1811140899612749, "grad_norm": 0.8653134107589722, "learning_rate": 2.952412868632708e-05, "loss": 0.7704, "num_input_tokens_seen": 4600512, "step": 7930 }, { "epoch": 1.181858802502234, "grad_norm": 0.9329127073287964, "learning_rate": 2.954274649985106e-05, "loss": 0.8222, "num_input_tokens_seen": 4603328, "step": 7935 }, { "epoch": 1.1826035150431933, "grad_norm": 0.8912926316261292, "learning_rate": 2.956136431337504e-05, "loss": 0.8419, "num_input_tokens_seen": 4606144, "step": 7940 }, { "epoch": 1.1833482275841525, "grad_norm": 0.8912903070449829, "learning_rate": 2.957998212689902e-05, "loss": 0.722, "num_input_tokens_seen": 4609088, "step": 7945 }, { "epoch": 1.1840929401251117, "grad_norm": 0.997660219669342, "learning_rate": 2.9598599940423e-05, "loss": 0.7054, "num_input_tokens_seen": 4612160, "step": 7950 }, { "epoch": 1.1848376526660709, "grad_norm": 0.9489918947219849, "learning_rate": 2.961721775394698e-05, "loss": 0.7173, "num_input_tokens_seen": 4615072, "step": 7955 }, { "epoch": 1.18558236520703, "grad_norm": 0.7083187699317932, "learning_rate": 2.963583556747096e-05, "loss": 0.9217, "num_input_tokens_seen": 4617600, "step": 7960 }, { "epoch": 1.1863270777479893, "grad_norm": 2.3305912017822266, "learning_rate": 2.965445338099494e-05, "loss": 0.5976, "num_input_tokens_seen": 4620320, "step": 7965 }, { "epoch": 1.1870717902889485, "grad_norm": 0.9494674801826477, "learning_rate": 2.9673071194518914e-05, "loss": 0.6558, "num_input_tokens_seen": 4623520, "step": 7970 }, { "epoch": 1.1878165028299077, "grad_norm": 0.8875821828842163, "learning_rate": 2.9691689008042895e-05, "loss": 0.739, "num_input_tokens_seen": 4626656, "step": 7975 }, { "epoch": 1.188561215370867, "grad_norm": 0.9201183915138245, "learning_rate": 2.9710306821566873e-05, "loss": 0.7438, "num_input_tokens_seen": 4629408, "step": 7980 }, { "epoch": 1.189305927911826, "grad_norm": 1.2715339660644531, "learning_rate": 2.9728924635090855e-05, "loss": 0.5516, "num_input_tokens_seen": 4632320, "step": 7985 }, { "epoch": 1.1900506404527853, "grad_norm": 1.862755537033081, "learning_rate": 2.9747542448614833e-05, "loss": 0.9121, "num_input_tokens_seen": 4635040, "step": 7990 }, { "epoch": 1.1907953529937445, "grad_norm": 1.032943844795227, "learning_rate": 2.9766160262138815e-05, "loss": 0.7424, "num_input_tokens_seen": 4638048, "step": 7995 }, { "epoch": 1.1915400655347037, "grad_norm": 1.6752588748931885, "learning_rate": 2.9784778075662793e-05, "loss": 0.6911, "num_input_tokens_seen": 4640864, "step": 8000 }, { "epoch": 1.192284778075663, "grad_norm": 1.6118379831314087, "learning_rate": 2.9803395889186775e-05, "loss": 0.693, "num_input_tokens_seen": 4643488, "step": 8005 }, { "epoch": 1.193029490616622, "grad_norm": 0.946580708026886, "learning_rate": 2.9822013702710756e-05, "loss": 0.7351, "num_input_tokens_seen": 4646432, "step": 8010 }, { "epoch": 1.193774203157581, "grad_norm": 1.0250365734100342, "learning_rate": 2.9840631516234735e-05, "loss": 0.7215, "num_input_tokens_seen": 4649280, "step": 8015 }, { "epoch": 1.1945189156985403, "grad_norm": 0.838659942150116, "learning_rate": 2.9859249329758716e-05, "loss": 0.6845, "num_input_tokens_seen": 4652192, "step": 8020 }, { "epoch": 1.1952636282394995, "grad_norm": 0.8605855703353882, "learning_rate": 2.9877867143282694e-05, "loss": 0.6381, "num_input_tokens_seen": 4654848, "step": 8025 }, { "epoch": 1.1960083407804587, "grad_norm": 1.4215376377105713, "learning_rate": 2.9896484956806676e-05, "loss": 0.6265, "num_input_tokens_seen": 4657600, "step": 8030 }, { "epoch": 1.196753053321418, "grad_norm": 0.76213538646698, "learning_rate": 2.9915102770330654e-05, "loss": 0.7855, "num_input_tokens_seen": 4660352, "step": 8035 }, { "epoch": 1.197497765862377, "grad_norm": 0.6234158873558044, "learning_rate": 2.9933720583854636e-05, "loss": 0.6277, "num_input_tokens_seen": 4663296, "step": 8040 }, { "epoch": 1.1982424784033363, "grad_norm": 1.0556715726852417, "learning_rate": 2.9952338397378614e-05, "loss": 0.6783, "num_input_tokens_seen": 4666240, "step": 8045 }, { "epoch": 1.1989871909442955, "grad_norm": 0.8580628037452698, "learning_rate": 2.9970956210902596e-05, "loss": 0.7549, "num_input_tokens_seen": 4668992, "step": 8050 }, { "epoch": 1.1997319034852547, "grad_norm": 0.7158346176147461, "learning_rate": 2.9989574024426574e-05, "loss": 0.7091, "num_input_tokens_seen": 4671584, "step": 8055 }, { "epoch": 1.200476616026214, "grad_norm": 0.9567337036132812, "learning_rate": 3.000819183795055e-05, "loss": 0.7008, "num_input_tokens_seen": 4674144, "step": 8060 }, { "epoch": 1.2012213285671731, "grad_norm": 1.592180609703064, "learning_rate": 3.002680965147453e-05, "loss": 0.7265, "num_input_tokens_seen": 4676864, "step": 8065 }, { "epoch": 1.2019660411081323, "grad_norm": 0.786554753780365, "learning_rate": 3.004542746499851e-05, "loss": 0.7268, "num_input_tokens_seen": 4679744, "step": 8070 }, { "epoch": 1.2027107536490915, "grad_norm": 0.9063117504119873, "learning_rate": 3.006404527852249e-05, "loss": 0.7367, "num_input_tokens_seen": 4682592, "step": 8075 }, { "epoch": 1.2034554661900507, "grad_norm": 1.0968817472457886, "learning_rate": 3.008266309204647e-05, "loss": 0.8905, "num_input_tokens_seen": 4685344, "step": 8080 }, { "epoch": 1.2042001787310097, "grad_norm": 1.1521902084350586, "learning_rate": 3.010128090557045e-05, "loss": 0.7608, "num_input_tokens_seen": 4688000, "step": 8085 }, { "epoch": 1.204944891271969, "grad_norm": 0.7029673457145691, "learning_rate": 3.0119898719094432e-05, "loss": 0.6254, "num_input_tokens_seen": 4691456, "step": 8090 }, { "epoch": 1.2056896038129281, "grad_norm": 0.8130751848220825, "learning_rate": 3.013851653261841e-05, "loss": 0.6341, "num_input_tokens_seen": 4694432, "step": 8095 }, { "epoch": 1.2064343163538873, "grad_norm": 1.9240646362304688, "learning_rate": 3.015713434614239e-05, "loss": 0.8347, "num_input_tokens_seen": 4697376, "step": 8100 }, { "epoch": 1.2071790288948465, "grad_norm": 0.8688897490501404, "learning_rate": 3.017575215966637e-05, "loss": 0.7281, "num_input_tokens_seen": 4700320, "step": 8105 }, { "epoch": 1.2079237414358057, "grad_norm": 0.9129258990287781, "learning_rate": 3.019436997319035e-05, "loss": 0.742, "num_input_tokens_seen": 4703232, "step": 8110 }, { "epoch": 1.208668453976765, "grad_norm": 1.2196848392486572, "learning_rate": 3.021298778671433e-05, "loss": 0.6749, "num_input_tokens_seen": 4706208, "step": 8115 }, { "epoch": 1.2094131665177241, "grad_norm": 0.6944990158081055, "learning_rate": 3.023160560023831e-05, "loss": 0.6783, "num_input_tokens_seen": 4709056, "step": 8120 }, { "epoch": 1.2101578790586833, "grad_norm": 1.3692030906677246, "learning_rate": 3.025022341376229e-05, "loss": 0.7732, "num_input_tokens_seen": 4711904, "step": 8125 }, { "epoch": 1.2109025915996425, "grad_norm": 0.8976647853851318, "learning_rate": 3.026884122728627e-05, "loss": 0.7255, "num_input_tokens_seen": 4714560, "step": 8130 }, { "epoch": 1.2116473041406017, "grad_norm": 1.1871174573898315, "learning_rate": 3.028745904081025e-05, "loss": 0.735, "num_input_tokens_seen": 4717472, "step": 8135 }, { "epoch": 1.212392016681561, "grad_norm": 0.7257463335990906, "learning_rate": 3.030607685433423e-05, "loss": 0.78, "num_input_tokens_seen": 4720288, "step": 8140 }, { "epoch": 1.2131367292225201, "grad_norm": 0.9407142996788025, "learning_rate": 3.0324694667858206e-05, "loss": 0.7545, "num_input_tokens_seen": 4723360, "step": 8145 }, { "epoch": 1.2138814417634793, "grad_norm": 0.7022506594657898, "learning_rate": 3.0343312481382184e-05, "loss": 0.7025, "num_input_tokens_seen": 4726208, "step": 8150 }, { "epoch": 1.2146261543044385, "grad_norm": 3.1767172813415527, "learning_rate": 3.0361930294906166e-05, "loss": 0.8148, "num_input_tokens_seen": 4729024, "step": 8155 }, { "epoch": 1.2153708668453977, "grad_norm": 1.2012414932250977, "learning_rate": 3.0380548108430147e-05, "loss": 0.6805, "num_input_tokens_seen": 4731904, "step": 8160 }, { "epoch": 1.216115579386357, "grad_norm": 1.238655686378479, "learning_rate": 3.0399165921954126e-05, "loss": 0.7586, "num_input_tokens_seen": 4734432, "step": 8165 }, { "epoch": 1.2168602919273162, "grad_norm": 0.734833300113678, "learning_rate": 3.0417783735478107e-05, "loss": 0.5872, "num_input_tokens_seen": 4737216, "step": 8170 }, { "epoch": 1.2176050044682754, "grad_norm": 1.5637835264205933, "learning_rate": 3.0436401549002085e-05, "loss": 0.7209, "num_input_tokens_seen": 4739936, "step": 8175 }, { "epoch": 1.2183497170092346, "grad_norm": 0.6918932795524597, "learning_rate": 3.0455019362526067e-05, "loss": 0.6783, "num_input_tokens_seen": 4743072, "step": 8180 }, { "epoch": 1.2190944295501935, "grad_norm": 1.0992143154144287, "learning_rate": 3.0473637176050045e-05, "loss": 0.7397, "num_input_tokens_seen": 4745984, "step": 8185 }, { "epoch": 1.2198391420911527, "grad_norm": 0.6158088445663452, "learning_rate": 3.0492254989574027e-05, "loss": 0.7291, "num_input_tokens_seen": 4748896, "step": 8190 }, { "epoch": 1.220583854632112, "grad_norm": 0.9826266765594482, "learning_rate": 3.0510872803098005e-05, "loss": 0.7003, "num_input_tokens_seen": 4751808, "step": 8195 }, { "epoch": 1.2213285671730711, "grad_norm": 0.9465678930282593, "learning_rate": 3.052949061662199e-05, "loss": 0.8387, "num_input_tokens_seen": 4754496, "step": 8200 }, { "epoch": 1.2220732797140303, "grad_norm": 0.495975524187088, "learning_rate": 3.0548108430145965e-05, "loss": 0.662, "num_input_tokens_seen": 4757344, "step": 8205 }, { "epoch": 1.2228179922549895, "grad_norm": 1.039905309677124, "learning_rate": 3.056672624366994e-05, "loss": 0.7618, "num_input_tokens_seen": 4760480, "step": 8210 }, { "epoch": 1.2235627047959488, "grad_norm": 1.2029865980148315, "learning_rate": 3.058534405719393e-05, "loss": 0.7152, "num_input_tokens_seen": 4763200, "step": 8215 }, { "epoch": 1.224307417336908, "grad_norm": 0.9443116784095764, "learning_rate": 3.0603961870717907e-05, "loss": 0.6477, "num_input_tokens_seen": 4766144, "step": 8220 }, { "epoch": 1.2250521298778672, "grad_norm": 0.6667184829711914, "learning_rate": 3.0622579684241885e-05, "loss": 0.7124, "num_input_tokens_seen": 4769440, "step": 8225 }, { "epoch": 1.2257968424188264, "grad_norm": 0.9440143704414368, "learning_rate": 3.064119749776586e-05, "loss": 0.6838, "num_input_tokens_seen": 4772320, "step": 8230 }, { "epoch": 1.2265415549597856, "grad_norm": 0.7514712810516357, "learning_rate": 3.065981531128984e-05, "loss": 0.7633, "num_input_tokens_seen": 4775232, "step": 8235 }, { "epoch": 1.2272862675007448, "grad_norm": 0.8071455359458923, "learning_rate": 3.067843312481382e-05, "loss": 0.5505, "num_input_tokens_seen": 4778368, "step": 8240 }, { "epoch": 1.228030980041704, "grad_norm": 0.7538226842880249, "learning_rate": 3.0697050938337804e-05, "loss": 0.7426, "num_input_tokens_seen": 4781312, "step": 8245 }, { "epoch": 1.2287756925826632, "grad_norm": 0.6941511034965515, "learning_rate": 3.071566875186178e-05, "loss": 0.7679, "num_input_tokens_seen": 4784128, "step": 8250 }, { "epoch": 1.2295204051236224, "grad_norm": 0.7796671986579895, "learning_rate": 3.073428656538576e-05, "loss": 0.5529, "num_input_tokens_seen": 4786976, "step": 8255 }, { "epoch": 1.2302651176645814, "grad_norm": 0.6891672611236572, "learning_rate": 3.075290437890974e-05, "loss": 0.6579, "num_input_tokens_seen": 4789856, "step": 8260 }, { "epoch": 1.2310098302055406, "grad_norm": 1.1374956369400024, "learning_rate": 3.0771522192433724e-05, "loss": 0.7164, "num_input_tokens_seen": 4792704, "step": 8265 }, { "epoch": 1.2317545427464998, "grad_norm": 0.7580830454826355, "learning_rate": 3.07901400059577e-05, "loss": 0.7824, "num_input_tokens_seen": 4795712, "step": 8270 }, { "epoch": 1.232499255287459, "grad_norm": 0.7890099287033081, "learning_rate": 3.080875781948168e-05, "loss": 0.803, "num_input_tokens_seen": 4798560, "step": 8275 }, { "epoch": 1.2332439678284182, "grad_norm": 0.8590041399002075, "learning_rate": 3.082737563300566e-05, "loss": 0.7584, "num_input_tokens_seen": 4801312, "step": 8280 }, { "epoch": 1.2339886803693774, "grad_norm": 0.9849494099617004, "learning_rate": 3.0845993446529644e-05, "loss": 0.718, "num_input_tokens_seen": 4804192, "step": 8285 }, { "epoch": 1.2347333929103366, "grad_norm": 1.7426671981811523, "learning_rate": 3.086461126005362e-05, "loss": 0.7263, "num_input_tokens_seen": 4807072, "step": 8290 }, { "epoch": 1.2354781054512958, "grad_norm": 0.7276670932769775, "learning_rate": 3.08832290735776e-05, "loss": 0.5503, "num_input_tokens_seen": 4810048, "step": 8295 }, { "epoch": 1.236222817992255, "grad_norm": 0.939372181892395, "learning_rate": 3.0901846887101585e-05, "loss": 0.7487, "num_input_tokens_seen": 4813120, "step": 8300 }, { "epoch": 1.2369675305332142, "grad_norm": 0.9002695679664612, "learning_rate": 3.0920464700625564e-05, "loss": 0.7133, "num_input_tokens_seen": 4816128, "step": 8305 }, { "epoch": 1.2377122430741734, "grad_norm": 0.9662193059921265, "learning_rate": 3.093908251414954e-05, "loss": 0.83, "num_input_tokens_seen": 4818976, "step": 8310 }, { "epoch": 1.2384569556151326, "grad_norm": 1.0514633655548096, "learning_rate": 3.095770032767352e-05, "loss": 0.8149, "num_input_tokens_seen": 4821760, "step": 8315 }, { "epoch": 1.2392016681560918, "grad_norm": 1.1790635585784912, "learning_rate": 3.09763181411975e-05, "loss": 0.674, "num_input_tokens_seen": 4824544, "step": 8320 }, { "epoch": 1.239946380697051, "grad_norm": 0.8987870812416077, "learning_rate": 3.0994935954721477e-05, "loss": 0.6527, "num_input_tokens_seen": 4827648, "step": 8325 }, { "epoch": 1.2406910932380102, "grad_norm": 0.9136849045753479, "learning_rate": 3.1013553768245455e-05, "loss": 0.5975, "num_input_tokens_seen": 4830336, "step": 8330 }, { "epoch": 1.2414358057789694, "grad_norm": 0.868553876876831, "learning_rate": 3.103217158176944e-05, "loss": 0.7924, "num_input_tokens_seen": 4833056, "step": 8335 }, { "epoch": 1.2421805183199286, "grad_norm": 0.8295322060585022, "learning_rate": 3.105078939529342e-05, "loss": 0.6665, "num_input_tokens_seen": 4836096, "step": 8340 }, { "epoch": 1.2429252308608878, "grad_norm": 0.7585185766220093, "learning_rate": 3.1069407208817396e-05, "loss": 0.7048, "num_input_tokens_seen": 4838912, "step": 8345 }, { "epoch": 1.243669943401847, "grad_norm": 1.1029759645462036, "learning_rate": 3.1088025022341374e-05, "loss": 0.8268, "num_input_tokens_seen": 4841728, "step": 8350 }, { "epoch": 1.244414655942806, "grad_norm": 0.6881669759750366, "learning_rate": 3.110664283586536e-05, "loss": 0.7845, "num_input_tokens_seen": 4844928, "step": 8355 }, { "epoch": 1.2451593684837652, "grad_norm": 1.1341925859451294, "learning_rate": 3.112526064938934e-05, "loss": 0.7763, "num_input_tokens_seen": 4848096, "step": 8360 }, { "epoch": 1.2459040810247244, "grad_norm": 0.6820374131202698, "learning_rate": 3.1143878462913316e-05, "loss": 0.7644, "num_input_tokens_seen": 4851296, "step": 8365 }, { "epoch": 1.2466487935656836, "grad_norm": 1.080929160118103, "learning_rate": 3.1162496276437294e-05, "loss": 0.7362, "num_input_tokens_seen": 4854208, "step": 8370 }, { "epoch": 1.2473935061066428, "grad_norm": 0.9775242805480957, "learning_rate": 3.118111408996128e-05, "loss": 0.7171, "num_input_tokens_seen": 4856896, "step": 8375 }, { "epoch": 1.248138218647602, "grad_norm": 0.7246795296669006, "learning_rate": 3.119973190348526e-05, "loss": 0.686, "num_input_tokens_seen": 4859744, "step": 8380 }, { "epoch": 1.2488829311885612, "grad_norm": 1.1424388885498047, "learning_rate": 3.1218349717009236e-05, "loss": 0.6887, "num_input_tokens_seen": 4862560, "step": 8385 }, { "epoch": 1.2496276437295204, "grad_norm": 0.9343826770782471, "learning_rate": 3.123696753053322e-05, "loss": 0.6538, "num_input_tokens_seen": 4865568, "step": 8390 }, { "epoch": 1.2503723562704796, "grad_norm": 0.7841506600379944, "learning_rate": 3.12555853440572e-05, "loss": 0.5458, "num_input_tokens_seen": 4868384, "step": 8395 }, { "epoch": 1.2511170688114388, "grad_norm": 0.956597089767456, "learning_rate": 3.127420315758118e-05, "loss": 0.6374, "num_input_tokens_seen": 4870912, "step": 8400 }, { "epoch": 1.251861781352398, "grad_norm": 1.046770453453064, "learning_rate": 3.1292820971105155e-05, "loss": 0.7122, "num_input_tokens_seen": 4873792, "step": 8405 }, { "epoch": 1.2526064938933572, "grad_norm": 0.8820906281471252, "learning_rate": 3.1311438784629134e-05, "loss": 0.644, "num_input_tokens_seen": 4876864, "step": 8410 }, { "epoch": 1.2533512064343164, "grad_norm": 0.8196119070053101, "learning_rate": 3.133005659815311e-05, "loss": 0.8516, "num_input_tokens_seen": 4879872, "step": 8415 }, { "epoch": 1.2540959189752756, "grad_norm": 0.9552655220031738, "learning_rate": 3.134867441167709e-05, "loss": 0.7209, "num_input_tokens_seen": 4882944, "step": 8420 }, { "epoch": 1.2548406315162346, "grad_norm": 0.6359178423881531, "learning_rate": 3.1367292225201075e-05, "loss": 0.6941, "num_input_tokens_seen": 4885952, "step": 8425 }, { "epoch": 1.2555853440571938, "grad_norm": 1.6631102561950684, "learning_rate": 3.138591003872505e-05, "loss": 0.8039, "num_input_tokens_seen": 4888768, "step": 8430 }, { "epoch": 1.256330056598153, "grad_norm": 0.5579536557197571, "learning_rate": 3.140452785224903e-05, "loss": 0.6987, "num_input_tokens_seen": 4891584, "step": 8435 }, { "epoch": 1.2570747691391122, "grad_norm": 1.1660658121109009, "learning_rate": 3.142314566577301e-05, "loss": 0.7284, "num_input_tokens_seen": 4894432, "step": 8440 }, { "epoch": 1.2578194816800714, "grad_norm": 1.0849817991256714, "learning_rate": 3.1441763479296995e-05, "loss": 0.7102, "num_input_tokens_seen": 4897152, "step": 8445 }, { "epoch": 1.2585641942210306, "grad_norm": 1.0290398597717285, "learning_rate": 3.146038129282097e-05, "loss": 0.7723, "num_input_tokens_seen": 4900224, "step": 8450 }, { "epoch": 1.2593089067619898, "grad_norm": 0.9202945828437805, "learning_rate": 3.147899910634495e-05, "loss": 0.7101, "num_input_tokens_seen": 4903072, "step": 8455 }, { "epoch": 1.260053619302949, "grad_norm": 0.8312545418739319, "learning_rate": 3.1497616919868936e-05, "loss": 0.7804, "num_input_tokens_seen": 4905792, "step": 8460 }, { "epoch": 1.2607983318439082, "grad_norm": 0.8061549067497253, "learning_rate": 3.1516234733392914e-05, "loss": 0.6885, "num_input_tokens_seen": 4908608, "step": 8465 }, { "epoch": 1.2615430443848674, "grad_norm": 0.737993597984314, "learning_rate": 3.153485254691689e-05, "loss": 0.8089, "num_input_tokens_seen": 4911648, "step": 8470 }, { "epoch": 1.2622877569258266, "grad_norm": 1.753991723060608, "learning_rate": 3.155347036044087e-05, "loss": 0.776, "num_input_tokens_seen": 4914816, "step": 8475 }, { "epoch": 1.2630324694667858, "grad_norm": 0.7662416696548462, "learning_rate": 3.1572088173964856e-05, "loss": 0.7707, "num_input_tokens_seen": 4917728, "step": 8480 }, { "epoch": 1.263777182007745, "grad_norm": 1.0001782178878784, "learning_rate": 3.1590705987488834e-05, "loss": 0.8087, "num_input_tokens_seen": 4920704, "step": 8485 }, { "epoch": 1.2645218945487042, "grad_norm": 1.601292610168457, "learning_rate": 3.1609323801012806e-05, "loss": 0.8006, "num_input_tokens_seen": 4923392, "step": 8490 }, { "epoch": 1.2652666070896634, "grad_norm": 1.122557282447815, "learning_rate": 3.162794161453679e-05, "loss": 0.7803, "num_input_tokens_seen": 4926432, "step": 8495 }, { "epoch": 1.2660113196306226, "grad_norm": 1.3651843070983887, "learning_rate": 3.164655942806077e-05, "loss": 0.7842, "num_input_tokens_seen": 4929600, "step": 8500 }, { "epoch": 1.2667560321715818, "grad_norm": 0.7718888521194458, "learning_rate": 3.166517724158475e-05, "loss": 0.7124, "num_input_tokens_seen": 4932288, "step": 8505 }, { "epoch": 1.267500744712541, "grad_norm": 0.6673718690872192, "learning_rate": 3.1683795055108725e-05, "loss": 0.6192, "num_input_tokens_seen": 4935168, "step": 8510 }, { "epoch": 1.2682454572535002, "grad_norm": 0.787990391254425, "learning_rate": 3.170241286863271e-05, "loss": 0.6933, "num_input_tokens_seen": 4937920, "step": 8515 }, { "epoch": 1.2689901697944594, "grad_norm": 0.8000136613845825, "learning_rate": 3.172103068215669e-05, "loss": 0.742, "num_input_tokens_seen": 4940992, "step": 8520 }, { "epoch": 1.2697348823354186, "grad_norm": 0.6323050260543823, "learning_rate": 3.173964849568067e-05, "loss": 0.8115, "num_input_tokens_seen": 4944128, "step": 8525 }, { "epoch": 1.2704795948763778, "grad_norm": 1.2004318237304688, "learning_rate": 3.1758266309204645e-05, "loss": 0.6737, "num_input_tokens_seen": 4947072, "step": 8530 }, { "epoch": 1.2712243074173368, "grad_norm": 0.6927597522735596, "learning_rate": 3.177688412272863e-05, "loss": 0.7735, "num_input_tokens_seen": 4949760, "step": 8535 }, { "epoch": 1.271969019958296, "grad_norm": 0.557817280292511, "learning_rate": 3.179550193625261e-05, "loss": 0.7657, "num_input_tokens_seen": 4952544, "step": 8540 }, { "epoch": 1.2727137324992552, "grad_norm": 1.3833402395248413, "learning_rate": 3.1814119749776586e-05, "loss": 0.7241, "num_input_tokens_seen": 4955520, "step": 8545 }, { "epoch": 1.2734584450402144, "grad_norm": 1.2404299974441528, "learning_rate": 3.183273756330057e-05, "loss": 0.8184, "num_input_tokens_seen": 4958272, "step": 8550 }, { "epoch": 1.2742031575811736, "grad_norm": 0.8931539058685303, "learning_rate": 3.185135537682455e-05, "loss": 0.6881, "num_input_tokens_seen": 4961280, "step": 8555 }, { "epoch": 1.2749478701221328, "grad_norm": 0.9277852773666382, "learning_rate": 3.186997319034853e-05, "loss": 0.7815, "num_input_tokens_seen": 4964032, "step": 8560 }, { "epoch": 1.275692582663092, "grad_norm": 0.9447735548019409, "learning_rate": 3.1888591003872506e-05, "loss": 0.6763, "num_input_tokens_seen": 4967008, "step": 8565 }, { "epoch": 1.2764372952040512, "grad_norm": 3.022714853286743, "learning_rate": 3.190720881739649e-05, "loss": 0.7875, "num_input_tokens_seen": 4969696, "step": 8570 }, { "epoch": 1.2771820077450105, "grad_norm": 1.5861282348632812, "learning_rate": 3.192582663092047e-05, "loss": 0.8348, "num_input_tokens_seen": 4973280, "step": 8575 }, { "epoch": 1.2779267202859697, "grad_norm": 0.8040211200714111, "learning_rate": 3.194444444444444e-05, "loss": 0.7127, "num_input_tokens_seen": 4976032, "step": 8580 }, { "epoch": 1.2786714328269289, "grad_norm": 0.8535705208778381, "learning_rate": 3.1963062257968426e-05, "loss": 0.6784, "num_input_tokens_seen": 4979168, "step": 8585 }, { "epoch": 1.279416145367888, "grad_norm": 0.7573575377464294, "learning_rate": 3.1981680071492404e-05, "loss": 0.7087, "num_input_tokens_seen": 4982112, "step": 8590 }, { "epoch": 1.2801608579088473, "grad_norm": 1.062549114227295, "learning_rate": 3.200029788501638e-05, "loss": 0.6906, "num_input_tokens_seen": 4984800, "step": 8595 }, { "epoch": 1.2809055704498062, "grad_norm": 0.7884287238121033, "learning_rate": 3.201891569854036e-05, "loss": 0.6999, "num_input_tokens_seen": 4987584, "step": 8600 }, { "epoch": 1.2816502829907654, "grad_norm": 1.1126959323883057, "learning_rate": 3.2037533512064346e-05, "loss": 0.7971, "num_input_tokens_seen": 4990592, "step": 8605 }, { "epoch": 1.2823949955317246, "grad_norm": 1.134619116783142, "learning_rate": 3.2056151325588324e-05, "loss": 0.7369, "num_input_tokens_seen": 4993696, "step": 8610 }, { "epoch": 1.2831397080726838, "grad_norm": 0.645961582660675, "learning_rate": 3.20747691391123e-05, "loss": 0.7344, "num_input_tokens_seen": 4996864, "step": 8615 }, { "epoch": 1.283884420613643, "grad_norm": 1.1363046169281006, "learning_rate": 3.209338695263629e-05, "loss": 0.7056, "num_input_tokens_seen": 4999744, "step": 8620 }, { "epoch": 1.2846291331546023, "grad_norm": 0.761626124382019, "learning_rate": 3.2112004766160265e-05, "loss": 0.6654, "num_input_tokens_seen": 5002656, "step": 8625 }, { "epoch": 1.2853738456955615, "grad_norm": 0.7727442383766174, "learning_rate": 3.2130622579684244e-05, "loss": 0.7202, "num_input_tokens_seen": 5005280, "step": 8630 }, { "epoch": 1.2861185582365207, "grad_norm": 0.6685166358947754, "learning_rate": 3.214924039320822e-05, "loss": 0.7798, "num_input_tokens_seen": 5008352, "step": 8635 }, { "epoch": 1.2868632707774799, "grad_norm": 0.8008362054824829, "learning_rate": 3.216785820673221e-05, "loss": 0.7535, "num_input_tokens_seen": 5011200, "step": 8640 }, { "epoch": 1.287607983318439, "grad_norm": 0.6237162947654724, "learning_rate": 3.2186476020256185e-05, "loss": 0.7122, "num_input_tokens_seen": 5014016, "step": 8645 }, { "epoch": 1.2883526958593983, "grad_norm": 0.8393365740776062, "learning_rate": 3.220509383378016e-05, "loss": 0.7264, "num_input_tokens_seen": 5016832, "step": 8650 }, { "epoch": 1.2890974084003575, "grad_norm": 0.9760174751281738, "learning_rate": 3.222371164730414e-05, "loss": 0.7677, "num_input_tokens_seen": 5019904, "step": 8655 }, { "epoch": 1.2898421209413167, "grad_norm": 1.36598539352417, "learning_rate": 3.2242329460828126e-05, "loss": 0.811, "num_input_tokens_seen": 5022784, "step": 8660 }, { "epoch": 1.2905868334822759, "grad_norm": 1.0592398643493652, "learning_rate": 3.22609472743521e-05, "loss": 0.6081, "num_input_tokens_seen": 5025696, "step": 8665 }, { "epoch": 1.291331546023235, "grad_norm": 0.6864753365516663, "learning_rate": 3.2279565087876076e-05, "loss": 0.6379, "num_input_tokens_seen": 5028704, "step": 8670 }, { "epoch": 1.2920762585641943, "grad_norm": 1.024445652961731, "learning_rate": 3.229818290140006e-05, "loss": 0.7271, "num_input_tokens_seen": 5031648, "step": 8675 }, { "epoch": 1.2928209711051535, "grad_norm": 0.862872838973999, "learning_rate": 3.231680071492404e-05, "loss": 0.6847, "num_input_tokens_seen": 5034752, "step": 8680 }, { "epoch": 1.2935656836461127, "grad_norm": 1.0960001945495605, "learning_rate": 3.233541852844802e-05, "loss": 0.6767, "num_input_tokens_seen": 5037568, "step": 8685 }, { "epoch": 1.2943103961870719, "grad_norm": 0.7143127918243408, "learning_rate": 3.2354036341972e-05, "loss": 0.7079, "num_input_tokens_seen": 5040000, "step": 8690 }, { "epoch": 1.295055108728031, "grad_norm": 1.0771416425704956, "learning_rate": 3.237265415549598e-05, "loss": 0.6472, "num_input_tokens_seen": 5042976, "step": 8695 }, { "epoch": 1.2957998212689903, "grad_norm": 0.8218916058540344, "learning_rate": 3.239127196901996e-05, "loss": 0.7546, "num_input_tokens_seen": 5045792, "step": 8700 }, { "epoch": 1.2965445338099495, "grad_norm": 0.9659327864646912, "learning_rate": 3.240988978254394e-05, "loss": 0.5979, "num_input_tokens_seen": 5048288, "step": 8705 }, { "epoch": 1.2972892463509085, "grad_norm": 1.4213950634002686, "learning_rate": 3.242850759606792e-05, "loss": 0.6784, "num_input_tokens_seen": 5051200, "step": 8710 }, { "epoch": 1.2980339588918677, "grad_norm": 1.360995888710022, "learning_rate": 3.24471254095919e-05, "loss": 0.7933, "num_input_tokens_seen": 5054048, "step": 8715 }, { "epoch": 1.2987786714328269, "grad_norm": 0.83002108335495, "learning_rate": 3.246574322311588e-05, "loss": 0.6065, "num_input_tokens_seen": 5056736, "step": 8720 }, { "epoch": 1.299523383973786, "grad_norm": 0.6063066124916077, "learning_rate": 3.248436103663986e-05, "loss": 0.8159, "num_input_tokens_seen": 5059936, "step": 8725 }, { "epoch": 1.3002680965147453, "grad_norm": 0.6713640093803406, "learning_rate": 3.250297885016384e-05, "loss": 0.6347, "num_input_tokens_seen": 5062688, "step": 8730 }, { "epoch": 1.3010128090557045, "grad_norm": 1.4960883855819702, "learning_rate": 3.252159666368782e-05, "loss": 0.7724, "num_input_tokens_seen": 5065792, "step": 8735 }, { "epoch": 1.3017575215966637, "grad_norm": 0.6820748448371887, "learning_rate": 3.25402144772118e-05, "loss": 0.688, "num_input_tokens_seen": 5068768, "step": 8740 }, { "epoch": 1.302502234137623, "grad_norm": 0.816655695438385, "learning_rate": 3.255883229073578e-05, "loss": 0.7809, "num_input_tokens_seen": 5071584, "step": 8745 }, { "epoch": 1.303246946678582, "grad_norm": 0.9760273098945618, "learning_rate": 3.2577450104259755e-05, "loss": 0.6604, "num_input_tokens_seen": 5074400, "step": 8750 }, { "epoch": 1.3039916592195413, "grad_norm": 0.8415826559066772, "learning_rate": 3.259606791778373e-05, "loss": 0.6671, "num_input_tokens_seen": 5077120, "step": 8755 }, { "epoch": 1.3047363717605005, "grad_norm": 1.7581150531768799, "learning_rate": 3.261468573130771e-05, "loss": 0.6843, "num_input_tokens_seen": 5079872, "step": 8760 }, { "epoch": 1.3054810843014597, "grad_norm": 1.2397174835205078, "learning_rate": 3.2633303544831696e-05, "loss": 0.6457, "num_input_tokens_seen": 5082720, "step": 8765 }, { "epoch": 1.306225796842419, "grad_norm": 1.694628119468689, "learning_rate": 3.2651921358355675e-05, "loss": 0.7613, "num_input_tokens_seen": 5085504, "step": 8770 }, { "epoch": 1.3069705093833779, "grad_norm": 0.8634694218635559, "learning_rate": 3.267053917187965e-05, "loss": 0.602, "num_input_tokens_seen": 5088000, "step": 8775 }, { "epoch": 1.307715221924337, "grad_norm": 2.5575757026672363, "learning_rate": 3.268915698540364e-05, "loss": 0.8827, "num_input_tokens_seen": 5090752, "step": 8780 }, { "epoch": 1.3084599344652963, "grad_norm": 0.8579412698745728, "learning_rate": 3.2707774798927616e-05, "loss": 0.7487, "num_input_tokens_seen": 5093440, "step": 8785 }, { "epoch": 1.3092046470062555, "grad_norm": 1.029982328414917, "learning_rate": 3.2726392612451594e-05, "loss": 0.7836, "num_input_tokens_seen": 5096288, "step": 8790 }, { "epoch": 1.3099493595472147, "grad_norm": 2.245513677597046, "learning_rate": 3.274501042597557e-05, "loss": 0.8253, "num_input_tokens_seen": 5099072, "step": 8795 }, { "epoch": 1.310694072088174, "grad_norm": 0.637413740158081, "learning_rate": 3.276362823949956e-05, "loss": 0.7721, "num_input_tokens_seen": 5102080, "step": 8800 }, { "epoch": 1.311438784629133, "grad_norm": 0.6720782518386841, "learning_rate": 3.2782246053023536e-05, "loss": 0.5895, "num_input_tokens_seen": 5105024, "step": 8805 }, { "epoch": 1.3121834971700923, "grad_norm": 1.1004369258880615, "learning_rate": 3.2800863866547514e-05, "loss": 0.7184, "num_input_tokens_seen": 5107904, "step": 8810 }, { "epoch": 1.3129282097110515, "grad_norm": 0.8584945797920227, "learning_rate": 3.281948168007149e-05, "loss": 0.6955, "num_input_tokens_seen": 5110880, "step": 8815 }, { "epoch": 1.3136729222520107, "grad_norm": 0.9360366463661194, "learning_rate": 3.283809949359548e-05, "loss": 0.7633, "num_input_tokens_seen": 5113472, "step": 8820 }, { "epoch": 1.31441763479297, "grad_norm": 1.2090662717819214, "learning_rate": 3.2856717307119456e-05, "loss": 0.754, "num_input_tokens_seen": 5116544, "step": 8825 }, { "epoch": 1.3151623473339291, "grad_norm": 0.6457202434539795, "learning_rate": 3.2875335120643434e-05, "loss": 0.5445, "num_input_tokens_seen": 5119136, "step": 8830 }, { "epoch": 1.3159070598748883, "grad_norm": 1.2379215955734253, "learning_rate": 3.289395293416741e-05, "loss": 0.7845, "num_input_tokens_seen": 5122016, "step": 8835 }, { "epoch": 1.3166517724158475, "grad_norm": 0.6860499978065491, "learning_rate": 3.291257074769139e-05, "loss": 0.6424, "num_input_tokens_seen": 5124960, "step": 8840 }, { "epoch": 1.3173964849568067, "grad_norm": 0.6191324591636658, "learning_rate": 3.293118856121537e-05, "loss": 0.7326, "num_input_tokens_seen": 5128000, "step": 8845 }, { "epoch": 1.318141197497766, "grad_norm": 0.9364475607872009, "learning_rate": 3.2949806374739354e-05, "loss": 0.6968, "num_input_tokens_seen": 5131008, "step": 8850 }, { "epoch": 1.3188859100387251, "grad_norm": 0.9825989007949829, "learning_rate": 3.296842418826333e-05, "loss": 0.935, "num_input_tokens_seen": 5133824, "step": 8855 }, { "epoch": 1.3196306225796843, "grad_norm": 0.8272316455841064, "learning_rate": 3.298704200178731e-05, "loss": 0.7275, "num_input_tokens_seen": 5136608, "step": 8860 }, { "epoch": 1.3203753351206435, "grad_norm": 0.6350182890892029, "learning_rate": 3.300565981531129e-05, "loss": 0.6259, "num_input_tokens_seen": 5139360, "step": 8865 }, { "epoch": 1.3211200476616027, "grad_norm": 0.7765825986862183, "learning_rate": 3.302427762883527e-05, "loss": 0.7373, "num_input_tokens_seen": 5142208, "step": 8870 }, { "epoch": 1.321864760202562, "grad_norm": 0.7266759276390076, "learning_rate": 3.304289544235925e-05, "loss": 0.6771, "num_input_tokens_seen": 5145280, "step": 8875 }, { "epoch": 1.322609472743521, "grad_norm": 0.5888853669166565, "learning_rate": 3.306151325588323e-05, "loss": 0.7639, "num_input_tokens_seen": 5148224, "step": 8880 }, { "epoch": 1.3233541852844801, "grad_norm": 0.7548285722732544, "learning_rate": 3.308013106940721e-05, "loss": 0.7908, "num_input_tokens_seen": 5150976, "step": 8885 }, { "epoch": 1.3240988978254393, "grad_norm": 0.7471370697021484, "learning_rate": 3.309874888293119e-05, "loss": 0.7991, "num_input_tokens_seen": 5153504, "step": 8890 }, { "epoch": 1.3248436103663985, "grad_norm": 0.6607269644737244, "learning_rate": 3.311736669645517e-05, "loss": 0.7852, "num_input_tokens_seen": 5156512, "step": 8895 }, { "epoch": 1.3255883229073577, "grad_norm": 0.8759587407112122, "learning_rate": 3.313598450997915e-05, "loss": 0.6785, "num_input_tokens_seen": 5159328, "step": 8900 }, { "epoch": 1.326333035448317, "grad_norm": 0.7344845533370972, "learning_rate": 3.3154602323503134e-05, "loss": 0.8029, "num_input_tokens_seen": 5162048, "step": 8905 }, { "epoch": 1.3270777479892761, "grad_norm": 0.8601431846618652, "learning_rate": 3.317322013702711e-05, "loss": 0.7047, "num_input_tokens_seen": 5165056, "step": 8910 }, { "epoch": 1.3278224605302353, "grad_norm": 0.9860232472419739, "learning_rate": 3.319183795055109e-05, "loss": 0.8115, "num_input_tokens_seen": 5168000, "step": 8915 }, { "epoch": 1.3285671730711945, "grad_norm": 1.1231833696365356, "learning_rate": 3.321045576407507e-05, "loss": 0.8218, "num_input_tokens_seen": 5170784, "step": 8920 }, { "epoch": 1.3293118856121537, "grad_norm": 1.0368880033493042, "learning_rate": 3.322907357759905e-05, "loss": 0.8254, "num_input_tokens_seen": 5174016, "step": 8925 }, { "epoch": 1.330056598153113, "grad_norm": 0.6849382519721985, "learning_rate": 3.3247691391123026e-05, "loss": 0.5785, "num_input_tokens_seen": 5177120, "step": 8930 }, { "epoch": 1.3308013106940721, "grad_norm": 0.9567258358001709, "learning_rate": 3.3266309204647004e-05, "loss": 0.7292, "num_input_tokens_seen": 5179904, "step": 8935 }, { "epoch": 1.3315460232350314, "grad_norm": 0.7997138500213623, "learning_rate": 3.328492701817099e-05, "loss": 0.7308, "num_input_tokens_seen": 5182912, "step": 8940 }, { "epoch": 1.3322907357759903, "grad_norm": 1.1671754121780396, "learning_rate": 3.330354483169497e-05, "loss": 0.8661, "num_input_tokens_seen": 5185888, "step": 8945 }, { "epoch": 1.3330354483169495, "grad_norm": 0.7824345827102661, "learning_rate": 3.3322162645218945e-05, "loss": 0.7029, "num_input_tokens_seen": 5188864, "step": 8950 }, { "epoch": 1.3337801608579087, "grad_norm": 1.2693955898284912, "learning_rate": 3.3340780458742924e-05, "loss": 0.6248, "num_input_tokens_seen": 5191392, "step": 8955 }, { "epoch": 1.334524873398868, "grad_norm": 1.0189307928085327, "learning_rate": 3.335939827226691e-05, "loss": 0.7733, "num_input_tokens_seen": 5193984, "step": 8960 }, { "epoch": 1.3352695859398271, "grad_norm": 1.2166210412979126, "learning_rate": 3.337801608579089e-05, "loss": 0.6453, "num_input_tokens_seen": 5196960, "step": 8965 }, { "epoch": 1.3360142984807863, "grad_norm": 1.3425567150115967, "learning_rate": 3.3396633899314865e-05, "loss": 0.6854, "num_input_tokens_seen": 5199776, "step": 8970 }, { "epoch": 1.3367590110217455, "grad_norm": 0.9233789443969727, "learning_rate": 3.341525171283884e-05, "loss": 0.6511, "num_input_tokens_seen": 5203040, "step": 8975 }, { "epoch": 1.3375037235627047, "grad_norm": 0.7734823226928711, "learning_rate": 3.343386952636283e-05, "loss": 0.5821, "num_input_tokens_seen": 5205952, "step": 8980 }, { "epoch": 1.338248436103664, "grad_norm": 0.6766423583030701, "learning_rate": 3.3452487339886806e-05, "loss": 0.6182, "num_input_tokens_seen": 5208768, "step": 8985 }, { "epoch": 1.3389931486446232, "grad_norm": 0.742243230342865, "learning_rate": 3.3471105153410785e-05, "loss": 0.745, "num_input_tokens_seen": 5211584, "step": 8990 }, { "epoch": 1.3397378611855824, "grad_norm": 1.067766547203064, "learning_rate": 3.348972296693477e-05, "loss": 0.8093, "num_input_tokens_seen": 5214752, "step": 8995 }, { "epoch": 1.3404825737265416, "grad_norm": 0.9580211639404297, "learning_rate": 3.350834078045875e-05, "loss": 0.6508, "num_input_tokens_seen": 5217536, "step": 9000 }, { "epoch": 1.3412272862675008, "grad_norm": 1.077825665473938, "learning_rate": 3.3526958593982726e-05, "loss": 0.7752, "num_input_tokens_seen": 5220480, "step": 9005 }, { "epoch": 1.34197199880846, "grad_norm": 1.1366422176361084, "learning_rate": 3.3545576407506704e-05, "loss": 0.7776, "num_input_tokens_seen": 5223520, "step": 9010 }, { "epoch": 1.3427167113494192, "grad_norm": 0.968633770942688, "learning_rate": 3.356419422103068e-05, "loss": 0.7656, "num_input_tokens_seen": 5227744, "step": 9015 }, { "epoch": 1.3434614238903784, "grad_norm": 0.9184970855712891, "learning_rate": 3.358281203455466e-05, "loss": 0.6852, "num_input_tokens_seen": 5230400, "step": 9020 }, { "epoch": 1.3442061364313376, "grad_norm": 0.9436644315719604, "learning_rate": 3.360142984807864e-05, "loss": 0.6858, "num_input_tokens_seen": 5233312, "step": 9025 }, { "epoch": 1.3449508489722968, "grad_norm": 0.9237022995948792, "learning_rate": 3.3620047661602624e-05, "loss": 0.7016, "num_input_tokens_seen": 5236256, "step": 9030 }, { "epoch": 1.345695561513256, "grad_norm": 0.8650490641593933, "learning_rate": 3.36386654751266e-05, "loss": 0.6275, "num_input_tokens_seen": 5239296, "step": 9035 }, { "epoch": 1.3464402740542152, "grad_norm": 0.6547936797142029, "learning_rate": 3.365728328865058e-05, "loss": 0.6629, "num_input_tokens_seen": 5242560, "step": 9040 }, { "epoch": 1.3471849865951744, "grad_norm": 1.0480014085769653, "learning_rate": 3.367590110217456e-05, "loss": 0.669, "num_input_tokens_seen": 5245408, "step": 9045 }, { "epoch": 1.3479296991361336, "grad_norm": 1.040092945098877, "learning_rate": 3.3694518915698544e-05, "loss": 0.6988, "num_input_tokens_seen": 5248448, "step": 9050 }, { "epoch": 1.3486744116770926, "grad_norm": 0.922480583190918, "learning_rate": 3.371313672922252e-05, "loss": 0.6361, "num_input_tokens_seen": 5251616, "step": 9055 }, { "epoch": 1.3494191242180518, "grad_norm": 0.7459295392036438, "learning_rate": 3.37317545427465e-05, "loss": 0.7602, "num_input_tokens_seen": 5254752, "step": 9060 }, { "epoch": 1.350163836759011, "grad_norm": 2.199507474899292, "learning_rate": 3.3750372356270485e-05, "loss": 0.8296, "num_input_tokens_seen": 5257696, "step": 9065 }, { "epoch": 1.3509085492999702, "grad_norm": 0.8046471476554871, "learning_rate": 3.3768990169794464e-05, "loss": 0.8138, "num_input_tokens_seen": 5260800, "step": 9070 }, { "epoch": 1.3516532618409294, "grad_norm": 1.0010186433792114, "learning_rate": 3.378760798331844e-05, "loss": 0.6789, "num_input_tokens_seen": 5263488, "step": 9075 }, { "epoch": 1.3523979743818886, "grad_norm": 0.8622665405273438, "learning_rate": 3.380622579684242e-05, "loss": 0.6743, "num_input_tokens_seen": 5266272, "step": 9080 }, { "epoch": 1.3531426869228478, "grad_norm": 0.7424997687339783, "learning_rate": 3.3824843610366405e-05, "loss": 0.5654, "num_input_tokens_seen": 5269248, "step": 9085 }, { "epoch": 1.353887399463807, "grad_norm": 0.8696985244750977, "learning_rate": 3.384346142389038e-05, "loss": 0.751, "num_input_tokens_seen": 5272448, "step": 9090 }, { "epoch": 1.3546321120047662, "grad_norm": 1.0497126579284668, "learning_rate": 3.3862079237414355e-05, "loss": 0.7389, "num_input_tokens_seen": 5275200, "step": 9095 }, { "epoch": 1.3553768245457254, "grad_norm": 1.238069772720337, "learning_rate": 3.388069705093834e-05, "loss": 0.6043, "num_input_tokens_seen": 5277984, "step": 9100 }, { "epoch": 1.3561215370866846, "grad_norm": 0.7397809624671936, "learning_rate": 3.389931486446232e-05, "loss": 0.6662, "num_input_tokens_seen": 5281024, "step": 9105 }, { "epoch": 1.3568662496276438, "grad_norm": 1.8584342002868652, "learning_rate": 3.3917932677986296e-05, "loss": 0.6867, "num_input_tokens_seen": 5284000, "step": 9110 }, { "epoch": 1.357610962168603, "grad_norm": 0.6425997018814087, "learning_rate": 3.3936550491510274e-05, "loss": 0.77, "num_input_tokens_seen": 5286624, "step": 9115 }, { "epoch": 1.358355674709562, "grad_norm": 0.9872860312461853, "learning_rate": 3.395516830503426e-05, "loss": 0.6398, "num_input_tokens_seen": 5289408, "step": 9120 }, { "epoch": 1.3591003872505212, "grad_norm": 1.8789619207382202, "learning_rate": 3.397378611855824e-05, "loss": 0.6605, "num_input_tokens_seen": 5292480, "step": 9125 }, { "epoch": 1.3598450997914804, "grad_norm": 1.1344140768051147, "learning_rate": 3.3992403932082216e-05, "loss": 0.6875, "num_input_tokens_seen": 5295776, "step": 9130 }, { "epoch": 1.3605898123324396, "grad_norm": 2.104459047317505, "learning_rate": 3.4011021745606194e-05, "loss": 0.7947, "num_input_tokens_seen": 5298400, "step": 9135 }, { "epoch": 1.3613345248733988, "grad_norm": 0.7639133930206299, "learning_rate": 3.402963955913018e-05, "loss": 0.6925, "num_input_tokens_seen": 5301152, "step": 9140 }, { "epoch": 1.362079237414358, "grad_norm": 0.802121102809906, "learning_rate": 3.404825737265416e-05, "loss": 0.6685, "num_input_tokens_seen": 5304480, "step": 9145 }, { "epoch": 1.3628239499553172, "grad_norm": 0.774797260761261, "learning_rate": 3.4066875186178136e-05, "loss": 0.6395, "num_input_tokens_seen": 5307520, "step": 9150 }, { "epoch": 1.3635686624962764, "grad_norm": 1.2304151058197021, "learning_rate": 3.408549299970212e-05, "loss": 0.8365, "num_input_tokens_seen": 5310240, "step": 9155 }, { "epoch": 1.3643133750372356, "grad_norm": 1.0577577352523804, "learning_rate": 3.41041108132261e-05, "loss": 0.7306, "num_input_tokens_seen": 5312928, "step": 9160 }, { "epoch": 1.3650580875781948, "grad_norm": 0.7859435677528381, "learning_rate": 3.412272862675008e-05, "loss": 0.6881, "num_input_tokens_seen": 5316000, "step": 9165 }, { "epoch": 1.365802800119154, "grad_norm": 1.050953984260559, "learning_rate": 3.4141346440274055e-05, "loss": 0.8313, "num_input_tokens_seen": 5319232, "step": 9170 }, { "epoch": 1.3665475126601132, "grad_norm": 1.0523406267166138, "learning_rate": 3.415996425379804e-05, "loss": 0.6075, "num_input_tokens_seen": 5322080, "step": 9175 }, { "epoch": 1.3672922252010724, "grad_norm": 1.4394491910934448, "learning_rate": 3.417858206732202e-05, "loss": 0.7855, "num_input_tokens_seen": 5324928, "step": 9180 }, { "epoch": 1.3680369377420316, "grad_norm": 0.828524112701416, "learning_rate": 3.419719988084599e-05, "loss": 0.7184, "num_input_tokens_seen": 5327744, "step": 9185 }, { "epoch": 1.3687816502829908, "grad_norm": 1.334468126296997, "learning_rate": 3.4215817694369975e-05, "loss": 0.7409, "num_input_tokens_seen": 5330400, "step": 9190 }, { "epoch": 1.36952636282395, "grad_norm": 0.7038580775260925, "learning_rate": 3.423443550789395e-05, "loss": 0.6818, "num_input_tokens_seen": 5333248, "step": 9195 }, { "epoch": 1.3702710753649092, "grad_norm": 1.2533273696899414, "learning_rate": 3.425305332141793e-05, "loss": 0.6132, "num_input_tokens_seen": 5336064, "step": 9200 }, { "epoch": 1.3710157879058684, "grad_norm": 0.722046971321106, "learning_rate": 3.427167113494191e-05, "loss": 0.7313, "num_input_tokens_seen": 5339072, "step": 9205 }, { "epoch": 1.3717605004468276, "grad_norm": 1.4416712522506714, "learning_rate": 3.4290288948465895e-05, "loss": 0.6912, "num_input_tokens_seen": 5342016, "step": 9210 }, { "epoch": 1.3725052129877868, "grad_norm": 0.7703729867935181, "learning_rate": 3.430890676198987e-05, "loss": 0.5558, "num_input_tokens_seen": 5344896, "step": 9215 }, { "epoch": 1.373249925528746, "grad_norm": 0.7001887559890747, "learning_rate": 3.432752457551385e-05, "loss": 0.7973, "num_input_tokens_seen": 5347648, "step": 9220 }, { "epoch": 1.3739946380697052, "grad_norm": 0.9209387302398682, "learning_rate": 3.4346142389037836e-05, "loss": 0.6349, "num_input_tokens_seen": 5350624, "step": 9225 }, { "epoch": 1.3747393506106642, "grad_norm": 0.6113775372505188, "learning_rate": 3.4364760202561814e-05, "loss": 0.6786, "num_input_tokens_seen": 5353728, "step": 9230 }, { "epoch": 1.3754840631516234, "grad_norm": 1.2404898405075073, "learning_rate": 3.438337801608579e-05, "loss": 0.6316, "num_input_tokens_seen": 5356640, "step": 9235 }, { "epoch": 1.3762287756925826, "grad_norm": 0.7591961622238159, "learning_rate": 3.440199582960977e-05, "loss": 0.8216, "num_input_tokens_seen": 5359552, "step": 9240 }, { "epoch": 1.3769734882335418, "grad_norm": 0.9101645946502686, "learning_rate": 3.4420613643133756e-05, "loss": 0.6283, "num_input_tokens_seen": 5362528, "step": 9245 }, { "epoch": 1.377718200774501, "grad_norm": 0.7025670409202576, "learning_rate": 3.4439231456657734e-05, "loss": 0.7573, "num_input_tokens_seen": 5365600, "step": 9250 }, { "epoch": 1.3784629133154602, "grad_norm": 0.7376137375831604, "learning_rate": 3.445784927018171e-05, "loss": 0.6405, "num_input_tokens_seen": 5368288, "step": 9255 }, { "epoch": 1.3792076258564194, "grad_norm": 1.616059422492981, "learning_rate": 3.447646708370569e-05, "loss": 0.8114, "num_input_tokens_seen": 5371296, "step": 9260 }, { "epoch": 1.3799523383973786, "grad_norm": 0.7514680027961731, "learning_rate": 3.4495084897229676e-05, "loss": 0.7355, "num_input_tokens_seen": 5374240, "step": 9265 }, { "epoch": 1.3806970509383378, "grad_norm": 0.5747900009155273, "learning_rate": 3.451370271075365e-05, "loss": 0.7297, "num_input_tokens_seen": 5377312, "step": 9270 }, { "epoch": 1.381441763479297, "grad_norm": 1.0485899448394775, "learning_rate": 3.4532320524277625e-05, "loss": 0.716, "num_input_tokens_seen": 5380064, "step": 9275 }, { "epoch": 1.3821864760202562, "grad_norm": 0.9644586443901062, "learning_rate": 3.455093833780161e-05, "loss": 0.8234, "num_input_tokens_seen": 5383072, "step": 9280 }, { "epoch": 1.3829311885612154, "grad_norm": 0.8402571082115173, "learning_rate": 3.456955615132559e-05, "loss": 0.7623, "num_input_tokens_seen": 5385792, "step": 9285 }, { "epoch": 1.3836759011021746, "grad_norm": 0.9461374282836914, "learning_rate": 3.458817396484957e-05, "loss": 0.69, "num_input_tokens_seen": 5388640, "step": 9290 }, { "epoch": 1.3844206136431336, "grad_norm": 0.8712512254714966, "learning_rate": 3.460679177837355e-05, "loss": 0.6832, "num_input_tokens_seen": 5391360, "step": 9295 }, { "epoch": 1.3851653261840928, "grad_norm": 0.7098516821861267, "learning_rate": 3.462540959189753e-05, "loss": 0.707, "num_input_tokens_seen": 5394240, "step": 9300 }, { "epoch": 1.385910038725052, "grad_norm": 0.9988044500350952, "learning_rate": 3.464402740542151e-05, "loss": 0.7684, "num_input_tokens_seen": 5396960, "step": 9305 }, { "epoch": 1.3866547512660112, "grad_norm": 1.0344159603118896, "learning_rate": 3.4662645218945486e-05, "loss": 0.8049, "num_input_tokens_seen": 5399616, "step": 9310 }, { "epoch": 1.3873994638069704, "grad_norm": 1.4980590343475342, "learning_rate": 3.468126303246947e-05, "loss": 0.8046, "num_input_tokens_seen": 5402144, "step": 9315 }, { "epoch": 1.3881441763479296, "grad_norm": 0.7436476349830627, "learning_rate": 3.469988084599345e-05, "loss": 0.7146, "num_input_tokens_seen": 5404992, "step": 9320 }, { "epoch": 1.3888888888888888, "grad_norm": 0.8884595036506653, "learning_rate": 3.471849865951743e-05, "loss": 0.6933, "num_input_tokens_seen": 5407808, "step": 9325 }, { "epoch": 1.389633601429848, "grad_norm": 0.6924651861190796, "learning_rate": 3.4737116473041406e-05, "loss": 0.8215, "num_input_tokens_seen": 5410752, "step": 9330 }, { "epoch": 1.3903783139708072, "grad_norm": 0.9674221277236938, "learning_rate": 3.475573428656539e-05, "loss": 0.6361, "num_input_tokens_seen": 5413504, "step": 9335 }, { "epoch": 1.3911230265117664, "grad_norm": 0.6915837526321411, "learning_rate": 3.477435210008937e-05, "loss": 0.7331, "num_input_tokens_seen": 5416672, "step": 9340 }, { "epoch": 1.3918677390527256, "grad_norm": 0.8446016311645508, "learning_rate": 3.479296991361335e-05, "loss": 0.7802, "num_input_tokens_seen": 5419840, "step": 9345 }, { "epoch": 1.3926124515936849, "grad_norm": 1.3015910387039185, "learning_rate": 3.4811587727137326e-05, "loss": 0.8106, "num_input_tokens_seen": 5422624, "step": 9350 }, { "epoch": 1.393357164134644, "grad_norm": 0.96652752161026, "learning_rate": 3.4830205540661304e-05, "loss": 0.6992, "num_input_tokens_seen": 5425568, "step": 9355 }, { "epoch": 1.3941018766756033, "grad_norm": 1.0997551679611206, "learning_rate": 3.484882335418528e-05, "loss": 0.7675, "num_input_tokens_seen": 5428416, "step": 9360 }, { "epoch": 1.3948465892165625, "grad_norm": 0.7854184508323669, "learning_rate": 3.486744116770926e-05, "loss": 0.5277, "num_input_tokens_seen": 5431264, "step": 9365 }, { "epoch": 1.3955913017575217, "grad_norm": 0.942309558391571, "learning_rate": 3.4886058981233246e-05, "loss": 0.6989, "num_input_tokens_seen": 5434048, "step": 9370 }, { "epoch": 1.3963360142984809, "grad_norm": 0.5925226807594299, "learning_rate": 3.4904676794757224e-05, "loss": 0.8395, "num_input_tokens_seen": 5437056, "step": 9375 }, { "epoch": 1.39708072683944, "grad_norm": 1.1135061979293823, "learning_rate": 3.49232946082812e-05, "loss": 0.6925, "num_input_tokens_seen": 5439840, "step": 9380 }, { "epoch": 1.3978254393803993, "grad_norm": 0.7037380933761597, "learning_rate": 3.494191242180519e-05, "loss": 0.622, "num_input_tokens_seen": 5442816, "step": 9385 }, { "epoch": 1.3985701519213585, "grad_norm": 1.0484702587127686, "learning_rate": 3.4960530235329165e-05, "loss": 0.8125, "num_input_tokens_seen": 5445664, "step": 9390 }, { "epoch": 1.3993148644623177, "grad_norm": 0.9633852243423462, "learning_rate": 3.4979148048853143e-05, "loss": 0.8506, "num_input_tokens_seen": 5448704, "step": 9395 }, { "epoch": 1.4000595770032767, "grad_norm": 0.8537940979003906, "learning_rate": 3.499776586237712e-05, "loss": 0.7813, "num_input_tokens_seen": 5451648, "step": 9400 }, { "epoch": 1.4008042895442359, "grad_norm": 0.7464414834976196, "learning_rate": 3.501638367590111e-05, "loss": 0.7102, "num_input_tokens_seen": 5454624, "step": 9405 }, { "epoch": 1.401549002085195, "grad_norm": 0.9517822861671448, "learning_rate": 3.5035001489425085e-05, "loss": 0.685, "num_input_tokens_seen": 5457248, "step": 9410 }, { "epoch": 1.4022937146261543, "grad_norm": 0.5496989488601685, "learning_rate": 3.505361930294906e-05, "loss": 0.6838, "num_input_tokens_seen": 5460032, "step": 9415 }, { "epoch": 1.4030384271671135, "grad_norm": 0.49594929814338684, "learning_rate": 3.507223711647304e-05, "loss": 0.6526, "num_input_tokens_seen": 5462816, "step": 9420 }, { "epoch": 1.4037831397080727, "grad_norm": 1.1128376722335815, "learning_rate": 3.5090854929997026e-05, "loss": 0.7745, "num_input_tokens_seen": 5465632, "step": 9425 }, { "epoch": 1.4045278522490319, "grad_norm": 0.8217671513557434, "learning_rate": 3.5109472743521005e-05, "loss": 0.6898, "num_input_tokens_seen": 5468544, "step": 9430 }, { "epoch": 1.405272564789991, "grad_norm": 1.1354196071624756, "learning_rate": 3.512809055704498e-05, "loss": 0.7079, "num_input_tokens_seen": 5471360, "step": 9435 }, { "epoch": 1.4060172773309503, "grad_norm": 0.7460463047027588, "learning_rate": 3.514670837056897e-05, "loss": 0.6203, "num_input_tokens_seen": 5474080, "step": 9440 }, { "epoch": 1.4067619898719095, "grad_norm": 1.2705211639404297, "learning_rate": 3.516532618409294e-05, "loss": 0.7592, "num_input_tokens_seen": 5477152, "step": 9445 }, { "epoch": 1.4075067024128687, "grad_norm": 0.5930393934249878, "learning_rate": 3.518394399761692e-05, "loss": 0.5377, "num_input_tokens_seen": 5480064, "step": 9450 }, { "epoch": 1.4082514149538279, "grad_norm": 1.1594719886779785, "learning_rate": 3.52025618111409e-05, "loss": 0.8272, "num_input_tokens_seen": 5483136, "step": 9455 }, { "epoch": 1.408996127494787, "grad_norm": 0.7871811985969543, "learning_rate": 3.522117962466488e-05, "loss": 0.7383, "num_input_tokens_seen": 5486368, "step": 9460 }, { "epoch": 1.409740840035746, "grad_norm": 2.221949577331543, "learning_rate": 3.523979743818886e-05, "loss": 0.8628, "num_input_tokens_seen": 5489024, "step": 9465 }, { "epoch": 1.4104855525767053, "grad_norm": 0.8701172471046448, "learning_rate": 3.525841525171284e-05, "loss": 0.622, "num_input_tokens_seen": 5491616, "step": 9470 }, { "epoch": 1.4112302651176645, "grad_norm": 0.8853961229324341, "learning_rate": 3.527703306523682e-05, "loss": 0.7158, "num_input_tokens_seen": 5494464, "step": 9475 }, { "epoch": 1.4119749776586237, "grad_norm": 0.8392130732536316, "learning_rate": 3.52956508787608e-05, "loss": 0.6553, "num_input_tokens_seen": 5497376, "step": 9480 }, { "epoch": 1.4127196901995829, "grad_norm": 0.6061367392539978, "learning_rate": 3.531426869228478e-05, "loss": 0.7149, "num_input_tokens_seen": 5500128, "step": 9485 }, { "epoch": 1.413464402740542, "grad_norm": 0.8614204525947571, "learning_rate": 3.533288650580876e-05, "loss": 0.6563, "num_input_tokens_seen": 5503136, "step": 9490 }, { "epoch": 1.4142091152815013, "grad_norm": 1.1046890020370483, "learning_rate": 3.535150431933274e-05, "loss": 0.7657, "num_input_tokens_seen": 5506048, "step": 9495 }, { "epoch": 1.4149538278224605, "grad_norm": 0.8884354829788208, "learning_rate": 3.537012213285672e-05, "loss": 0.6504, "num_input_tokens_seen": 5508736, "step": 9500 }, { "epoch": 1.4156985403634197, "grad_norm": 0.5164870619773865, "learning_rate": 3.53887399463807e-05, "loss": 0.7434, "num_input_tokens_seen": 5511776, "step": 9505 }, { "epoch": 1.416443252904379, "grad_norm": 0.8140254616737366, "learning_rate": 3.5407357759904683e-05, "loss": 0.77, "num_input_tokens_seen": 5514848, "step": 9510 }, { "epoch": 1.417187965445338, "grad_norm": 0.7663798928260803, "learning_rate": 3.542597557342866e-05, "loss": 0.7141, "num_input_tokens_seen": 5517600, "step": 9515 }, { "epoch": 1.4179326779862973, "grad_norm": 0.7120854258537292, "learning_rate": 3.544459338695264e-05, "loss": 0.6228, "num_input_tokens_seen": 5520384, "step": 9520 }, { "epoch": 1.4186773905272565, "grad_norm": 0.5208451747894287, "learning_rate": 3.546321120047662e-05, "loss": 0.809, "num_input_tokens_seen": 5523392, "step": 9525 }, { "epoch": 1.4194221030682157, "grad_norm": 1.09670889377594, "learning_rate": 3.5481829014000596e-05, "loss": 0.7688, "num_input_tokens_seen": 5526496, "step": 9530 }, { "epoch": 1.420166815609175, "grad_norm": 1.5419038534164429, "learning_rate": 3.5500446827524575e-05, "loss": 0.8054, "num_input_tokens_seen": 5529600, "step": 9535 }, { "epoch": 1.420911528150134, "grad_norm": 0.7802807688713074, "learning_rate": 3.551906464104855e-05, "loss": 0.6552, "num_input_tokens_seen": 5532352, "step": 9540 }, { "epoch": 1.4216562406910933, "grad_norm": 0.6271047592163086, "learning_rate": 3.553768245457254e-05, "loss": 0.7763, "num_input_tokens_seen": 5535456, "step": 9545 }, { "epoch": 1.4224009532320525, "grad_norm": 1.0986765623092651, "learning_rate": 3.5556300268096516e-05, "loss": 0.6559, "num_input_tokens_seen": 5538464, "step": 9550 }, { "epoch": 1.4231456657730117, "grad_norm": 1.4137802124023438, "learning_rate": 3.5574918081620494e-05, "loss": 0.6756, "num_input_tokens_seen": 5541344, "step": 9555 }, { "epoch": 1.423890378313971, "grad_norm": 1.2545772790908813, "learning_rate": 3.559353589514447e-05, "loss": 0.8639, "num_input_tokens_seen": 5544320, "step": 9560 }, { "epoch": 1.4246350908549301, "grad_norm": 0.8862587213516235, "learning_rate": 3.561215370866846e-05, "loss": 0.625, "num_input_tokens_seen": 5547008, "step": 9565 }, { "epoch": 1.4253798033958893, "grad_norm": 1.1236193180084229, "learning_rate": 3.5630771522192436e-05, "loss": 0.6838, "num_input_tokens_seen": 5549920, "step": 9570 }, { "epoch": 1.4261245159368483, "grad_norm": 1.2271431684494019, "learning_rate": 3.5649389335716414e-05, "loss": 0.7397, "num_input_tokens_seen": 5552960, "step": 9575 }, { "epoch": 1.4268692284778075, "grad_norm": 0.9142069816589355, "learning_rate": 3.566800714924039e-05, "loss": 0.6263, "num_input_tokens_seen": 5555840, "step": 9580 }, { "epoch": 1.4276139410187667, "grad_norm": 1.0070195198059082, "learning_rate": 3.568662496276438e-05, "loss": 0.7055, "num_input_tokens_seen": 5558912, "step": 9585 }, { "epoch": 1.428358653559726, "grad_norm": 0.7515415549278259, "learning_rate": 3.5705242776288356e-05, "loss": 0.6951, "num_input_tokens_seen": 5561792, "step": 9590 }, { "epoch": 1.4291033661006851, "grad_norm": 2.0238418579101562, "learning_rate": 3.5723860589812334e-05, "loss": 0.9381, "num_input_tokens_seen": 5564480, "step": 9595 }, { "epoch": 1.4298480786416443, "grad_norm": 0.8340509533882141, "learning_rate": 3.574247840333632e-05, "loss": 0.6819, "num_input_tokens_seen": 5567264, "step": 9600 }, { "epoch": 1.4305927911826035, "grad_norm": 1.1310255527496338, "learning_rate": 3.57610962168603e-05, "loss": 0.7029, "num_input_tokens_seen": 5569984, "step": 9605 }, { "epoch": 1.4313375037235627, "grad_norm": 0.5589237809181213, "learning_rate": 3.5779714030384275e-05, "loss": 0.7453, "num_input_tokens_seen": 5573472, "step": 9610 }, { "epoch": 1.432082216264522, "grad_norm": 0.6369048953056335, "learning_rate": 3.5798331843908253e-05, "loss": 0.7061, "num_input_tokens_seen": 5576416, "step": 9615 }, { "epoch": 1.4328269288054811, "grad_norm": 0.6277434229850769, "learning_rate": 3.581694965743223e-05, "loss": 0.692, "num_input_tokens_seen": 5579584, "step": 9620 }, { "epoch": 1.4335716413464403, "grad_norm": 1.1266697645187378, "learning_rate": 3.583556747095621e-05, "loss": 0.6465, "num_input_tokens_seen": 5582464, "step": 9625 }, { "epoch": 1.4343163538873995, "grad_norm": 1.1916990280151367, "learning_rate": 3.585418528448019e-05, "loss": 0.6624, "num_input_tokens_seen": 5585056, "step": 9630 }, { "epoch": 1.4350610664283587, "grad_norm": 0.62572181224823, "learning_rate": 3.587280309800417e-05, "loss": 0.7901, "num_input_tokens_seen": 5587968, "step": 9635 }, { "epoch": 1.4358057789693177, "grad_norm": 0.8911981582641602, "learning_rate": 3.589142091152815e-05, "loss": 0.7003, "num_input_tokens_seen": 5591136, "step": 9640 }, { "epoch": 1.436550491510277, "grad_norm": 0.7616434097290039, "learning_rate": 3.591003872505213e-05, "loss": 0.7419, "num_input_tokens_seen": 5594016, "step": 9645 }, { "epoch": 1.4372952040512361, "grad_norm": 3.684175491333008, "learning_rate": 3.592865653857611e-05, "loss": 0.5684, "num_input_tokens_seen": 5596736, "step": 9650 }, { "epoch": 1.4380399165921953, "grad_norm": 1.3675458431243896, "learning_rate": 3.594727435210009e-05, "loss": 0.7775, "num_input_tokens_seen": 5599552, "step": 9655 }, { "epoch": 1.4387846291331545, "grad_norm": 0.6636415123939514, "learning_rate": 3.596589216562407e-05, "loss": 0.58, "num_input_tokens_seen": 5602752, "step": 9660 }, { "epoch": 1.4395293416741137, "grad_norm": 0.7854171395301819, "learning_rate": 3.598450997914805e-05, "loss": 0.7204, "num_input_tokens_seen": 5605568, "step": 9665 }, { "epoch": 1.440274054215073, "grad_norm": 0.8641152381896973, "learning_rate": 3.6003127792672034e-05, "loss": 0.6802, "num_input_tokens_seen": 5608448, "step": 9670 }, { "epoch": 1.4410187667560321, "grad_norm": 1.5383516550064087, "learning_rate": 3.602174560619601e-05, "loss": 0.775, "num_input_tokens_seen": 5611584, "step": 9675 }, { "epoch": 1.4417634792969913, "grad_norm": 0.9408877491950989, "learning_rate": 3.604036341971999e-05, "loss": 0.7868, "num_input_tokens_seen": 5614080, "step": 9680 }, { "epoch": 1.4425081918379505, "grad_norm": 1.1500248908996582, "learning_rate": 3.605898123324397e-05, "loss": 0.6372, "num_input_tokens_seen": 5617088, "step": 9685 }, { "epoch": 1.4432529043789097, "grad_norm": 0.6831284165382385, "learning_rate": 3.6077599046767954e-05, "loss": 0.6389, "num_input_tokens_seen": 5619872, "step": 9690 }, { "epoch": 1.443997616919869, "grad_norm": 0.6667928099632263, "learning_rate": 3.609621686029193e-05, "loss": 0.6216, "num_input_tokens_seen": 5622496, "step": 9695 }, { "epoch": 1.4447423294608281, "grad_norm": 0.6016054749488831, "learning_rate": 3.6114834673815904e-05, "loss": 0.6061, "num_input_tokens_seen": 5625408, "step": 9700 }, { "epoch": 1.4454870420017873, "grad_norm": 0.849748432636261, "learning_rate": 3.613345248733989e-05, "loss": 0.886, "num_input_tokens_seen": 5628064, "step": 9705 }, { "epoch": 1.4462317545427466, "grad_norm": 0.9492668509483337, "learning_rate": 3.615207030086387e-05, "loss": 0.6217, "num_input_tokens_seen": 5631104, "step": 9710 }, { "epoch": 1.4469764670837058, "grad_norm": 1.1999362707138062, "learning_rate": 3.6170688114387845e-05, "loss": 0.7612, "num_input_tokens_seen": 5633920, "step": 9715 }, { "epoch": 1.447721179624665, "grad_norm": 0.8355751037597656, "learning_rate": 3.6189305927911823e-05, "loss": 0.5933, "num_input_tokens_seen": 5636672, "step": 9720 }, { "epoch": 1.4484658921656242, "grad_norm": 1.761229157447815, "learning_rate": 3.620792374143581e-05, "loss": 0.8552, "num_input_tokens_seen": 5639616, "step": 9725 }, { "epoch": 1.4492106047065834, "grad_norm": 1.048133134841919, "learning_rate": 3.622654155495979e-05, "loss": 0.6679, "num_input_tokens_seen": 5642880, "step": 9730 }, { "epoch": 1.4499553172475426, "grad_norm": 0.7508726716041565, "learning_rate": 3.6245159368483765e-05, "loss": 0.6886, "num_input_tokens_seen": 5645728, "step": 9735 }, { "epoch": 1.4507000297885018, "grad_norm": 1.331323266029358, "learning_rate": 3.626377718200774e-05, "loss": 0.6739, "num_input_tokens_seen": 5648928, "step": 9740 }, { "epoch": 1.4514447423294607, "grad_norm": 0.9150112867355347, "learning_rate": 3.628239499553173e-05, "loss": 0.703, "num_input_tokens_seen": 5651552, "step": 9745 }, { "epoch": 1.45218945487042, "grad_norm": 0.8160333633422852, "learning_rate": 3.6301012809055706e-05, "loss": 0.6772, "num_input_tokens_seen": 5654496, "step": 9750 }, { "epoch": 1.4529341674113792, "grad_norm": 1.2088686227798462, "learning_rate": 3.6319630622579685e-05, "loss": 0.6725, "num_input_tokens_seen": 5657280, "step": 9755 }, { "epoch": 1.4536788799523384, "grad_norm": 1.0688018798828125, "learning_rate": 3.633824843610367e-05, "loss": 0.7027, "num_input_tokens_seen": 5659872, "step": 9760 }, { "epoch": 1.4544235924932976, "grad_norm": 0.9321742057800293, "learning_rate": 3.635686624962765e-05, "loss": 0.6215, "num_input_tokens_seen": 5662752, "step": 9765 }, { "epoch": 1.4551683050342568, "grad_norm": 0.9023200869560242, "learning_rate": 3.6375484063151626e-05, "loss": 0.6568, "num_input_tokens_seen": 5665568, "step": 9770 }, { "epoch": 1.455913017575216, "grad_norm": 0.8747884035110474, "learning_rate": 3.6394101876675604e-05, "loss": 0.7799, "num_input_tokens_seen": 5668448, "step": 9775 }, { "epoch": 1.4566577301161752, "grad_norm": 1.0003252029418945, "learning_rate": 3.641271969019959e-05, "loss": 0.6902, "num_input_tokens_seen": 5671200, "step": 9780 }, { "epoch": 1.4574024426571344, "grad_norm": 0.7857992053031921, "learning_rate": 3.643133750372357e-05, "loss": 0.6735, "num_input_tokens_seen": 5674144, "step": 9785 }, { "epoch": 1.4581471551980936, "grad_norm": 0.7789284586906433, "learning_rate": 3.644995531724754e-05, "loss": 0.556, "num_input_tokens_seen": 5677152, "step": 9790 }, { "epoch": 1.4588918677390528, "grad_norm": 0.6469395756721497, "learning_rate": 3.6468573130771524e-05, "loss": 0.7064, "num_input_tokens_seen": 5680448, "step": 9795 }, { "epoch": 1.459636580280012, "grad_norm": 0.8996594548225403, "learning_rate": 3.64871909442955e-05, "loss": 0.6848, "num_input_tokens_seen": 5683424, "step": 9800 }, { "epoch": 1.4603812928209712, "grad_norm": 1.1000062227249146, "learning_rate": 3.650580875781948e-05, "loss": 0.8067, "num_input_tokens_seen": 5686336, "step": 9805 }, { "epoch": 1.4611260053619302, "grad_norm": 1.5464965105056763, "learning_rate": 3.652442657134346e-05, "loss": 0.7397, "num_input_tokens_seen": 5689120, "step": 9810 }, { "epoch": 1.4618707179028894, "grad_norm": 0.8674699068069458, "learning_rate": 3.6543044384867444e-05, "loss": 0.7119, "num_input_tokens_seen": 5691648, "step": 9815 }, { "epoch": 1.4626154304438486, "grad_norm": 1.5484490394592285, "learning_rate": 3.656166219839142e-05, "loss": 0.8662, "num_input_tokens_seen": 5694368, "step": 9820 }, { "epoch": 1.4633601429848078, "grad_norm": 0.9075344204902649, "learning_rate": 3.65802800119154e-05, "loss": 0.567, "num_input_tokens_seen": 5697280, "step": 9825 }, { "epoch": 1.464104855525767, "grad_norm": 0.8769876956939697, "learning_rate": 3.6598897825439385e-05, "loss": 0.7158, "num_input_tokens_seen": 5700448, "step": 9830 }, { "epoch": 1.4648495680667262, "grad_norm": 1.250610113143921, "learning_rate": 3.6617515638963363e-05, "loss": 0.7669, "num_input_tokens_seen": 5703584, "step": 9835 }, { "epoch": 1.4655942806076854, "grad_norm": 1.2692878246307373, "learning_rate": 3.663613345248734e-05, "loss": 0.7877, "num_input_tokens_seen": 5706400, "step": 9840 }, { "epoch": 1.4663389931486446, "grad_norm": 0.8980467319488525, "learning_rate": 3.665475126601132e-05, "loss": 0.8265, "num_input_tokens_seen": 5709184, "step": 9845 }, { "epoch": 1.4670837056896038, "grad_norm": 0.9193889498710632, "learning_rate": 3.6673369079535305e-05, "loss": 0.5483, "num_input_tokens_seen": 5711712, "step": 9850 }, { "epoch": 1.467828418230563, "grad_norm": 0.8783222436904907, "learning_rate": 3.669198689305928e-05, "loss": 0.6494, "num_input_tokens_seen": 5714464, "step": 9855 }, { "epoch": 1.4685731307715222, "grad_norm": 1.323391318321228, "learning_rate": 3.671060470658326e-05, "loss": 0.6243, "num_input_tokens_seen": 5717216, "step": 9860 }, { "epoch": 1.4693178433124814, "grad_norm": 0.8622720241546631, "learning_rate": 3.672922252010724e-05, "loss": 0.7814, "num_input_tokens_seen": 5720608, "step": 9865 }, { "epoch": 1.4700625558534406, "grad_norm": 1.209223747253418, "learning_rate": 3.6747840333631225e-05, "loss": 0.7209, "num_input_tokens_seen": 5723488, "step": 9870 }, { "epoch": 1.4708072683943998, "grad_norm": 0.7939414381980896, "learning_rate": 3.6766458147155196e-05, "loss": 0.7915, "num_input_tokens_seen": 5726624, "step": 9875 }, { "epoch": 1.471551980935359, "grad_norm": 0.8672036528587341, "learning_rate": 3.6785075960679174e-05, "loss": 0.9418, "num_input_tokens_seen": 5729408, "step": 9880 }, { "epoch": 1.4722966934763182, "grad_norm": 0.6152815818786621, "learning_rate": 3.680369377420316e-05, "loss": 0.749, "num_input_tokens_seen": 5732352, "step": 9885 }, { "epoch": 1.4730414060172774, "grad_norm": 0.7354810237884521, "learning_rate": 3.682231158772714e-05, "loss": 0.8893, "num_input_tokens_seen": 5735072, "step": 9890 }, { "epoch": 1.4737861185582366, "grad_norm": 0.7883205413818359, "learning_rate": 3.6840929401251116e-05, "loss": 0.7102, "num_input_tokens_seen": 5738272, "step": 9895 }, { "epoch": 1.4745308310991958, "grad_norm": 0.6585739850997925, "learning_rate": 3.68595472147751e-05, "loss": 0.7349, "num_input_tokens_seen": 5741024, "step": 9900 }, { "epoch": 1.475275543640155, "grad_norm": 1.0019135475158691, "learning_rate": 3.687816502829908e-05, "loss": 0.6023, "num_input_tokens_seen": 5743968, "step": 9905 }, { "epoch": 1.4760202561811142, "grad_norm": 0.8599717617034912, "learning_rate": 3.689678284182306e-05, "loss": 0.6943, "num_input_tokens_seen": 5746816, "step": 9910 }, { "epoch": 1.4767649687220734, "grad_norm": 1.1337063312530518, "learning_rate": 3.6915400655347035e-05, "loss": 0.8179, "num_input_tokens_seen": 5749312, "step": 9915 }, { "epoch": 1.4775096812630324, "grad_norm": 1.024612545967102, "learning_rate": 3.693401846887102e-05, "loss": 0.6851, "num_input_tokens_seen": 5751776, "step": 9920 }, { "epoch": 1.4782543938039916, "grad_norm": 1.4113143682479858, "learning_rate": 3.6952636282395e-05, "loss": 0.6376, "num_input_tokens_seen": 5754432, "step": 9925 }, { "epoch": 1.4789991063449508, "grad_norm": 1.1565678119659424, "learning_rate": 3.697125409591898e-05, "loss": 0.7889, "num_input_tokens_seen": 5757152, "step": 9930 }, { "epoch": 1.47974381888591, "grad_norm": 1.0029274225234985, "learning_rate": 3.6989871909442955e-05, "loss": 0.7061, "num_input_tokens_seen": 5760096, "step": 9935 }, { "epoch": 1.4804885314268692, "grad_norm": 0.5785654783248901, "learning_rate": 3.700848972296694e-05, "loss": 0.6087, "num_input_tokens_seen": 5762880, "step": 9940 }, { "epoch": 1.4812332439678284, "grad_norm": 0.8740646839141846, "learning_rate": 3.702710753649092e-05, "loss": 0.7509, "num_input_tokens_seen": 5765696, "step": 9945 }, { "epoch": 1.4819779565087876, "grad_norm": 1.060072660446167, "learning_rate": 3.70457253500149e-05, "loss": 0.6514, "num_input_tokens_seen": 5768448, "step": 9950 }, { "epoch": 1.4827226690497468, "grad_norm": 1.0472489595413208, "learning_rate": 3.7064343163538875e-05, "loss": 1.0472, "num_input_tokens_seen": 5771520, "step": 9955 }, { "epoch": 1.483467381590706, "grad_norm": 1.2885123491287231, "learning_rate": 3.708296097706285e-05, "loss": 0.732, "num_input_tokens_seen": 5774400, "step": 9960 }, { "epoch": 1.4842120941316652, "grad_norm": 0.5678247809410095, "learning_rate": 3.710157879058683e-05, "loss": 0.7163, "num_input_tokens_seen": 5777216, "step": 9965 }, { "epoch": 1.4849568066726244, "grad_norm": 1.2353274822235107, "learning_rate": 3.712019660411081e-05, "loss": 0.8169, "num_input_tokens_seen": 5779744, "step": 9970 }, { "epoch": 1.4857015192135836, "grad_norm": 1.0804533958435059, "learning_rate": 3.7138814417634795e-05, "loss": 0.7905, "num_input_tokens_seen": 5782720, "step": 9975 }, { "epoch": 1.4864462317545428, "grad_norm": 0.5919594764709473, "learning_rate": 3.715743223115877e-05, "loss": 0.6737, "num_input_tokens_seen": 5785344, "step": 9980 }, { "epoch": 1.4871909442955018, "grad_norm": 0.855837345123291, "learning_rate": 3.717605004468275e-05, "loss": 0.5739, "num_input_tokens_seen": 5787936, "step": 9985 }, { "epoch": 1.487935656836461, "grad_norm": 0.9040362238883972, "learning_rate": 3.7194667858206736e-05, "loss": 0.6674, "num_input_tokens_seen": 5790880, "step": 9990 }, { "epoch": 1.4886803693774202, "grad_norm": 0.6797532439231873, "learning_rate": 3.7213285671730714e-05, "loss": 0.635, "num_input_tokens_seen": 5793920, "step": 9995 }, { "epoch": 1.4894250819183794, "grad_norm": 1.1839231252670288, "learning_rate": 3.723190348525469e-05, "loss": 0.5723, "num_input_tokens_seen": 5796768, "step": 10000 }, { "epoch": 1.4901697944593386, "grad_norm": 0.9190068244934082, "learning_rate": 3.725052129877867e-05, "loss": 0.5917, "num_input_tokens_seen": 5799520, "step": 10005 }, { "epoch": 1.4909145070002978, "grad_norm": 0.937599778175354, "learning_rate": 3.7269139112302656e-05, "loss": 0.7045, "num_input_tokens_seen": 5802336, "step": 10010 }, { "epoch": 1.491659219541257, "grad_norm": 0.864120602607727, "learning_rate": 3.7287756925826634e-05, "loss": 0.6575, "num_input_tokens_seen": 5805600, "step": 10015 }, { "epoch": 1.4924039320822162, "grad_norm": 0.8499029874801636, "learning_rate": 3.730637473935061e-05, "loss": 0.7298, "num_input_tokens_seen": 5808320, "step": 10020 }, { "epoch": 1.4931486446231754, "grad_norm": 1.0367316007614136, "learning_rate": 3.732499255287459e-05, "loss": 0.6868, "num_input_tokens_seen": 5811264, "step": 10025 }, { "epoch": 1.4938933571641346, "grad_norm": 0.9495744109153748, "learning_rate": 3.7343610366398575e-05, "loss": 0.6875, "num_input_tokens_seen": 5813920, "step": 10030 }, { "epoch": 1.4946380697050938, "grad_norm": 1.1574469804763794, "learning_rate": 3.7362228179922554e-05, "loss": 0.6957, "num_input_tokens_seen": 5817024, "step": 10035 }, { "epoch": 1.495382782246053, "grad_norm": 0.8666263818740845, "learning_rate": 3.738084599344653e-05, "loss": 0.7397, "num_input_tokens_seen": 5820000, "step": 10040 }, { "epoch": 1.4961274947870122, "grad_norm": 0.9895386695861816, "learning_rate": 3.739946380697052e-05, "loss": 0.5951, "num_input_tokens_seen": 5822784, "step": 10045 }, { "epoch": 1.4968722073279714, "grad_norm": 0.6465814113616943, "learning_rate": 3.741808162049449e-05, "loss": 0.7676, "num_input_tokens_seen": 5825568, "step": 10050 }, { "epoch": 1.4976169198689306, "grad_norm": 0.9215379357337952, "learning_rate": 3.743669943401847e-05, "loss": 0.7149, "num_input_tokens_seen": 5828288, "step": 10055 }, { "epoch": 1.4983616324098898, "grad_norm": 0.8673646450042725, "learning_rate": 3.745531724754245e-05, "loss": 0.6724, "num_input_tokens_seen": 5831360, "step": 10060 }, { "epoch": 1.499106344950849, "grad_norm": 0.9466986656188965, "learning_rate": 3.747393506106643e-05, "loss": 0.7293, "num_input_tokens_seen": 5834208, "step": 10065 }, { "epoch": 1.4998510574918082, "grad_norm": 0.5298372507095337, "learning_rate": 3.749255287459041e-05, "loss": 0.6503, "num_input_tokens_seen": 5837184, "step": 10070 }, { "epoch": 1.5005957700327675, "grad_norm": 1.0430958271026611, "learning_rate": 3.7511170688114386e-05, "loss": 0.6935, "num_input_tokens_seen": 5840096, "step": 10075 }, { "epoch": 1.5013404825737267, "grad_norm": 0.7146922945976257, "learning_rate": 3.752978850163837e-05, "loss": 0.7295, "num_input_tokens_seen": 5843584, "step": 10080 }, { "epoch": 1.5020851951146859, "grad_norm": 0.605655312538147, "learning_rate": 3.754840631516235e-05, "loss": 0.6968, "num_input_tokens_seen": 5846144, "step": 10085 }, { "epoch": 1.502829907655645, "grad_norm": 0.6858327388763428, "learning_rate": 3.756702412868633e-05, "loss": 0.6069, "num_input_tokens_seen": 5848960, "step": 10090 }, { "epoch": 1.5035746201966043, "grad_norm": 1.369692325592041, "learning_rate": 3.7585641942210306e-05, "loss": 0.7276, "num_input_tokens_seen": 5851744, "step": 10095 }, { "epoch": 1.5043193327375635, "grad_norm": 0.6732301712036133, "learning_rate": 3.760425975573429e-05, "loss": 0.7606, "num_input_tokens_seen": 5854848, "step": 10100 }, { "epoch": 1.5050640452785224, "grad_norm": 1.1423509120941162, "learning_rate": 3.762287756925827e-05, "loss": 0.7453, "num_input_tokens_seen": 5857792, "step": 10105 }, { "epoch": 1.5058087578194816, "grad_norm": 4.268044471740723, "learning_rate": 3.764149538278225e-05, "loss": 0.8703, "num_input_tokens_seen": 5860672, "step": 10110 }, { "epoch": 1.5065534703604408, "grad_norm": 0.8449310064315796, "learning_rate": 3.766011319630623e-05, "loss": 0.7737, "num_input_tokens_seen": 5863552, "step": 10115 }, { "epoch": 1.5072981829014, "grad_norm": 1.0839881896972656, "learning_rate": 3.767873100983021e-05, "loss": 0.7851, "num_input_tokens_seen": 5866592, "step": 10120 }, { "epoch": 1.5080428954423593, "grad_norm": 1.281050205230713, "learning_rate": 3.769734882335419e-05, "loss": 0.6919, "num_input_tokens_seen": 5869408, "step": 10125 }, { "epoch": 1.5087876079833185, "grad_norm": 0.7045502066612244, "learning_rate": 3.771596663687817e-05, "loss": 0.6238, "num_input_tokens_seen": 5872128, "step": 10130 }, { "epoch": 1.5095323205242777, "grad_norm": 0.8847864270210266, "learning_rate": 3.7734584450402145e-05, "loss": 0.8241, "num_input_tokens_seen": 5875104, "step": 10135 }, { "epoch": 1.5102770330652369, "grad_norm": 1.5825392007827759, "learning_rate": 3.7753202263926124e-05, "loss": 0.7916, "num_input_tokens_seen": 5878176, "step": 10140 }, { "epoch": 1.5110217456061958, "grad_norm": 0.736207902431488, "learning_rate": 3.77718200774501e-05, "loss": 0.8379, "num_input_tokens_seen": 5881088, "step": 10145 }, { "epoch": 1.511766458147155, "grad_norm": 0.8889463543891907, "learning_rate": 3.779043789097409e-05, "loss": 0.666, "num_input_tokens_seen": 5883968, "step": 10150 }, { "epoch": 1.5125111706881142, "grad_norm": 0.5934556722640991, "learning_rate": 3.7809055704498065e-05, "loss": 0.7282, "num_input_tokens_seen": 5887104, "step": 10155 }, { "epoch": 1.5132558832290735, "grad_norm": 0.8835089206695557, "learning_rate": 3.7827673518022043e-05, "loss": 0.7063, "num_input_tokens_seen": 5889952, "step": 10160 }, { "epoch": 1.5140005957700327, "grad_norm": 0.485615074634552, "learning_rate": 3.784629133154602e-05, "loss": 0.6891, "num_input_tokens_seen": 5892896, "step": 10165 }, { "epoch": 1.5147453083109919, "grad_norm": 0.6276874542236328, "learning_rate": 3.786490914507001e-05, "loss": 0.7518, "num_input_tokens_seen": 5895680, "step": 10170 }, { "epoch": 1.515490020851951, "grad_norm": 0.732821524143219, "learning_rate": 3.7883526958593985e-05, "loss": 0.7038, "num_input_tokens_seen": 5898560, "step": 10175 }, { "epoch": 1.5162347333929103, "grad_norm": 0.9901906847953796, "learning_rate": 3.790214477211796e-05, "loss": 0.6729, "num_input_tokens_seen": 5901696, "step": 10180 }, { "epoch": 1.5169794459338695, "grad_norm": 0.9103869199752808, "learning_rate": 3.792076258564194e-05, "loss": 0.7221, "num_input_tokens_seen": 5904384, "step": 10185 }, { "epoch": 1.5177241584748287, "grad_norm": 0.7918343544006348, "learning_rate": 3.7939380399165926e-05, "loss": 0.7499, "num_input_tokens_seen": 5907424, "step": 10190 }, { "epoch": 1.5184688710157879, "grad_norm": 0.7765290141105652, "learning_rate": 3.7957998212689905e-05, "loss": 0.7315, "num_input_tokens_seen": 5910464, "step": 10195 }, { "epoch": 1.519213583556747, "grad_norm": 0.9816589951515198, "learning_rate": 3.797661602621388e-05, "loss": 0.7164, "num_input_tokens_seen": 5913504, "step": 10200 }, { "epoch": 1.5199582960977063, "grad_norm": 1.070563554763794, "learning_rate": 3.799523383973787e-05, "loss": 0.6102, "num_input_tokens_seen": 5916544, "step": 10205 }, { "epoch": 1.5207030086386655, "grad_norm": 0.8462275862693787, "learning_rate": 3.8013851653261846e-05, "loss": 0.7073, "num_input_tokens_seen": 5919744, "step": 10210 }, { "epoch": 1.5214477211796247, "grad_norm": 1.0583395957946777, "learning_rate": 3.8032469466785824e-05, "loss": 0.6042, "num_input_tokens_seen": 5922656, "step": 10215 }, { "epoch": 1.5221924337205839, "grad_norm": 1.4125205278396606, "learning_rate": 3.80510872803098e-05, "loss": 0.7822, "num_input_tokens_seen": 5925376, "step": 10220 }, { "epoch": 1.522937146261543, "grad_norm": 0.7942560315132141, "learning_rate": 3.806970509383378e-05, "loss": 0.7729, "num_input_tokens_seen": 5928160, "step": 10225 }, { "epoch": 1.5236818588025023, "grad_norm": 0.6980563998222351, "learning_rate": 3.808832290735776e-05, "loss": 0.7126, "num_input_tokens_seen": 5930848, "step": 10230 }, { "epoch": 1.5244265713434615, "grad_norm": 0.8375157117843628, "learning_rate": 3.810694072088174e-05, "loss": 0.7311, "num_input_tokens_seen": 5933760, "step": 10235 }, { "epoch": 1.5251712838844207, "grad_norm": 1.2270840406417847, "learning_rate": 3.812555853440572e-05, "loss": 0.6872, "num_input_tokens_seen": 5936544, "step": 10240 }, { "epoch": 1.52591599642538, "grad_norm": 2.972437858581543, "learning_rate": 3.81441763479297e-05, "loss": 0.8407, "num_input_tokens_seen": 5939520, "step": 10245 }, { "epoch": 1.526660708966339, "grad_norm": 0.7091430425643921, "learning_rate": 3.816279416145368e-05, "loss": 0.7804, "num_input_tokens_seen": 5942528, "step": 10250 }, { "epoch": 1.5274054215072983, "grad_norm": 0.8208349943161011, "learning_rate": 3.818141197497766e-05, "loss": 0.8479, "num_input_tokens_seen": 5945888, "step": 10255 }, { "epoch": 1.5281501340482575, "grad_norm": 0.7125742435455322, "learning_rate": 3.820002978850164e-05, "loss": 0.7301, "num_input_tokens_seen": 5949088, "step": 10260 }, { "epoch": 1.5288948465892167, "grad_norm": 0.5384941697120667, "learning_rate": 3.821864760202562e-05, "loss": 0.6975, "num_input_tokens_seen": 5952064, "step": 10265 }, { "epoch": 1.529639559130176, "grad_norm": 0.8330973386764526, "learning_rate": 3.82372654155496e-05, "loss": 0.7095, "num_input_tokens_seen": 5954944, "step": 10270 }, { "epoch": 1.5303842716711349, "grad_norm": 0.6228506565093994, "learning_rate": 3.8255883229073583e-05, "loss": 0.703, "num_input_tokens_seen": 5957728, "step": 10275 }, { "epoch": 1.531128984212094, "grad_norm": 1.1746389865875244, "learning_rate": 3.827450104259756e-05, "loss": 0.8285, "num_input_tokens_seen": 5960736, "step": 10280 }, { "epoch": 1.5318736967530533, "grad_norm": 0.7693277597427368, "learning_rate": 3.829311885612154e-05, "loss": 0.7534, "num_input_tokens_seen": 5963584, "step": 10285 }, { "epoch": 1.5326184092940125, "grad_norm": 1.1318684816360474, "learning_rate": 3.831173666964552e-05, "loss": 0.6405, "num_input_tokens_seen": 5966432, "step": 10290 }, { "epoch": 1.5333631218349717, "grad_norm": 1.0099276304244995, "learning_rate": 3.83303544831695e-05, "loss": 0.7968, "num_input_tokens_seen": 5969248, "step": 10295 }, { "epoch": 1.534107834375931, "grad_norm": 1.2918059825897217, "learning_rate": 3.834897229669348e-05, "loss": 0.7395, "num_input_tokens_seen": 5972064, "step": 10300 }, { "epoch": 1.53485254691689, "grad_norm": 0.8516501188278198, "learning_rate": 3.836759011021746e-05, "loss": 0.7349, "num_input_tokens_seen": 5974976, "step": 10305 }, { "epoch": 1.5355972594578493, "grad_norm": 0.681607723236084, "learning_rate": 3.838620792374144e-05, "loss": 0.7132, "num_input_tokens_seen": 5977728, "step": 10310 }, { "epoch": 1.5363419719988085, "grad_norm": 0.5247889757156372, "learning_rate": 3.8404825737265416e-05, "loss": 0.6496, "num_input_tokens_seen": 5980512, "step": 10315 }, { "epoch": 1.5370866845397675, "grad_norm": 0.7435112595558167, "learning_rate": 3.8423443550789394e-05, "loss": 0.7374, "num_input_tokens_seen": 5983616, "step": 10320 }, { "epoch": 1.5378313970807267, "grad_norm": 0.8484004139900208, "learning_rate": 3.844206136431337e-05, "loss": 0.6466, "num_input_tokens_seen": 5986464, "step": 10325 }, { "epoch": 1.538576109621686, "grad_norm": 1.0347644090652466, "learning_rate": 3.846067917783736e-05, "loss": 0.7415, "num_input_tokens_seen": 5989632, "step": 10330 }, { "epoch": 1.539320822162645, "grad_norm": 0.7228525876998901, "learning_rate": 3.8479296991361336e-05, "loss": 0.7978, "num_input_tokens_seen": 5992736, "step": 10335 }, { "epoch": 1.5400655347036043, "grad_norm": 1.658846139907837, "learning_rate": 3.8497914804885314e-05, "loss": 0.7443, "num_input_tokens_seen": 5995424, "step": 10340 }, { "epoch": 1.5408102472445635, "grad_norm": 1.029503583908081, "learning_rate": 3.851653261840929e-05, "loss": 0.6523, "num_input_tokens_seen": 5998176, "step": 10345 }, { "epoch": 1.5415549597855227, "grad_norm": 0.7942522764205933, "learning_rate": 3.853515043193328e-05, "loss": 0.7291, "num_input_tokens_seen": 6001408, "step": 10350 }, { "epoch": 1.542299672326482, "grad_norm": 0.6994162797927856, "learning_rate": 3.8553768245457255e-05, "loss": 0.6939, "num_input_tokens_seen": 6004064, "step": 10355 }, { "epoch": 1.543044384867441, "grad_norm": 1.2827060222625732, "learning_rate": 3.8572386058981234e-05, "loss": 0.696, "num_input_tokens_seen": 6006944, "step": 10360 }, { "epoch": 1.5437890974084003, "grad_norm": 1.1598570346832275, "learning_rate": 3.859100387250522e-05, "loss": 0.7773, "num_input_tokens_seen": 6009984, "step": 10365 }, { "epoch": 1.5445338099493595, "grad_norm": 0.9756345748901367, "learning_rate": 3.86096216860292e-05, "loss": 0.7063, "num_input_tokens_seen": 6012864, "step": 10370 }, { "epoch": 1.5452785224903187, "grad_norm": 1.4597580432891846, "learning_rate": 3.8628239499553175e-05, "loss": 0.7395, "num_input_tokens_seen": 6015680, "step": 10375 }, { "epoch": 1.546023235031278, "grad_norm": 0.673011839389801, "learning_rate": 3.8646857313077153e-05, "loss": 0.713, "num_input_tokens_seen": 6018496, "step": 10380 }, { "epoch": 1.5467679475722371, "grad_norm": 0.8760353922843933, "learning_rate": 3.866547512660114e-05, "loss": 0.6246, "num_input_tokens_seen": 6021600, "step": 10385 }, { "epoch": 1.5475126601131963, "grad_norm": 1.0297751426696777, "learning_rate": 3.868409294012512e-05, "loss": 0.6066, "num_input_tokens_seen": 6024480, "step": 10390 }, { "epoch": 1.5482573726541555, "grad_norm": 0.7491142153739929, "learning_rate": 3.870271075364909e-05, "loss": 0.7166, "num_input_tokens_seen": 6027456, "step": 10395 }, { "epoch": 1.5490020851951147, "grad_norm": 0.7848002910614014, "learning_rate": 3.872132856717307e-05, "loss": 0.7182, "num_input_tokens_seen": 6030432, "step": 10400 }, { "epoch": 1.549746797736074, "grad_norm": 1.3091111183166504, "learning_rate": 3.873994638069705e-05, "loss": 0.6392, "num_input_tokens_seen": 6033376, "step": 10405 }, { "epoch": 1.5504915102770331, "grad_norm": 1.4057835340499878, "learning_rate": 3.875856419422103e-05, "loss": 0.6878, "num_input_tokens_seen": 6036256, "step": 10410 }, { "epoch": 1.5512362228179923, "grad_norm": 0.8361655473709106, "learning_rate": 3.877718200774501e-05, "loss": 0.729, "num_input_tokens_seen": 6039232, "step": 10415 }, { "epoch": 1.5519809353589515, "grad_norm": 0.8696774840354919, "learning_rate": 3.879579982126899e-05, "loss": 0.7191, "num_input_tokens_seen": 6042240, "step": 10420 }, { "epoch": 1.5527256478999107, "grad_norm": 0.881288468837738, "learning_rate": 3.881441763479297e-05, "loss": 0.6037, "num_input_tokens_seen": 6045024, "step": 10425 }, { "epoch": 1.55347036044087, "grad_norm": 1.0279490947723389, "learning_rate": 3.883303544831695e-05, "loss": 0.6775, "num_input_tokens_seen": 6047968, "step": 10430 }, { "epoch": 1.5542150729818291, "grad_norm": 0.6717802882194519, "learning_rate": 3.8851653261840934e-05, "loss": 0.715, "num_input_tokens_seen": 6051360, "step": 10435 }, { "epoch": 1.5549597855227884, "grad_norm": 0.9129284620285034, "learning_rate": 3.887027107536491e-05, "loss": 0.7614, "num_input_tokens_seen": 6054080, "step": 10440 }, { "epoch": 1.5557044980637476, "grad_norm": 0.8848389983177185, "learning_rate": 3.888888888888889e-05, "loss": 0.7277, "num_input_tokens_seen": 6056672, "step": 10445 }, { "epoch": 1.5564492106047065, "grad_norm": 0.9261996746063232, "learning_rate": 3.890750670241287e-05, "loss": 0.7258, "num_input_tokens_seen": 6059648, "step": 10450 }, { "epoch": 1.5571939231456657, "grad_norm": 0.7685874104499817, "learning_rate": 3.8926124515936854e-05, "loss": 0.8557, "num_input_tokens_seen": 6062496, "step": 10455 }, { "epoch": 1.557938635686625, "grad_norm": 0.8724159598350525, "learning_rate": 3.894474232946083e-05, "loss": 0.7635, "num_input_tokens_seen": 6065344, "step": 10460 }, { "epoch": 1.5586833482275841, "grad_norm": 0.9199586510658264, "learning_rate": 3.896336014298481e-05, "loss": 0.806, "num_input_tokens_seen": 6068064, "step": 10465 }, { "epoch": 1.5594280607685433, "grad_norm": 0.801498293876648, "learning_rate": 3.898197795650879e-05, "loss": 0.8751, "num_input_tokens_seen": 6070944, "step": 10470 }, { "epoch": 1.5601727733095025, "grad_norm": 0.9996899962425232, "learning_rate": 3.9000595770032774e-05, "loss": 0.673, "num_input_tokens_seen": 6073856, "step": 10475 }, { "epoch": 1.5609174858504618, "grad_norm": 0.7238507866859436, "learning_rate": 3.9019213583556745e-05, "loss": 0.687, "num_input_tokens_seen": 6076608, "step": 10480 }, { "epoch": 1.561662198391421, "grad_norm": 0.8109301328659058, "learning_rate": 3.903783139708072e-05, "loss": 0.7321, "num_input_tokens_seen": 6079424, "step": 10485 }, { "epoch": 1.5624069109323802, "grad_norm": 0.7175372242927551, "learning_rate": 3.905644921060471e-05, "loss": 0.6575, "num_input_tokens_seen": 6082432, "step": 10490 }, { "epoch": 1.5631516234733391, "grad_norm": 1.2234065532684326, "learning_rate": 3.907506702412869e-05, "loss": 0.7006, "num_input_tokens_seen": 6085472, "step": 10495 }, { "epoch": 1.5638963360142983, "grad_norm": 0.5569387078285217, "learning_rate": 3.9093684837652665e-05, "loss": 0.6982, "num_input_tokens_seen": 6088448, "step": 10500 }, { "epoch": 1.5646410485552575, "grad_norm": 0.8556857705116272, "learning_rate": 3.911230265117665e-05, "loss": 0.7739, "num_input_tokens_seen": 6091552, "step": 10505 }, { "epoch": 1.5653857610962167, "grad_norm": 0.9503722190856934, "learning_rate": 3.913092046470063e-05, "loss": 0.7686, "num_input_tokens_seen": 6094496, "step": 10510 }, { "epoch": 1.566130473637176, "grad_norm": 0.8896042704582214, "learning_rate": 3.9149538278224606e-05, "loss": 0.7, "num_input_tokens_seen": 6097248, "step": 10515 }, { "epoch": 1.5668751861781351, "grad_norm": 0.7056937217712402, "learning_rate": 3.9168156091748585e-05, "loss": 0.6275, "num_input_tokens_seen": 6100000, "step": 10520 }, { "epoch": 1.5676198987190944, "grad_norm": 1.8068411350250244, "learning_rate": 3.918677390527257e-05, "loss": 0.8245, "num_input_tokens_seen": 6103104, "step": 10525 }, { "epoch": 1.5683646112600536, "grad_norm": 1.1844693422317505, "learning_rate": 3.920539171879655e-05, "loss": 0.7238, "num_input_tokens_seen": 6105984, "step": 10530 }, { "epoch": 1.5691093238010128, "grad_norm": 1.3008359670639038, "learning_rate": 3.9224009532320526e-05, "loss": 0.7949, "num_input_tokens_seen": 6108736, "step": 10535 }, { "epoch": 1.569854036341972, "grad_norm": 0.7860806584358215, "learning_rate": 3.9242627345844504e-05, "loss": 0.599, "num_input_tokens_seen": 6111840, "step": 10540 }, { "epoch": 1.5705987488829312, "grad_norm": 0.8589115738868713, "learning_rate": 3.926124515936849e-05, "loss": 0.7187, "num_input_tokens_seen": 6114848, "step": 10545 }, { "epoch": 1.5713434614238904, "grad_norm": 1.3645187616348267, "learning_rate": 3.927986297289247e-05, "loss": 0.6608, "num_input_tokens_seen": 6117600, "step": 10550 }, { "epoch": 1.5720881739648496, "grad_norm": 0.8034226298332214, "learning_rate": 3.9298480786416446e-05, "loss": 0.7025, "num_input_tokens_seen": 6120160, "step": 10555 }, { "epoch": 1.5728328865058088, "grad_norm": 0.7212163209915161, "learning_rate": 3.9317098599940424e-05, "loss": 0.6726, "num_input_tokens_seen": 6122848, "step": 10560 }, { "epoch": 1.573577599046768, "grad_norm": 0.6353380680084229, "learning_rate": 3.933571641346441e-05, "loss": 0.6796, "num_input_tokens_seen": 6125728, "step": 10565 }, { "epoch": 1.5743223115877272, "grad_norm": 0.931990385055542, "learning_rate": 3.935433422698838e-05, "loss": 0.7651, "num_input_tokens_seen": 6128448, "step": 10570 }, { "epoch": 1.5750670241286864, "grad_norm": 0.7960891127586365, "learning_rate": 3.937295204051236e-05, "loss": 0.7455, "num_input_tokens_seen": 6131584, "step": 10575 }, { "epoch": 1.5758117366696456, "grad_norm": 0.7172731161117554, "learning_rate": 3.9391569854036344e-05, "loss": 0.5763, "num_input_tokens_seen": 6134400, "step": 10580 }, { "epoch": 1.5765564492106048, "grad_norm": 0.7876104712486267, "learning_rate": 3.941018766756032e-05, "loss": 0.6453, "num_input_tokens_seen": 6137056, "step": 10585 }, { "epoch": 1.577301161751564, "grad_norm": 0.7944631576538086, "learning_rate": 3.94288054810843e-05, "loss": 0.6359, "num_input_tokens_seen": 6139744, "step": 10590 }, { "epoch": 1.5780458742925232, "grad_norm": 1.024726152420044, "learning_rate": 3.9447423294608285e-05, "loss": 0.7124, "num_input_tokens_seen": 6142432, "step": 10595 }, { "epoch": 1.5787905868334824, "grad_norm": 1.0068610906600952, "learning_rate": 3.946604110813226e-05, "loss": 0.695, "num_input_tokens_seen": 6145248, "step": 10600 }, { "epoch": 1.5795352993744416, "grad_norm": 0.7720962166786194, "learning_rate": 3.948465892165624e-05, "loss": 0.6457, "num_input_tokens_seen": 6148256, "step": 10605 }, { "epoch": 1.5802800119154008, "grad_norm": 0.9712343811988831, "learning_rate": 3.950327673518022e-05, "loss": 0.7923, "num_input_tokens_seen": 6151232, "step": 10610 }, { "epoch": 1.58102472445636, "grad_norm": 0.7985485196113586, "learning_rate": 3.9521894548704205e-05, "loss": 0.6816, "num_input_tokens_seen": 6154144, "step": 10615 }, { "epoch": 1.5817694369973192, "grad_norm": 1.0996849536895752, "learning_rate": 3.954051236222818e-05, "loss": 0.8063, "num_input_tokens_seen": 6157216, "step": 10620 }, { "epoch": 1.5825141495382782, "grad_norm": 0.6288041472434998, "learning_rate": 3.955913017575216e-05, "loss": 0.6383, "num_input_tokens_seen": 6160128, "step": 10625 }, { "epoch": 1.5832588620792374, "grad_norm": 0.6605249047279358, "learning_rate": 3.957774798927614e-05, "loss": 0.7497, "num_input_tokens_seen": 6163040, "step": 10630 }, { "epoch": 1.5840035746201966, "grad_norm": 0.8997422456741333, "learning_rate": 3.9596365802800125e-05, "loss": 0.6718, "num_input_tokens_seen": 6165824, "step": 10635 }, { "epoch": 1.5847482871611558, "grad_norm": 0.9032168984413147, "learning_rate": 3.96149836163241e-05, "loss": 0.7204, "num_input_tokens_seen": 6168800, "step": 10640 }, { "epoch": 1.585492999702115, "grad_norm": 0.7068557739257812, "learning_rate": 3.963360142984808e-05, "loss": 0.7854, "num_input_tokens_seen": 6171648, "step": 10645 }, { "epoch": 1.5862377122430742, "grad_norm": 0.8391706943511963, "learning_rate": 3.9652219243372066e-05, "loss": 0.74, "num_input_tokens_seen": 6174688, "step": 10650 }, { "epoch": 1.5869824247840334, "grad_norm": 0.9075152277946472, "learning_rate": 3.967083705689604e-05, "loss": 0.6837, "num_input_tokens_seen": 6177664, "step": 10655 }, { "epoch": 1.5877271373249926, "grad_norm": 0.8024614453315735, "learning_rate": 3.9689454870420016e-05, "loss": 0.7034, "num_input_tokens_seen": 6180672, "step": 10660 }, { "epoch": 1.5884718498659516, "grad_norm": 0.8676065802574158, "learning_rate": 3.9708072683944e-05, "loss": 0.7452, "num_input_tokens_seen": 6183616, "step": 10665 }, { "epoch": 1.5892165624069108, "grad_norm": 0.9126086831092834, "learning_rate": 3.972669049746798e-05, "loss": 0.7032, "num_input_tokens_seen": 6186368, "step": 10670 }, { "epoch": 1.58996127494787, "grad_norm": 0.7788288593292236, "learning_rate": 3.974530831099196e-05, "loss": 0.726, "num_input_tokens_seen": 6189344, "step": 10675 }, { "epoch": 1.5907059874888292, "grad_norm": 1.0719140768051147, "learning_rate": 3.9763926124515935e-05, "loss": 0.6928, "num_input_tokens_seen": 6192128, "step": 10680 }, { "epoch": 1.5914507000297884, "grad_norm": 0.7218198180198669, "learning_rate": 3.978254393803992e-05, "loss": 0.7026, "num_input_tokens_seen": 6195264, "step": 10685 }, { "epoch": 1.5921954125707476, "grad_norm": 1.382651448249817, "learning_rate": 3.98011617515639e-05, "loss": 0.7143, "num_input_tokens_seen": 6197920, "step": 10690 }, { "epoch": 1.5929401251117068, "grad_norm": 0.9419246315956116, "learning_rate": 3.981977956508788e-05, "loss": 0.8687, "num_input_tokens_seen": 6201088, "step": 10695 }, { "epoch": 1.593684837652666, "grad_norm": 0.6937283277511597, "learning_rate": 3.9838397378611855e-05, "loss": 0.6533, "num_input_tokens_seen": 6204224, "step": 10700 }, { "epoch": 1.5944295501936252, "grad_norm": 0.8185203671455383, "learning_rate": 3.985701519213584e-05, "loss": 0.7006, "num_input_tokens_seen": 6207040, "step": 10705 }, { "epoch": 1.5951742627345844, "grad_norm": 0.8377928733825684, "learning_rate": 3.987563300565982e-05, "loss": 0.7036, "num_input_tokens_seen": 6209952, "step": 10710 }, { "epoch": 1.5959189752755436, "grad_norm": 1.0306326150894165, "learning_rate": 3.98942508191838e-05, "loss": 0.7417, "num_input_tokens_seen": 6212896, "step": 10715 }, { "epoch": 1.5966636878165028, "grad_norm": 0.9370775818824768, "learning_rate": 3.991286863270778e-05, "loss": 0.7299, "num_input_tokens_seen": 6216448, "step": 10720 }, { "epoch": 1.597408400357462, "grad_norm": 0.7445511221885681, "learning_rate": 3.993148644623176e-05, "loss": 0.7261, "num_input_tokens_seen": 6219296, "step": 10725 }, { "epoch": 1.5981531128984212, "grad_norm": 0.940771222114563, "learning_rate": 3.995010425975574e-05, "loss": 0.7414, "num_input_tokens_seen": 6222528, "step": 10730 }, { "epoch": 1.5988978254393804, "grad_norm": 0.9737014174461365, "learning_rate": 3.9968722073279716e-05, "loss": 0.7337, "num_input_tokens_seen": 6225184, "step": 10735 }, { "epoch": 1.5996425379803396, "grad_norm": 0.7421203851699829, "learning_rate": 3.9987339886803695e-05, "loss": 0.4809, "num_input_tokens_seen": 6227936, "step": 10740 }, { "epoch": 1.6003872505212988, "grad_norm": 0.6618388295173645, "learning_rate": 4.000595770032767e-05, "loss": 0.6159, "num_input_tokens_seen": 6230688, "step": 10745 }, { "epoch": 1.601131963062258, "grad_norm": 0.8983571529388428, "learning_rate": 4.002457551385165e-05, "loss": 0.7196, "num_input_tokens_seen": 6233632, "step": 10750 }, { "epoch": 1.6018766756032172, "grad_norm": 0.8577005863189697, "learning_rate": 4.0043193327375636e-05, "loss": 0.6632, "num_input_tokens_seen": 6236480, "step": 10755 }, { "epoch": 1.6026213881441764, "grad_norm": 0.8857523202896118, "learning_rate": 4.0061811140899614e-05, "loss": 0.7891, "num_input_tokens_seen": 6239456, "step": 10760 }, { "epoch": 1.6033661006851356, "grad_norm": 0.5823855400085449, "learning_rate": 4.008042895442359e-05, "loss": 0.6357, "num_input_tokens_seen": 6242368, "step": 10765 }, { "epoch": 1.6041108132260948, "grad_norm": 0.6798807978630066, "learning_rate": 4.009904676794757e-05, "loss": 0.6536, "num_input_tokens_seen": 6245152, "step": 10770 }, { "epoch": 1.604855525767054, "grad_norm": 0.9250585436820984, "learning_rate": 4.0117664581471556e-05, "loss": 0.7426, "num_input_tokens_seen": 6248032, "step": 10775 }, { "epoch": 1.6056002383080132, "grad_norm": 0.7825592756271362, "learning_rate": 4.0136282394995534e-05, "loss": 0.7102, "num_input_tokens_seen": 6251008, "step": 10780 }, { "epoch": 1.6063449508489724, "grad_norm": 0.9780833721160889, "learning_rate": 4.015490020851951e-05, "loss": 0.7164, "num_input_tokens_seen": 6254240, "step": 10785 }, { "epoch": 1.6070896633899316, "grad_norm": 0.9136550426483154, "learning_rate": 4.017351802204349e-05, "loss": 0.6685, "num_input_tokens_seen": 6257376, "step": 10790 }, { "epoch": 1.6078343759308906, "grad_norm": 1.0166282653808594, "learning_rate": 4.0192135835567475e-05, "loss": 0.7083, "num_input_tokens_seen": 6260608, "step": 10795 }, { "epoch": 1.6085790884718498, "grad_norm": 0.5620527267456055, "learning_rate": 4.0210753649091454e-05, "loss": 0.5691, "num_input_tokens_seen": 6263200, "step": 10800 }, { "epoch": 1.609323801012809, "grad_norm": 0.9230571389198303, "learning_rate": 4.022937146261543e-05, "loss": 0.5254, "num_input_tokens_seen": 6266208, "step": 10805 }, { "epoch": 1.6100685135537682, "grad_norm": 0.9161855578422546, "learning_rate": 4.024798927613942e-05, "loss": 0.6442, "num_input_tokens_seen": 6269024, "step": 10810 }, { "epoch": 1.6108132260947274, "grad_norm": 1.6423922777175903, "learning_rate": 4.0266607089663395e-05, "loss": 0.687, "num_input_tokens_seen": 6272096, "step": 10815 }, { "epoch": 1.6115579386356866, "grad_norm": 0.78879314661026, "learning_rate": 4.028522490318737e-05, "loss": 0.7153, "num_input_tokens_seen": 6274944, "step": 10820 }, { "epoch": 1.6123026511766458, "grad_norm": 0.892494797706604, "learning_rate": 4.030384271671135e-05, "loss": 0.8241, "num_input_tokens_seen": 6277856, "step": 10825 }, { "epoch": 1.613047363717605, "grad_norm": 1.8877686262130737, "learning_rate": 4.032246053023533e-05, "loss": 0.7548, "num_input_tokens_seen": 6280544, "step": 10830 }, { "epoch": 1.6137920762585642, "grad_norm": 1.054832935333252, "learning_rate": 4.034107834375931e-05, "loss": 0.762, "num_input_tokens_seen": 6283232, "step": 10835 }, { "epoch": 1.6145367887995232, "grad_norm": 0.8797935247421265, "learning_rate": 4.0359696157283286e-05, "loss": 0.6341, "num_input_tokens_seen": 6286144, "step": 10840 }, { "epoch": 1.6152815013404824, "grad_norm": 1.1592016220092773, "learning_rate": 4.037831397080727e-05, "loss": 0.7238, "num_input_tokens_seen": 6288864, "step": 10845 }, { "epoch": 1.6160262138814416, "grad_norm": 1.158250093460083, "learning_rate": 4.039693178433125e-05, "loss": 0.7799, "num_input_tokens_seen": 6291872, "step": 10850 }, { "epoch": 1.6167709264224008, "grad_norm": 1.0960499048233032, "learning_rate": 4.041554959785523e-05, "loss": 0.7093, "num_input_tokens_seen": 6294880, "step": 10855 }, { "epoch": 1.61751563896336, "grad_norm": 0.7898444533348083, "learning_rate": 4.0434167411379206e-05, "loss": 0.571, "num_input_tokens_seen": 6297824, "step": 10860 }, { "epoch": 1.6182603515043192, "grad_norm": 1.183056354522705, "learning_rate": 4.045278522490319e-05, "loss": 0.7498, "num_input_tokens_seen": 6300576, "step": 10865 }, { "epoch": 1.6190050640452784, "grad_norm": 0.7894687652587891, "learning_rate": 4.047140303842717e-05, "loss": 0.734, "num_input_tokens_seen": 6303616, "step": 10870 }, { "epoch": 1.6197497765862376, "grad_norm": 0.9787790775299072, "learning_rate": 4.049002085195115e-05, "loss": 0.7275, "num_input_tokens_seen": 6306592, "step": 10875 }, { "epoch": 1.6204944891271968, "grad_norm": 0.8067414164543152, "learning_rate": 4.050863866547513e-05, "loss": 0.7601, "num_input_tokens_seen": 6309536, "step": 10880 }, { "epoch": 1.621239201668156, "grad_norm": 0.8608449697494507, "learning_rate": 4.052725647899911e-05, "loss": 0.5788, "num_input_tokens_seen": 6312448, "step": 10885 }, { "epoch": 1.6219839142091153, "grad_norm": 0.7328158020973206, "learning_rate": 4.054587429252309e-05, "loss": 0.6719, "num_input_tokens_seen": 6315168, "step": 10890 }, { "epoch": 1.6227286267500745, "grad_norm": 0.7882453799247742, "learning_rate": 4.056449210604707e-05, "loss": 0.6564, "num_input_tokens_seen": 6317952, "step": 10895 }, { "epoch": 1.6234733392910337, "grad_norm": 2.7159180641174316, "learning_rate": 4.058310991957105e-05, "loss": 0.8449, "num_input_tokens_seen": 6320832, "step": 10900 }, { "epoch": 1.6242180518319929, "grad_norm": 0.6647311449050903, "learning_rate": 4.060172773309503e-05, "loss": 0.7431, "num_input_tokens_seen": 6323872, "step": 10905 }, { "epoch": 1.624962764372952, "grad_norm": 1.1537705659866333, "learning_rate": 4.062034554661901e-05, "loss": 0.7355, "num_input_tokens_seen": 6326560, "step": 10910 }, { "epoch": 1.6257074769139113, "grad_norm": 0.8141167163848877, "learning_rate": 4.063896336014299e-05, "loss": 0.7532, "num_input_tokens_seen": 6329504, "step": 10915 }, { "epoch": 1.6264521894548705, "grad_norm": 1.2242246866226196, "learning_rate": 4.0657581173666965e-05, "loss": 0.6246, "num_input_tokens_seen": 6332608, "step": 10920 }, { "epoch": 1.6271969019958297, "grad_norm": 0.8892673254013062, "learning_rate": 4.067619898719094e-05, "loss": 0.9475, "num_input_tokens_seen": 6335328, "step": 10925 }, { "epoch": 1.6279416145367889, "grad_norm": 1.0318500995635986, "learning_rate": 4.069481680071492e-05, "loss": 0.7548, "num_input_tokens_seen": 6338304, "step": 10930 }, { "epoch": 1.628686327077748, "grad_norm": 0.782722532749176, "learning_rate": 4.0713434614238907e-05, "loss": 0.6555, "num_input_tokens_seen": 6341632, "step": 10935 }, { "epoch": 1.6294310396187073, "grad_norm": 0.8217963576316833, "learning_rate": 4.0732052427762885e-05, "loss": 0.646, "num_input_tokens_seen": 6344672, "step": 10940 }, { "epoch": 1.6301757521596665, "grad_norm": 0.8435741066932678, "learning_rate": 4.075067024128686e-05, "loss": 0.6991, "num_input_tokens_seen": 6347520, "step": 10945 }, { "epoch": 1.6309204647006257, "grad_norm": 0.8713759183883667, "learning_rate": 4.076928805481084e-05, "loss": 0.8154, "num_input_tokens_seen": 6350080, "step": 10950 }, { "epoch": 1.6316651772415849, "grad_norm": 1.178591012954712, "learning_rate": 4.0787905868334826e-05, "loss": 0.7505, "num_input_tokens_seen": 6352768, "step": 10955 }, { "epoch": 1.632409889782544, "grad_norm": 0.9788990020751953, "learning_rate": 4.0806523681858805e-05, "loss": 0.7797, "num_input_tokens_seen": 6355776, "step": 10960 }, { "epoch": 1.6331546023235033, "grad_norm": 1.5406888723373413, "learning_rate": 4.082514149538278e-05, "loss": 0.7478, "num_input_tokens_seen": 6358496, "step": 10965 }, { "epoch": 1.6338993148644623, "grad_norm": 1.0569206476211548, "learning_rate": 4.084375930890677e-05, "loss": 0.5752, "num_input_tokens_seen": 6361408, "step": 10970 }, { "epoch": 1.6346440274054215, "grad_norm": 0.9128642082214355, "learning_rate": 4.0862377122430746e-05, "loss": 0.6786, "num_input_tokens_seen": 6364320, "step": 10975 }, { "epoch": 1.6353887399463807, "grad_norm": 0.9688376188278198, "learning_rate": 4.0880994935954724e-05, "loss": 0.6201, "num_input_tokens_seen": 6366944, "step": 10980 }, { "epoch": 1.6361334524873399, "grad_norm": 0.746889054775238, "learning_rate": 4.08996127494787e-05, "loss": 0.7145, "num_input_tokens_seen": 6369568, "step": 10985 }, { "epoch": 1.636878165028299, "grad_norm": 0.7091821432113647, "learning_rate": 4.091823056300269e-05, "loss": 0.6903, "num_input_tokens_seen": 6372320, "step": 10990 }, { "epoch": 1.6376228775692583, "grad_norm": 0.9158923029899597, "learning_rate": 4.0936848376526666e-05, "loss": 0.6303, "num_input_tokens_seen": 6375264, "step": 10995 }, { "epoch": 1.6383675901102175, "grad_norm": 0.7862308621406555, "learning_rate": 4.095546619005064e-05, "loss": 0.8322, "num_input_tokens_seen": 6378016, "step": 11000 }, { "epoch": 1.6391123026511767, "grad_norm": 0.9481078386306763, "learning_rate": 4.097408400357462e-05, "loss": 0.6571, "num_input_tokens_seen": 6380576, "step": 11005 }, { "epoch": 1.6398570151921357, "grad_norm": 0.7898963689804077, "learning_rate": 4.09927018170986e-05, "loss": 0.694, "num_input_tokens_seen": 6383200, "step": 11010 }, { "epoch": 1.6406017277330949, "grad_norm": 1.8276914358139038, "learning_rate": 4.101131963062258e-05, "loss": 0.7439, "num_input_tokens_seen": 6386016, "step": 11015 }, { "epoch": 1.641346440274054, "grad_norm": 0.6592919230461121, "learning_rate": 4.102993744414656e-05, "loss": 0.6654, "num_input_tokens_seen": 6388768, "step": 11020 }, { "epoch": 1.6420911528150133, "grad_norm": 0.8947256207466125, "learning_rate": 4.104855525767054e-05, "loss": 0.6308, "num_input_tokens_seen": 6391616, "step": 11025 }, { "epoch": 1.6428358653559725, "grad_norm": 0.9801621437072754, "learning_rate": 4.106717307119452e-05, "loss": 0.7265, "num_input_tokens_seen": 6394240, "step": 11030 }, { "epoch": 1.6435805778969317, "grad_norm": 0.7159548401832581, "learning_rate": 4.10857908847185e-05, "loss": 0.625, "num_input_tokens_seen": 6397312, "step": 11035 }, { "epoch": 1.6443252904378909, "grad_norm": 0.7681586146354675, "learning_rate": 4.110440869824248e-05, "loss": 0.6623, "num_input_tokens_seen": 6399904, "step": 11040 }, { "epoch": 1.64507000297885, "grad_norm": 0.9437589645385742, "learning_rate": 4.112302651176646e-05, "loss": 0.6756, "num_input_tokens_seen": 6402816, "step": 11045 }, { "epoch": 1.6458147155198093, "grad_norm": 0.7641797065734863, "learning_rate": 4.114164432529044e-05, "loss": 0.7717, "num_input_tokens_seen": 6405792, "step": 11050 }, { "epoch": 1.6465594280607685, "grad_norm": 1.006155252456665, "learning_rate": 4.116026213881442e-05, "loss": 0.9243, "num_input_tokens_seen": 6408512, "step": 11055 }, { "epoch": 1.6473041406017277, "grad_norm": 1.0151878595352173, "learning_rate": 4.11788799523384e-05, "loss": 0.7125, "num_input_tokens_seen": 6411744, "step": 11060 }, { "epoch": 1.648048853142687, "grad_norm": 2.004973888397217, "learning_rate": 4.119749776586238e-05, "loss": 0.8858, "num_input_tokens_seen": 6414720, "step": 11065 }, { "epoch": 1.648793565683646, "grad_norm": 1.1053441762924194, "learning_rate": 4.121611557938636e-05, "loss": 0.7349, "num_input_tokens_seen": 6417536, "step": 11070 }, { "epoch": 1.6495382782246053, "grad_norm": 2.0231075286865234, "learning_rate": 4.123473339291034e-05, "loss": 0.9266, "num_input_tokens_seen": 6420256, "step": 11075 }, { "epoch": 1.6502829907655645, "grad_norm": 0.8736856579780579, "learning_rate": 4.125335120643432e-05, "loss": 0.5706, "num_input_tokens_seen": 6423296, "step": 11080 }, { "epoch": 1.6510277033065237, "grad_norm": 0.5955716967582703, "learning_rate": 4.1271969019958294e-05, "loss": 0.7439, "num_input_tokens_seen": 6426080, "step": 11085 }, { "epoch": 1.651772415847483, "grad_norm": 0.9776118397712708, "learning_rate": 4.129058683348227e-05, "loss": 0.6026, "num_input_tokens_seen": 6428864, "step": 11090 }, { "epoch": 1.6525171283884421, "grad_norm": 0.9023988246917725, "learning_rate": 4.130920464700626e-05, "loss": 0.7369, "num_input_tokens_seen": 6431584, "step": 11095 }, { "epoch": 1.6532618409294013, "grad_norm": 0.4869314134120941, "learning_rate": 4.1327822460530236e-05, "loss": 0.6026, "num_input_tokens_seen": 6434496, "step": 11100 }, { "epoch": 1.6540065534703605, "grad_norm": 0.7954160571098328, "learning_rate": 4.1346440274054214e-05, "loss": 0.8629, "num_input_tokens_seen": 6437280, "step": 11105 }, { "epoch": 1.6547512660113197, "grad_norm": 0.894413411617279, "learning_rate": 4.13650580875782e-05, "loss": 0.6842, "num_input_tokens_seen": 6440160, "step": 11110 }, { "epoch": 1.655495978552279, "grad_norm": 0.8606266379356384, "learning_rate": 4.138367590110218e-05, "loss": 0.6682, "num_input_tokens_seen": 6443328, "step": 11115 }, { "epoch": 1.6562406910932381, "grad_norm": 0.8292482495307922, "learning_rate": 4.1402293714626155e-05, "loss": 0.8271, "num_input_tokens_seen": 6446112, "step": 11120 }, { "epoch": 1.6569854036341973, "grad_norm": 0.5053991675376892, "learning_rate": 4.1420911528150134e-05, "loss": 0.7069, "num_input_tokens_seen": 6449152, "step": 11125 }, { "epoch": 1.6577301161751565, "grad_norm": 0.8848035931587219, "learning_rate": 4.143952934167412e-05, "loss": 0.7572, "num_input_tokens_seen": 6451840, "step": 11130 }, { "epoch": 1.6584748287161157, "grad_norm": 0.6043880581855774, "learning_rate": 4.14581471551981e-05, "loss": 0.5743, "num_input_tokens_seen": 6454784, "step": 11135 }, { "epoch": 1.6592195412570747, "grad_norm": 0.9683564305305481, "learning_rate": 4.1476764968722075e-05, "loss": 0.5942, "num_input_tokens_seen": 6457728, "step": 11140 }, { "epoch": 1.659964253798034, "grad_norm": 0.7735332250595093, "learning_rate": 4.149538278224605e-05, "loss": 0.6463, "num_input_tokens_seen": 6460416, "step": 11145 }, { "epoch": 1.6607089663389931, "grad_norm": 1.3271211385726929, "learning_rate": 4.151400059577004e-05, "loss": 0.6941, "num_input_tokens_seen": 6463360, "step": 11150 }, { "epoch": 1.6614536788799523, "grad_norm": 0.7056975960731506, "learning_rate": 4.1532618409294017e-05, "loss": 0.6756, "num_input_tokens_seen": 6466144, "step": 11155 }, { "epoch": 1.6621983914209115, "grad_norm": 1.0704865455627441, "learning_rate": 4.1551236222817995e-05, "loss": 0.7591, "num_input_tokens_seen": 6469120, "step": 11160 }, { "epoch": 1.6629431039618707, "grad_norm": 0.6697418689727783, "learning_rate": 4.156985403634197e-05, "loss": 0.6431, "num_input_tokens_seen": 6472320, "step": 11165 }, { "epoch": 1.66368781650283, "grad_norm": 0.922017514705658, "learning_rate": 4.158847184986596e-05, "loss": 0.6129, "num_input_tokens_seen": 6475296, "step": 11170 }, { "epoch": 1.6644325290437891, "grad_norm": 0.9926599264144897, "learning_rate": 4.160708966338993e-05, "loss": 0.6639, "num_input_tokens_seen": 6478496, "step": 11175 }, { "epoch": 1.6651772415847483, "grad_norm": 1.0139349699020386, "learning_rate": 4.162570747691391e-05, "loss": 0.6123, "num_input_tokens_seen": 6481440, "step": 11180 }, { "epoch": 1.6659219541257073, "grad_norm": 0.6004346013069153, "learning_rate": 4.164432529043789e-05, "loss": 0.7205, "num_input_tokens_seen": 6484800, "step": 11185 }, { "epoch": 1.6666666666666665, "grad_norm": 0.863162100315094, "learning_rate": 4.166294310396187e-05, "loss": 0.8999, "num_input_tokens_seen": 6487488, "step": 11190 }, { "epoch": 1.6674113792076257, "grad_norm": 0.9692040085792542, "learning_rate": 4.168156091748585e-05, "loss": 0.8355, "num_input_tokens_seen": 6490048, "step": 11195 }, { "epoch": 1.668156091748585, "grad_norm": 0.8884361982345581, "learning_rate": 4.1700178731009834e-05, "loss": 0.7513, "num_input_tokens_seen": 6493088, "step": 11200 }, { "epoch": 1.6689008042895441, "grad_norm": 0.6791101694107056, "learning_rate": 4.171879654453381e-05, "loss": 0.5482, "num_input_tokens_seen": 6496128, "step": 11205 }, { "epoch": 1.6696455168305033, "grad_norm": 1.0588034391403198, "learning_rate": 4.173741435805779e-05, "loss": 0.7286, "num_input_tokens_seen": 6498976, "step": 11210 }, { "epoch": 1.6703902293714625, "grad_norm": 0.8706889152526855, "learning_rate": 4.175603217158177e-05, "loss": 0.6192, "num_input_tokens_seen": 6502208, "step": 11215 }, { "epoch": 1.6711349419124217, "grad_norm": 0.7756167054176331, "learning_rate": 4.1774649985105754e-05, "loss": 0.6302, "num_input_tokens_seen": 6505408, "step": 11220 }, { "epoch": 1.671879654453381, "grad_norm": 0.6950978636741638, "learning_rate": 4.179326779862973e-05, "loss": 0.7892, "num_input_tokens_seen": 6508416, "step": 11225 }, { "epoch": 1.6726243669943401, "grad_norm": 0.7115098237991333, "learning_rate": 4.181188561215371e-05, "loss": 0.692, "num_input_tokens_seen": 6511232, "step": 11230 }, { "epoch": 1.6733690795352993, "grad_norm": 1.734706163406372, "learning_rate": 4.183050342567769e-05, "loss": 0.6386, "num_input_tokens_seen": 6514112, "step": 11235 }, { "epoch": 1.6741137920762585, "grad_norm": 0.9057305455207825, "learning_rate": 4.1849121239201674e-05, "loss": 0.5272, "num_input_tokens_seen": 6516928, "step": 11240 }, { "epoch": 1.6748585046172177, "grad_norm": 1.3012608289718628, "learning_rate": 4.186773905272565e-05, "loss": 0.648, "num_input_tokens_seen": 6519616, "step": 11245 }, { "epoch": 1.675603217158177, "grad_norm": 1.00068199634552, "learning_rate": 4.188635686624963e-05, "loss": 0.6901, "num_input_tokens_seen": 6522624, "step": 11250 }, { "epoch": 1.6763479296991362, "grad_norm": 1.235105037689209, "learning_rate": 4.1904974679773615e-05, "loss": 0.6937, "num_input_tokens_seen": 6525312, "step": 11255 }, { "epoch": 1.6770926422400954, "grad_norm": 0.9378980398178101, "learning_rate": 4.1923592493297587e-05, "loss": 0.6842, "num_input_tokens_seen": 6528320, "step": 11260 }, { "epoch": 1.6778373547810546, "grad_norm": 1.2013193368911743, "learning_rate": 4.1942210306821565e-05, "loss": 0.7128, "num_input_tokens_seen": 6531168, "step": 11265 }, { "epoch": 1.6785820673220138, "grad_norm": 1.185232162475586, "learning_rate": 4.196082812034555e-05, "loss": 0.8486, "num_input_tokens_seen": 6534208, "step": 11270 }, { "epoch": 1.679326779862973, "grad_norm": 0.6617110371589661, "learning_rate": 4.197944593386953e-05, "loss": 0.7544, "num_input_tokens_seen": 6536896, "step": 11275 }, { "epoch": 1.6800714924039322, "grad_norm": 1.0665409564971924, "learning_rate": 4.1998063747393506e-05, "loss": 0.7719, "num_input_tokens_seen": 6539744, "step": 11280 }, { "epoch": 1.6808162049448914, "grad_norm": 0.5931365489959717, "learning_rate": 4.2016681560917485e-05, "loss": 0.6246, "num_input_tokens_seen": 6542336, "step": 11285 }, { "epoch": 1.6815609174858506, "grad_norm": 0.757443368434906, "learning_rate": 4.203529937444147e-05, "loss": 0.7293, "num_input_tokens_seen": 6545632, "step": 11290 }, { "epoch": 1.6823056300268098, "grad_norm": 0.9115412831306458, "learning_rate": 4.205391718796545e-05, "loss": 0.6756, "num_input_tokens_seen": 6548352, "step": 11295 }, { "epoch": 1.683050342567769, "grad_norm": 0.6415169835090637, "learning_rate": 4.2072535001489426e-05, "loss": 0.6346, "num_input_tokens_seen": 6551488, "step": 11300 }, { "epoch": 1.6837950551087282, "grad_norm": 0.5833984017372131, "learning_rate": 4.2091152815013404e-05, "loss": 0.6708, "num_input_tokens_seen": 6554624, "step": 11305 }, { "epoch": 1.6845397676496874, "grad_norm": 0.9132222533226013, "learning_rate": 4.210977062853739e-05, "loss": 0.7678, "num_input_tokens_seen": 6557728, "step": 11310 }, { "epoch": 1.6852844801906464, "grad_norm": 1.1197519302368164, "learning_rate": 4.212838844206137e-05, "loss": 0.6164, "num_input_tokens_seen": 6560800, "step": 11315 }, { "epoch": 1.6860291927316056, "grad_norm": 1.0109727382659912, "learning_rate": 4.2147006255585346e-05, "loss": 0.6174, "num_input_tokens_seen": 6563776, "step": 11320 }, { "epoch": 1.6867739052725648, "grad_norm": 0.7439755797386169, "learning_rate": 4.216562406910933e-05, "loss": 0.7007, "num_input_tokens_seen": 6566624, "step": 11325 }, { "epoch": 1.687518617813524, "grad_norm": 0.7914832234382629, "learning_rate": 4.218424188263331e-05, "loss": 0.7719, "num_input_tokens_seen": 6570048, "step": 11330 }, { "epoch": 1.6882633303544832, "grad_norm": 0.7477091550827026, "learning_rate": 4.220285969615729e-05, "loss": 0.6531, "num_input_tokens_seen": 6572800, "step": 11335 }, { "epoch": 1.6890080428954424, "grad_norm": 0.8158696293830872, "learning_rate": 4.2221477509681265e-05, "loss": 0.7608, "num_input_tokens_seen": 6575712, "step": 11340 }, { "epoch": 1.6897527554364016, "grad_norm": 1.6561871767044067, "learning_rate": 4.2240095323205244e-05, "loss": 0.8146, "num_input_tokens_seen": 6578688, "step": 11345 }, { "epoch": 1.6904974679773608, "grad_norm": 0.8257113099098206, "learning_rate": 4.225871313672922e-05, "loss": 0.7519, "num_input_tokens_seen": 6581568, "step": 11350 }, { "epoch": 1.69124218051832, "grad_norm": 0.7141494154930115, "learning_rate": 4.22773309502532e-05, "loss": 0.7741, "num_input_tokens_seen": 6584672, "step": 11355 }, { "epoch": 1.691986893059279, "grad_norm": 0.6254489421844482, "learning_rate": 4.2295948763777185e-05, "loss": 0.7107, "num_input_tokens_seen": 6587552, "step": 11360 }, { "epoch": 1.6927316056002382, "grad_norm": 0.5335524678230286, "learning_rate": 4.231456657730116e-05, "loss": 0.6433, "num_input_tokens_seen": 6590624, "step": 11365 }, { "epoch": 1.6934763181411974, "grad_norm": 0.8884290456771851, "learning_rate": 4.233318439082514e-05, "loss": 0.5635, "num_input_tokens_seen": 6593728, "step": 11370 }, { "epoch": 1.6942210306821566, "grad_norm": 0.8462398052215576, "learning_rate": 4.235180220434912e-05, "loss": 0.6858, "num_input_tokens_seen": 6596480, "step": 11375 }, { "epoch": 1.6949657432231158, "grad_norm": 0.6853266358375549, "learning_rate": 4.2370420017873105e-05, "loss": 0.5705, "num_input_tokens_seen": 6599360, "step": 11380 }, { "epoch": 1.695710455764075, "grad_norm": 1.0011916160583496, "learning_rate": 4.238903783139708e-05, "loss": 0.592, "num_input_tokens_seen": 6602176, "step": 11385 }, { "epoch": 1.6964551683050342, "grad_norm": 0.865649938583374, "learning_rate": 4.240765564492106e-05, "loss": 0.6714, "num_input_tokens_seen": 6605024, "step": 11390 }, { "epoch": 1.6971998808459934, "grad_norm": 0.817833662033081, "learning_rate": 4.242627345844504e-05, "loss": 0.6904, "num_input_tokens_seen": 6607744, "step": 11395 }, { "epoch": 1.6979445933869526, "grad_norm": 0.9553525447845459, "learning_rate": 4.2444891271969025e-05, "loss": 0.6883, "num_input_tokens_seen": 6610848, "step": 11400 }, { "epoch": 1.6986893059279118, "grad_norm": 0.6764267086982727, "learning_rate": 4.2463509085493e-05, "loss": 0.5846, "num_input_tokens_seen": 6613760, "step": 11405 }, { "epoch": 1.699434018468871, "grad_norm": 0.9689304232597351, "learning_rate": 4.248212689901698e-05, "loss": 0.6623, "num_input_tokens_seen": 6617248, "step": 11410 }, { "epoch": 1.7001787310098302, "grad_norm": 0.8141366243362427, "learning_rate": 4.2500744712540966e-05, "loss": 0.6542, "num_input_tokens_seen": 6619904, "step": 11415 }, { "epoch": 1.7009234435507894, "grad_norm": 1.1613812446594238, "learning_rate": 4.2519362526064944e-05, "loss": 0.7599, "num_input_tokens_seen": 6622528, "step": 11420 }, { "epoch": 1.7016681560917486, "grad_norm": 0.9139557480812073, "learning_rate": 4.253798033958892e-05, "loss": 0.6285, "num_input_tokens_seen": 6625504, "step": 11425 }, { "epoch": 1.7024128686327078, "grad_norm": 0.8862561583518982, "learning_rate": 4.25565981531129e-05, "loss": 0.6528, "num_input_tokens_seen": 6628320, "step": 11430 }, { "epoch": 1.703157581173667, "grad_norm": 0.9214081764221191, "learning_rate": 4.257521596663688e-05, "loss": 0.7723, "num_input_tokens_seen": 6631328, "step": 11435 }, { "epoch": 1.7039022937146262, "grad_norm": 0.8796489238739014, "learning_rate": 4.259383378016086e-05, "loss": 0.7818, "num_input_tokens_seen": 6634208, "step": 11440 }, { "epoch": 1.7046470062555854, "grad_norm": 1.1785706281661987, "learning_rate": 4.2612451593684835e-05, "loss": 0.675, "num_input_tokens_seen": 6637216, "step": 11445 }, { "epoch": 1.7053917187965446, "grad_norm": 0.747234582901001, "learning_rate": 4.263106940720882e-05, "loss": 0.712, "num_input_tokens_seen": 6639936, "step": 11450 }, { "epoch": 1.7061364313375038, "grad_norm": 1.2287101745605469, "learning_rate": 4.26496872207328e-05, "loss": 0.6891, "num_input_tokens_seen": 6643712, "step": 11455 }, { "epoch": 1.706881143878463, "grad_norm": 0.9997260570526123, "learning_rate": 4.266830503425678e-05, "loss": 0.6151, "num_input_tokens_seen": 6646688, "step": 11460 }, { "epoch": 1.7076258564194222, "grad_norm": 0.8348660469055176, "learning_rate": 4.2686922847780755e-05, "loss": 0.6855, "num_input_tokens_seen": 6649728, "step": 11465 }, { "epoch": 1.7083705689603814, "grad_norm": 0.7659286856651306, "learning_rate": 4.270554066130474e-05, "loss": 0.7431, "num_input_tokens_seen": 6652640, "step": 11470 }, { "epoch": 1.7091152815013406, "grad_norm": 0.6211164593696594, "learning_rate": 4.272415847482872e-05, "loss": 0.8199, "num_input_tokens_seen": 6655872, "step": 11475 }, { "epoch": 1.7098599940422998, "grad_norm": 0.8465545773506165, "learning_rate": 4.2742776288352697e-05, "loss": 0.6921, "num_input_tokens_seen": 6658848, "step": 11480 }, { "epoch": 1.710604706583259, "grad_norm": 1.1359103918075562, "learning_rate": 4.276139410187668e-05, "loss": 0.7792, "num_input_tokens_seen": 6661760, "step": 11485 }, { "epoch": 1.711349419124218, "grad_norm": 0.5976957082748413, "learning_rate": 4.278001191540066e-05, "loss": 0.6357, "num_input_tokens_seen": 6664608, "step": 11490 }, { "epoch": 1.7120941316651772, "grad_norm": 0.8963690996170044, "learning_rate": 4.279862972892464e-05, "loss": 0.6193, "num_input_tokens_seen": 6667744, "step": 11495 }, { "epoch": 1.7128388442061364, "grad_norm": 0.9855800867080688, "learning_rate": 4.2817247542448616e-05, "loss": 0.8008, "num_input_tokens_seen": 6670816, "step": 11500 }, { "epoch": 1.7135835567470956, "grad_norm": 1.187083125114441, "learning_rate": 4.28358653559726e-05, "loss": 0.6974, "num_input_tokens_seen": 6673664, "step": 11505 }, { "epoch": 1.7143282692880548, "grad_norm": 0.8067960143089294, "learning_rate": 4.285448316949658e-05, "loss": 0.64, "num_input_tokens_seen": 6676640, "step": 11510 }, { "epoch": 1.715072981829014, "grad_norm": 2.3775553703308105, "learning_rate": 4.287310098302056e-05, "loss": 0.6984, "num_input_tokens_seen": 6679424, "step": 11515 }, { "epoch": 1.7158176943699732, "grad_norm": 0.7230485677719116, "learning_rate": 4.2891718796544536e-05, "loss": 0.6546, "num_input_tokens_seen": 6682464, "step": 11520 }, { "epoch": 1.7165624069109324, "grad_norm": 0.7320828437805176, "learning_rate": 4.2910336610068514e-05, "loss": 0.7738, "num_input_tokens_seen": 6685280, "step": 11525 }, { "epoch": 1.7173071194518914, "grad_norm": 2.2856154441833496, "learning_rate": 4.292895442359249e-05, "loss": 0.8913, "num_input_tokens_seen": 6688160, "step": 11530 }, { "epoch": 1.7180518319928506, "grad_norm": 0.7047014236450195, "learning_rate": 4.294757223711647e-05, "loss": 0.6941, "num_input_tokens_seen": 6691200, "step": 11535 }, { "epoch": 1.7187965445338098, "grad_norm": 0.751301646232605, "learning_rate": 4.2966190050640456e-05, "loss": 0.6257, "num_input_tokens_seen": 6693888, "step": 11540 }, { "epoch": 1.719541257074769, "grad_norm": 0.7750285863876343, "learning_rate": 4.2984807864164434e-05, "loss": 0.6692, "num_input_tokens_seen": 6696768, "step": 11545 }, { "epoch": 1.7202859696157282, "grad_norm": 2.513427972793579, "learning_rate": 4.300342567768841e-05, "loss": 0.746, "num_input_tokens_seen": 6699808, "step": 11550 }, { "epoch": 1.7210306821566874, "grad_norm": 1.0583115816116333, "learning_rate": 4.302204349121239e-05, "loss": 0.8536, "num_input_tokens_seen": 6702784, "step": 11555 }, { "epoch": 1.7217753946976466, "grad_norm": 0.9616263508796692, "learning_rate": 4.3040661304736375e-05, "loss": 0.6439, "num_input_tokens_seen": 6705760, "step": 11560 }, { "epoch": 1.7225201072386058, "grad_norm": 0.7579523921012878, "learning_rate": 4.3059279118260354e-05, "loss": 0.5957, "num_input_tokens_seen": 6708736, "step": 11565 }, { "epoch": 1.723264819779565, "grad_norm": 1.6383353471755981, "learning_rate": 4.307789693178433e-05, "loss": 0.8699, "num_input_tokens_seen": 6711392, "step": 11570 }, { "epoch": 1.7240095323205242, "grad_norm": 0.7511764764785767, "learning_rate": 4.309651474530832e-05, "loss": 0.6211, "num_input_tokens_seen": 6714496, "step": 11575 }, { "epoch": 1.7247542448614834, "grad_norm": 2.4075961112976074, "learning_rate": 4.3115132558832295e-05, "loss": 0.6371, "num_input_tokens_seen": 6717280, "step": 11580 }, { "epoch": 1.7254989574024426, "grad_norm": 0.8291836977005005, "learning_rate": 4.313375037235627e-05, "loss": 0.7436, "num_input_tokens_seen": 6719904, "step": 11585 }, { "epoch": 1.7262436699434018, "grad_norm": 0.8828557729721069, "learning_rate": 4.315236818588025e-05, "loss": 0.6659, "num_input_tokens_seen": 6722560, "step": 11590 }, { "epoch": 1.726988382484361, "grad_norm": 0.4564291536808014, "learning_rate": 4.3170985999404237e-05, "loss": 0.5879, "num_input_tokens_seen": 6725344, "step": 11595 }, { "epoch": 1.7277330950253202, "grad_norm": 1.5248545408248901, "learning_rate": 4.3189603812928215e-05, "loss": 0.8397, "num_input_tokens_seen": 6728384, "step": 11600 }, { "epoch": 1.7284778075662794, "grad_norm": 0.5988898277282715, "learning_rate": 4.3208221626452186e-05, "loss": 0.5748, "num_input_tokens_seen": 6731360, "step": 11605 }, { "epoch": 1.7292225201072386, "grad_norm": 0.9707646369934082, "learning_rate": 4.322683943997617e-05, "loss": 0.7144, "num_input_tokens_seen": 6734016, "step": 11610 }, { "epoch": 1.7299672326481979, "grad_norm": 0.8606809377670288, "learning_rate": 4.324545725350015e-05, "loss": 0.5322, "num_input_tokens_seen": 6736896, "step": 11615 }, { "epoch": 1.730711945189157, "grad_norm": 0.9848670363426208, "learning_rate": 4.326407506702413e-05, "loss": 0.5684, "num_input_tokens_seen": 6739744, "step": 11620 }, { "epoch": 1.7314566577301163, "grad_norm": 0.8281556367874146, "learning_rate": 4.3282692880548106e-05, "loss": 0.5774, "num_input_tokens_seen": 6742528, "step": 11625 }, { "epoch": 1.7322013702710755, "grad_norm": 0.8352028131484985, "learning_rate": 4.330131069407209e-05, "loss": 0.7035, "num_input_tokens_seen": 6745472, "step": 11630 }, { "epoch": 1.7329460828120347, "grad_norm": 1.5397650003433228, "learning_rate": 4.331992850759607e-05, "loss": 0.6699, "num_input_tokens_seen": 6748736, "step": 11635 }, { "epoch": 1.7336907953529939, "grad_norm": 1.3245092630386353, "learning_rate": 4.333854632112005e-05, "loss": 0.5884, "num_input_tokens_seen": 6751424, "step": 11640 }, { "epoch": 1.734435507893953, "grad_norm": 2.2949655055999756, "learning_rate": 4.335716413464403e-05, "loss": 0.8627, "num_input_tokens_seen": 6754176, "step": 11645 }, { "epoch": 1.7351802204349123, "grad_norm": 1.525322437286377, "learning_rate": 4.337578194816801e-05, "loss": 0.6747, "num_input_tokens_seen": 6757024, "step": 11650 }, { "epoch": 1.7359249329758715, "grad_norm": 0.7897469401359558, "learning_rate": 4.339439976169199e-05, "loss": 0.7601, "num_input_tokens_seen": 6759872, "step": 11655 }, { "epoch": 1.7366696455168305, "grad_norm": 0.8901336193084717, "learning_rate": 4.341301757521597e-05, "loss": 0.7659, "num_input_tokens_seen": 6763104, "step": 11660 }, { "epoch": 1.7374143580577897, "grad_norm": 0.6563915610313416, "learning_rate": 4.343163538873995e-05, "loss": 0.769, "num_input_tokens_seen": 6765824, "step": 11665 }, { "epoch": 1.7381590705987489, "grad_norm": 1.0739130973815918, "learning_rate": 4.345025320226393e-05, "loss": 0.6735, "num_input_tokens_seen": 6768672, "step": 11670 }, { "epoch": 1.738903783139708, "grad_norm": 0.7455183267593384, "learning_rate": 4.346887101578791e-05, "loss": 0.6857, "num_input_tokens_seen": 6771616, "step": 11675 }, { "epoch": 1.7396484956806673, "grad_norm": 0.9469426274299622, "learning_rate": 4.348748882931189e-05, "loss": 0.7737, "num_input_tokens_seen": 6774272, "step": 11680 }, { "epoch": 1.7403932082216265, "grad_norm": 1.0281484127044678, "learning_rate": 4.350610664283587e-05, "loss": 0.598, "num_input_tokens_seen": 6777280, "step": 11685 }, { "epoch": 1.7411379207625857, "grad_norm": 1.1941310167312622, "learning_rate": 4.352472445635984e-05, "loss": 0.7292, "num_input_tokens_seen": 6780096, "step": 11690 }, { "epoch": 1.7418826333035449, "grad_norm": 1.0205451250076294, "learning_rate": 4.354334226988382e-05, "loss": 0.7742, "num_input_tokens_seen": 6783072, "step": 11695 }, { "epoch": 1.742627345844504, "grad_norm": 0.5796513557434082, "learning_rate": 4.3561960083407807e-05, "loss": 0.7421, "num_input_tokens_seen": 6785792, "step": 11700 }, { "epoch": 1.743372058385463, "grad_norm": 0.8728430867195129, "learning_rate": 4.3580577896931785e-05, "loss": 0.5839, "num_input_tokens_seen": 6788544, "step": 11705 }, { "epoch": 1.7441167709264223, "grad_norm": 0.6903425455093384, "learning_rate": 4.359919571045576e-05, "loss": 0.7985, "num_input_tokens_seen": 6791680, "step": 11710 }, { "epoch": 1.7448614834673815, "grad_norm": 1.1883329153060913, "learning_rate": 4.361781352397975e-05, "loss": 0.7386, "num_input_tokens_seen": 6794496, "step": 11715 }, { "epoch": 1.7456061960083407, "grad_norm": 1.021229863166809, "learning_rate": 4.3636431337503726e-05, "loss": 0.7267, "num_input_tokens_seen": 6797408, "step": 11720 }, { "epoch": 1.7463509085492999, "grad_norm": 0.6778115630149841, "learning_rate": 4.3655049151027704e-05, "loss": 0.6001, "num_input_tokens_seen": 6800320, "step": 11725 }, { "epoch": 1.747095621090259, "grad_norm": 0.7175254225730896, "learning_rate": 4.367366696455168e-05, "loss": 0.7092, "num_input_tokens_seen": 6803360, "step": 11730 }, { "epoch": 1.7478403336312183, "grad_norm": 0.7132259607315063, "learning_rate": 4.369228477807567e-05, "loss": 0.6215, "num_input_tokens_seen": 6806592, "step": 11735 }, { "epoch": 1.7485850461721775, "grad_norm": 0.9708728194236755, "learning_rate": 4.3710902591599646e-05, "loss": 0.6126, "num_input_tokens_seen": 6809440, "step": 11740 }, { "epoch": 1.7493297587131367, "grad_norm": 0.9149253368377686, "learning_rate": 4.3729520405123624e-05, "loss": 0.7152, "num_input_tokens_seen": 6812320, "step": 11745 }, { "epoch": 1.7500744712540959, "grad_norm": 1.1419895887374878, "learning_rate": 4.37481382186476e-05, "loss": 0.6845, "num_input_tokens_seen": 6814976, "step": 11750 }, { "epoch": 1.750819183795055, "grad_norm": 0.5114970207214355, "learning_rate": 4.376675603217159e-05, "loss": 0.5669, "num_input_tokens_seen": 6817728, "step": 11755 }, { "epoch": 1.7515638963360143, "grad_norm": 1.6272846460342407, "learning_rate": 4.3785373845695566e-05, "loss": 0.7378, "num_input_tokens_seen": 6820672, "step": 11760 }, { "epoch": 1.7523086088769735, "grad_norm": 1.1529700756072998, "learning_rate": 4.3803991659219544e-05, "loss": 0.7601, "num_input_tokens_seen": 6823552, "step": 11765 }, { "epoch": 1.7530533214179327, "grad_norm": 1.180191993713379, "learning_rate": 4.382260947274352e-05, "loss": 0.7317, "num_input_tokens_seen": 6826976, "step": 11770 }, { "epoch": 1.7537980339588919, "grad_norm": 0.7174044847488403, "learning_rate": 4.384122728626751e-05, "loss": 0.7086, "num_input_tokens_seen": 6830048, "step": 11775 }, { "epoch": 1.754542746499851, "grad_norm": 0.9034592509269714, "learning_rate": 4.385984509979148e-05, "loss": 0.7003, "num_input_tokens_seen": 6832896, "step": 11780 }, { "epoch": 1.7552874590408103, "grad_norm": 0.8704028129577637, "learning_rate": 4.387846291331546e-05, "loss": 0.7943, "num_input_tokens_seen": 6835712, "step": 11785 }, { "epoch": 1.7560321715817695, "grad_norm": 1.1994625329971313, "learning_rate": 4.389708072683944e-05, "loss": 0.834, "num_input_tokens_seen": 6838368, "step": 11790 }, { "epoch": 1.7567768841227287, "grad_norm": 0.7318022847175598, "learning_rate": 4.391569854036342e-05, "loss": 0.6272, "num_input_tokens_seen": 6841216, "step": 11795 }, { "epoch": 1.757521596663688, "grad_norm": 1.0771496295928955, "learning_rate": 4.39343163538874e-05, "loss": 0.6817, "num_input_tokens_seen": 6844512, "step": 11800 }, { "epoch": 1.758266309204647, "grad_norm": 0.828056812286377, "learning_rate": 4.395293416741138e-05, "loss": 0.6386, "num_input_tokens_seen": 6847584, "step": 11805 }, { "epoch": 1.7590110217456063, "grad_norm": 0.6145459413528442, "learning_rate": 4.397155198093536e-05, "loss": 0.6797, "num_input_tokens_seen": 6850528, "step": 11810 }, { "epoch": 1.7597557342865655, "grad_norm": 0.956864058971405, "learning_rate": 4.399016979445934e-05, "loss": 0.587, "num_input_tokens_seen": 6853504, "step": 11815 }, { "epoch": 1.7605004468275247, "grad_norm": 0.6931551694869995, "learning_rate": 4.400878760798332e-05, "loss": 0.6858, "num_input_tokens_seen": 6856064, "step": 11820 }, { "epoch": 1.761245159368484, "grad_norm": 0.7795594930648804, "learning_rate": 4.40274054215073e-05, "loss": 0.5687, "num_input_tokens_seen": 6859040, "step": 11825 }, { "epoch": 1.7619898719094431, "grad_norm": 0.7750445008277893, "learning_rate": 4.404602323503128e-05, "loss": 0.8685, "num_input_tokens_seen": 6861888, "step": 11830 }, { "epoch": 1.762734584450402, "grad_norm": 1.1092240810394287, "learning_rate": 4.406464104855526e-05, "loss": 0.8001, "num_input_tokens_seen": 6864640, "step": 11835 }, { "epoch": 1.7634792969913613, "grad_norm": 1.4435890913009644, "learning_rate": 4.408325886207924e-05, "loss": 0.7109, "num_input_tokens_seen": 6867680, "step": 11840 }, { "epoch": 1.7642240095323205, "grad_norm": 1.199277639389038, "learning_rate": 4.410187667560322e-05, "loss": 0.7618, "num_input_tokens_seen": 6870592, "step": 11845 }, { "epoch": 1.7649687220732797, "grad_norm": 0.8612386584281921, "learning_rate": 4.41204944891272e-05, "loss": 0.8036, "num_input_tokens_seen": 6873344, "step": 11850 }, { "epoch": 1.765713434614239, "grad_norm": 0.8291127681732178, "learning_rate": 4.413911230265118e-05, "loss": 0.7397, "num_input_tokens_seen": 6876576, "step": 11855 }, { "epoch": 1.766458147155198, "grad_norm": 0.8362825512886047, "learning_rate": 4.4157730116175164e-05, "loss": 0.858, "num_input_tokens_seen": 6879232, "step": 11860 }, { "epoch": 1.7672028596961573, "grad_norm": 1.0596016645431519, "learning_rate": 4.4176347929699136e-05, "loss": 0.7147, "num_input_tokens_seen": 6882048, "step": 11865 }, { "epoch": 1.7679475722371165, "grad_norm": 1.0514143705368042, "learning_rate": 4.4194965743223114e-05, "loss": 0.7629, "num_input_tokens_seen": 6884832, "step": 11870 }, { "epoch": 1.7686922847780755, "grad_norm": 0.7121261954307556, "learning_rate": 4.42135835567471e-05, "loss": 0.7347, "num_input_tokens_seen": 6887904, "step": 11875 }, { "epoch": 1.7694369973190347, "grad_norm": 1.2063616514205933, "learning_rate": 4.423220137027108e-05, "loss": 0.8048, "num_input_tokens_seen": 6890720, "step": 11880 }, { "epoch": 1.770181709859994, "grad_norm": 0.822581946849823, "learning_rate": 4.4250819183795055e-05, "loss": 0.7718, "num_input_tokens_seen": 6893440, "step": 11885 }, { "epoch": 1.770926422400953, "grad_norm": 1.0066193342208862, "learning_rate": 4.4269436997319034e-05, "loss": 0.7012, "num_input_tokens_seen": 6896352, "step": 11890 }, { "epoch": 1.7716711349419123, "grad_norm": 2.6120171546936035, "learning_rate": 4.428805481084302e-05, "loss": 0.8878, "num_input_tokens_seen": 6898816, "step": 11895 }, { "epoch": 1.7724158474828715, "grad_norm": 0.8891427516937256, "learning_rate": 4.4306672624367e-05, "loss": 0.7651, "num_input_tokens_seen": 6901888, "step": 11900 }, { "epoch": 1.7731605600238307, "grad_norm": 0.8253044486045837, "learning_rate": 4.4325290437890975e-05, "loss": 0.6305, "num_input_tokens_seen": 6904704, "step": 11905 }, { "epoch": 1.77390527256479, "grad_norm": 2.4146971702575684, "learning_rate": 4.434390825141495e-05, "loss": 0.7914, "num_input_tokens_seen": 6907648, "step": 11910 }, { "epoch": 1.7746499851057491, "grad_norm": 0.632481038570404, "learning_rate": 4.436252606493894e-05, "loss": 0.7189, "num_input_tokens_seen": 6910272, "step": 11915 }, { "epoch": 1.7753946976467083, "grad_norm": 0.6890574097633362, "learning_rate": 4.4381143878462917e-05, "loss": 0.6855, "num_input_tokens_seen": 6913344, "step": 11920 }, { "epoch": 1.7761394101876675, "grad_norm": 0.5683222413063049, "learning_rate": 4.4399761691986895e-05, "loss": 0.6723, "num_input_tokens_seen": 6916224, "step": 11925 }, { "epoch": 1.7768841227286267, "grad_norm": 0.872157096862793, "learning_rate": 4.441837950551088e-05, "loss": 0.8515, "num_input_tokens_seen": 6919456, "step": 11930 }, { "epoch": 1.777628835269586, "grad_norm": 1.2994651794433594, "learning_rate": 4.443699731903486e-05, "loss": 0.6379, "num_input_tokens_seen": 6922144, "step": 11935 }, { "epoch": 1.7783735478105451, "grad_norm": 0.8452973961830139, "learning_rate": 4.4455615132558836e-05, "loss": 0.7397, "num_input_tokens_seen": 6925344, "step": 11940 }, { "epoch": 1.7791182603515043, "grad_norm": 0.6918259859085083, "learning_rate": 4.4474232946082814e-05, "loss": 0.756, "num_input_tokens_seen": 6928224, "step": 11945 }, { "epoch": 1.7798629728924635, "grad_norm": 1.2990460395812988, "learning_rate": 4.449285075960679e-05, "loss": 0.7796, "num_input_tokens_seen": 6931328, "step": 11950 }, { "epoch": 1.7806076854334227, "grad_norm": 0.9382709860801697, "learning_rate": 4.451146857313077e-05, "loss": 0.6864, "num_input_tokens_seen": 6934432, "step": 11955 }, { "epoch": 1.781352397974382, "grad_norm": 1.155676007270813, "learning_rate": 4.453008638665475e-05, "loss": 0.7695, "num_input_tokens_seen": 6937216, "step": 11960 }, { "epoch": 1.7820971105153411, "grad_norm": 0.929633617401123, "learning_rate": 4.4548704200178734e-05, "loss": 0.5648, "num_input_tokens_seen": 6940032, "step": 11965 }, { "epoch": 1.7828418230563003, "grad_norm": 1.9159036874771118, "learning_rate": 4.456732201370271e-05, "loss": 0.7732, "num_input_tokens_seen": 6942816, "step": 11970 }, { "epoch": 1.7835865355972595, "grad_norm": 0.6676117181777954, "learning_rate": 4.458593982722669e-05, "loss": 0.7246, "num_input_tokens_seen": 6945760, "step": 11975 }, { "epoch": 1.7843312481382188, "grad_norm": 0.6897764801979065, "learning_rate": 4.460455764075067e-05, "loss": 0.6658, "num_input_tokens_seen": 6948896, "step": 11980 }, { "epoch": 1.785075960679178, "grad_norm": 0.9802858233451843, "learning_rate": 4.4623175454274654e-05, "loss": 0.5336, "num_input_tokens_seen": 6951904, "step": 11985 }, { "epoch": 1.7858206732201372, "grad_norm": 0.6369481086730957, "learning_rate": 4.464179326779863e-05, "loss": 0.644, "num_input_tokens_seen": 6955104, "step": 11990 }, { "epoch": 1.7865653857610964, "grad_norm": 1.0396745204925537, "learning_rate": 4.466041108132261e-05, "loss": 0.7295, "num_input_tokens_seen": 6958048, "step": 11995 }, { "epoch": 1.7873100983020556, "grad_norm": 0.8546991944313049, "learning_rate": 4.467902889484659e-05, "loss": 0.6553, "num_input_tokens_seen": 6960864, "step": 12000 }, { "epoch": 1.7880548108430145, "grad_norm": 0.8350956439971924, "learning_rate": 4.4697646708370574e-05, "loss": 0.7411, "num_input_tokens_seen": 6963648, "step": 12005 }, { "epoch": 1.7887995233839737, "grad_norm": 1.1339105367660522, "learning_rate": 4.471626452189455e-05, "loss": 0.7299, "num_input_tokens_seen": 6966528, "step": 12010 }, { "epoch": 1.789544235924933, "grad_norm": 0.696742594242096, "learning_rate": 4.473488233541853e-05, "loss": 0.6319, "num_input_tokens_seen": 6969280, "step": 12015 }, { "epoch": 1.7902889484658921, "grad_norm": 1.1215084791183472, "learning_rate": 4.4753500148942515e-05, "loss": 0.7373, "num_input_tokens_seen": 6972320, "step": 12020 }, { "epoch": 1.7910336610068514, "grad_norm": 0.7890985608100891, "learning_rate": 4.477211796246649e-05, "loss": 0.8074, "num_input_tokens_seen": 6975552, "step": 12025 }, { "epoch": 1.7917783735478106, "grad_norm": 0.6955908536911011, "learning_rate": 4.479073577599047e-05, "loss": 0.7409, "num_input_tokens_seen": 6978560, "step": 12030 }, { "epoch": 1.7925230860887698, "grad_norm": 0.6475319862365723, "learning_rate": 4.480935358951445e-05, "loss": 0.5532, "num_input_tokens_seen": 6981408, "step": 12035 }, { "epoch": 1.793267798629729, "grad_norm": 0.7406405806541443, "learning_rate": 4.482797140303843e-05, "loss": 0.6326, "num_input_tokens_seen": 6984384, "step": 12040 }, { "epoch": 1.7940125111706882, "grad_norm": 1.5753520727157593, "learning_rate": 4.4846589216562406e-05, "loss": 0.7172, "num_input_tokens_seen": 6987104, "step": 12045 }, { "epoch": 1.7947572237116471, "grad_norm": 0.9419552087783813, "learning_rate": 4.4865207030086384e-05, "loss": 0.7123, "num_input_tokens_seen": 6990112, "step": 12050 }, { "epoch": 1.7955019362526063, "grad_norm": 0.7636163234710693, "learning_rate": 4.488382484361037e-05, "loss": 0.6164, "num_input_tokens_seen": 6992928, "step": 12055 }, { "epoch": 1.7962466487935655, "grad_norm": 0.8317691683769226, "learning_rate": 4.490244265713435e-05, "loss": 0.6851, "num_input_tokens_seen": 6995872, "step": 12060 }, { "epoch": 1.7969913613345248, "grad_norm": 1.0104936361312866, "learning_rate": 4.4921060470658326e-05, "loss": 0.7518, "num_input_tokens_seen": 6998784, "step": 12065 }, { "epoch": 1.797736073875484, "grad_norm": 0.7198978662490845, "learning_rate": 4.4939678284182304e-05, "loss": 0.7194, "num_input_tokens_seen": 7001728, "step": 12070 }, { "epoch": 1.7984807864164432, "grad_norm": 0.712298572063446, "learning_rate": 4.495829609770629e-05, "loss": 0.5464, "num_input_tokens_seen": 7004672, "step": 12075 }, { "epoch": 1.7992254989574024, "grad_norm": 0.5445116758346558, "learning_rate": 4.497691391123027e-05, "loss": 0.6805, "num_input_tokens_seen": 7007712, "step": 12080 }, { "epoch": 1.7999702114983616, "grad_norm": 1.2280722856521606, "learning_rate": 4.4995531724754246e-05, "loss": 0.8416, "num_input_tokens_seen": 7010656, "step": 12085 }, { "epoch": 1.8007149240393208, "grad_norm": 0.5832712054252625, "learning_rate": 4.501414953827823e-05, "loss": 0.8281, "num_input_tokens_seen": 7013472, "step": 12090 }, { "epoch": 1.80145963658028, "grad_norm": 0.7913941144943237, "learning_rate": 4.503276735180221e-05, "loss": 0.7511, "num_input_tokens_seen": 7016352, "step": 12095 }, { "epoch": 1.8022043491212392, "grad_norm": 1.0056490898132324, "learning_rate": 4.505138516532619e-05, "loss": 0.6251, "num_input_tokens_seen": 7019168, "step": 12100 }, { "epoch": 1.8029490616621984, "grad_norm": 0.8059632778167725, "learning_rate": 4.5070002978850165e-05, "loss": 0.7288, "num_input_tokens_seen": 7022400, "step": 12105 }, { "epoch": 1.8036937742031576, "grad_norm": 0.98952716588974, "learning_rate": 4.508862079237415e-05, "loss": 0.7021, "num_input_tokens_seen": 7025440, "step": 12110 }, { "epoch": 1.8044384867441168, "grad_norm": 0.7016228437423706, "learning_rate": 4.510723860589813e-05, "loss": 0.6428, "num_input_tokens_seen": 7028320, "step": 12115 }, { "epoch": 1.805183199285076, "grad_norm": 0.835932195186615, "learning_rate": 4.512585641942211e-05, "loss": 0.6085, "num_input_tokens_seen": 7031168, "step": 12120 }, { "epoch": 1.8059279118260352, "grad_norm": 1.2073909044265747, "learning_rate": 4.5144474232946085e-05, "loss": 0.7008, "num_input_tokens_seen": 7033888, "step": 12125 }, { "epoch": 1.8066726243669944, "grad_norm": 0.7998731136322021, "learning_rate": 4.516309204647006e-05, "loss": 0.8521, "num_input_tokens_seen": 7036544, "step": 12130 }, { "epoch": 1.8074173369079536, "grad_norm": 0.7581228613853455, "learning_rate": 4.518170985999404e-05, "loss": 0.6943, "num_input_tokens_seen": 7039264, "step": 12135 }, { "epoch": 1.8081620494489128, "grad_norm": 0.7465994358062744, "learning_rate": 4.520032767351802e-05, "loss": 0.6996, "num_input_tokens_seen": 7041920, "step": 12140 }, { "epoch": 1.808906761989872, "grad_norm": 0.871535062789917, "learning_rate": 4.5218945487042005e-05, "loss": 0.6057, "num_input_tokens_seen": 7044800, "step": 12145 }, { "epoch": 1.8096514745308312, "grad_norm": 0.6903116703033447, "learning_rate": 4.523756330056598e-05, "loss": 0.9523, "num_input_tokens_seen": 7047968, "step": 12150 }, { "epoch": 1.8103961870717904, "grad_norm": 1.0192360877990723, "learning_rate": 4.525618111408996e-05, "loss": 0.7354, "num_input_tokens_seen": 7050784, "step": 12155 }, { "epoch": 1.8111408996127496, "grad_norm": 0.9257208108901978, "learning_rate": 4.527479892761394e-05, "loss": 0.6157, "num_input_tokens_seen": 7053632, "step": 12160 }, { "epoch": 1.8118856121537088, "grad_norm": 1.7648667097091675, "learning_rate": 4.5293416741137924e-05, "loss": 0.8211, "num_input_tokens_seen": 7056736, "step": 12165 }, { "epoch": 1.812630324694668, "grad_norm": 0.910398542881012, "learning_rate": 4.53120345546619e-05, "loss": 0.6561, "num_input_tokens_seen": 7059488, "step": 12170 }, { "epoch": 1.8133750372356272, "grad_norm": 0.8760908246040344, "learning_rate": 4.533065236818588e-05, "loss": 0.5902, "num_input_tokens_seen": 7063616, "step": 12175 }, { "epoch": 1.8141197497765862, "grad_norm": 0.7792332172393799, "learning_rate": 4.5349270181709866e-05, "loss": 0.7617, "num_input_tokens_seen": 7066720, "step": 12180 }, { "epoch": 1.8148644623175454, "grad_norm": 0.5577234029769897, "learning_rate": 4.5367887995233844e-05, "loss": 0.597, "num_input_tokens_seen": 7069568, "step": 12185 }, { "epoch": 1.8156091748585046, "grad_norm": 0.6135870218276978, "learning_rate": 4.538650580875782e-05, "loss": 0.7467, "num_input_tokens_seen": 7072320, "step": 12190 }, { "epoch": 1.8163538873994638, "grad_norm": 1.8546916246414185, "learning_rate": 4.54051236222818e-05, "loss": 0.6579, "num_input_tokens_seen": 7075264, "step": 12195 }, { "epoch": 1.817098599940423, "grad_norm": 0.7450732588768005, "learning_rate": 4.5423741435805786e-05, "loss": 0.8137, "num_input_tokens_seen": 7078272, "step": 12200 }, { "epoch": 1.8178433124813822, "grad_norm": 1.307934045791626, "learning_rate": 4.5442359249329764e-05, "loss": 0.7586, "num_input_tokens_seen": 7081088, "step": 12205 }, { "epoch": 1.8185880250223414, "grad_norm": 1.5562235116958618, "learning_rate": 4.5460977062853735e-05, "loss": 0.728, "num_input_tokens_seen": 7084032, "step": 12210 }, { "epoch": 1.8193327375633006, "grad_norm": 1.5083215236663818, "learning_rate": 4.547959487637772e-05, "loss": 0.7592, "num_input_tokens_seen": 7087104, "step": 12215 }, { "epoch": 1.8200774501042598, "grad_norm": 1.0387037992477417, "learning_rate": 4.54982126899017e-05, "loss": 0.6805, "num_input_tokens_seen": 7089920, "step": 12220 }, { "epoch": 1.8208221626452188, "grad_norm": 0.9018644690513611, "learning_rate": 4.551683050342568e-05, "loss": 0.7945, "num_input_tokens_seen": 7093056, "step": 12225 }, { "epoch": 1.821566875186178, "grad_norm": 0.955748438835144, "learning_rate": 4.5535448316949655e-05, "loss": 0.6461, "num_input_tokens_seen": 7095584, "step": 12230 }, { "epoch": 1.8223115877271372, "grad_norm": 0.7947168350219727, "learning_rate": 4.555406613047364e-05, "loss": 0.7401, "num_input_tokens_seen": 7098464, "step": 12235 }, { "epoch": 1.8230563002680964, "grad_norm": 0.9473564624786377, "learning_rate": 4.557268394399762e-05, "loss": 0.6636, "num_input_tokens_seen": 7101440, "step": 12240 }, { "epoch": 1.8238010128090556, "grad_norm": 0.8547927737236023, "learning_rate": 4.5591301757521596e-05, "loss": 0.6124, "num_input_tokens_seen": 7104224, "step": 12245 }, { "epoch": 1.8245457253500148, "grad_norm": 1.0168125629425049, "learning_rate": 4.560991957104558e-05, "loss": 0.7311, "num_input_tokens_seen": 7107488, "step": 12250 }, { "epoch": 1.825290437890974, "grad_norm": 0.7847484946250916, "learning_rate": 4.562853738456956e-05, "loss": 0.741, "num_input_tokens_seen": 7110176, "step": 12255 }, { "epoch": 1.8260351504319332, "grad_norm": 0.9097045660018921, "learning_rate": 4.564715519809354e-05, "loss": 0.5854, "num_input_tokens_seen": 7113056, "step": 12260 }, { "epoch": 1.8267798629728924, "grad_norm": 0.6919212937355042, "learning_rate": 4.5665773011617516e-05, "loss": 0.6724, "num_input_tokens_seen": 7115840, "step": 12265 }, { "epoch": 1.8275245755138516, "grad_norm": 0.627202033996582, "learning_rate": 4.56843908251415e-05, "loss": 0.6846, "num_input_tokens_seen": 7118880, "step": 12270 }, { "epoch": 1.8282692880548108, "grad_norm": 0.775134265422821, "learning_rate": 4.570300863866548e-05, "loss": 0.6323, "num_input_tokens_seen": 7121664, "step": 12275 }, { "epoch": 1.82901400059577, "grad_norm": 1.086731195449829, "learning_rate": 4.572162645218946e-05, "loss": 0.6828, "num_input_tokens_seen": 7124480, "step": 12280 }, { "epoch": 1.8297587131367292, "grad_norm": 0.5597050786018372, "learning_rate": 4.5740244265713436e-05, "loss": 0.6171, "num_input_tokens_seen": 7127424, "step": 12285 }, { "epoch": 1.8305034256776884, "grad_norm": 1.5113780498504639, "learning_rate": 4.575886207923742e-05, "loss": 0.8619, "num_input_tokens_seen": 7130208, "step": 12290 }, { "epoch": 1.8312481382186476, "grad_norm": 1.4172663688659668, "learning_rate": 4.57774798927614e-05, "loss": 0.7113, "num_input_tokens_seen": 7133344, "step": 12295 }, { "epoch": 1.8319928507596068, "grad_norm": 0.8717410564422607, "learning_rate": 4.579609770628537e-05, "loss": 0.7655, "num_input_tokens_seen": 7136416, "step": 12300 }, { "epoch": 1.832737563300566, "grad_norm": 0.9791768789291382, "learning_rate": 4.5814715519809356e-05, "loss": 0.5898, "num_input_tokens_seen": 7139168, "step": 12305 }, { "epoch": 1.8334822758415252, "grad_norm": 0.6361991167068481, "learning_rate": 4.5833333333333334e-05, "loss": 0.6667, "num_input_tokens_seen": 7142080, "step": 12310 }, { "epoch": 1.8342269883824844, "grad_norm": 1.0757333040237427, "learning_rate": 4.585195114685731e-05, "loss": 0.7658, "num_input_tokens_seen": 7144832, "step": 12315 }, { "epoch": 1.8349717009234436, "grad_norm": 0.7916536331176758, "learning_rate": 4.58705689603813e-05, "loss": 0.7442, "num_input_tokens_seen": 7147840, "step": 12320 }, { "epoch": 1.8357164134644028, "grad_norm": 0.6561915278434753, "learning_rate": 4.5889186773905275e-05, "loss": 0.5878, "num_input_tokens_seen": 7150912, "step": 12325 }, { "epoch": 1.836461126005362, "grad_norm": 0.9334366321563721, "learning_rate": 4.5907804587429254e-05, "loss": 0.713, "num_input_tokens_seen": 7153952, "step": 12330 }, { "epoch": 1.8372058385463212, "grad_norm": 0.5546029806137085, "learning_rate": 4.592642240095323e-05, "loss": 0.5619, "num_input_tokens_seen": 7156864, "step": 12335 }, { "epoch": 1.8379505510872804, "grad_norm": 0.7918701171875, "learning_rate": 4.594504021447722e-05, "loss": 0.816, "num_input_tokens_seen": 7159872, "step": 12340 }, { "epoch": 1.8386952636282397, "grad_norm": 0.9461214542388916, "learning_rate": 4.5963658028001195e-05, "loss": 0.6464, "num_input_tokens_seen": 7162944, "step": 12345 }, { "epoch": 1.8394399761691989, "grad_norm": 0.6981583833694458, "learning_rate": 4.598227584152517e-05, "loss": 0.6956, "num_input_tokens_seen": 7165952, "step": 12350 }, { "epoch": 1.8401846887101578, "grad_norm": 0.91587895154953, "learning_rate": 4.600089365504915e-05, "loss": 0.6596, "num_input_tokens_seen": 7168768, "step": 12355 }, { "epoch": 1.840929401251117, "grad_norm": 0.9248529076576233, "learning_rate": 4.6019511468573136e-05, "loss": 0.6463, "num_input_tokens_seen": 7171744, "step": 12360 }, { "epoch": 1.8416741137920762, "grad_norm": 1.2173190116882324, "learning_rate": 4.6038129282097115e-05, "loss": 0.7451, "num_input_tokens_seen": 7174816, "step": 12365 }, { "epoch": 1.8424188263330354, "grad_norm": 0.9964344501495361, "learning_rate": 4.605674709562109e-05, "loss": 0.6688, "num_input_tokens_seen": 7177792, "step": 12370 }, { "epoch": 1.8431635388739946, "grad_norm": 0.800915002822876, "learning_rate": 4.607536490914507e-05, "loss": 0.675, "num_input_tokens_seen": 7180704, "step": 12375 }, { "epoch": 1.8439082514149538, "grad_norm": 0.8201249241828918, "learning_rate": 4.6093982722669056e-05, "loss": 0.6652, "num_input_tokens_seen": 7183648, "step": 12380 }, { "epoch": 1.844652963955913, "grad_norm": 0.5758357644081116, "learning_rate": 4.611260053619303e-05, "loss": 0.6578, "num_input_tokens_seen": 7186592, "step": 12385 }, { "epoch": 1.8453976764968723, "grad_norm": 0.678438127040863, "learning_rate": 4.6131218349717006e-05, "loss": 0.5755, "num_input_tokens_seen": 7189280, "step": 12390 }, { "epoch": 1.8461423890378312, "grad_norm": 0.6192798614501953, "learning_rate": 4.614983616324099e-05, "loss": 0.7286, "num_input_tokens_seen": 7192160, "step": 12395 }, { "epoch": 1.8468871015787904, "grad_norm": 0.9221042394638062, "learning_rate": 4.616845397676497e-05, "loss": 0.7131, "num_input_tokens_seen": 7195232, "step": 12400 }, { "epoch": 1.8476318141197496, "grad_norm": 1.0354880094528198, "learning_rate": 4.618707179028895e-05, "loss": 0.6582, "num_input_tokens_seen": 7198112, "step": 12405 }, { "epoch": 1.8483765266607088, "grad_norm": 0.5568985342979431, "learning_rate": 4.620568960381293e-05, "loss": 0.5637, "num_input_tokens_seen": 7201056, "step": 12410 }, { "epoch": 1.849121239201668, "grad_norm": 0.8988638520240784, "learning_rate": 4.622430741733691e-05, "loss": 0.825, "num_input_tokens_seen": 7203968, "step": 12415 }, { "epoch": 1.8498659517426272, "grad_norm": 1.2308546304702759, "learning_rate": 4.624292523086089e-05, "loss": 0.6017, "num_input_tokens_seen": 7206976, "step": 12420 }, { "epoch": 1.8506106642835864, "grad_norm": 0.7551759481430054, "learning_rate": 4.626154304438487e-05, "loss": 0.777, "num_input_tokens_seen": 7209920, "step": 12425 }, { "epoch": 1.8513553768245457, "grad_norm": 1.165971040725708, "learning_rate": 4.628016085790885e-05, "loss": 0.6327, "num_input_tokens_seen": 7212640, "step": 12430 }, { "epoch": 1.8521000893655049, "grad_norm": 1.730883240699768, "learning_rate": 4.629877867143283e-05, "loss": 0.7515, "num_input_tokens_seen": 7215328, "step": 12435 }, { "epoch": 1.852844801906464, "grad_norm": 0.5885474681854248, "learning_rate": 4.631739648495681e-05, "loss": 0.7128, "num_input_tokens_seen": 7218080, "step": 12440 }, { "epoch": 1.8535895144474233, "grad_norm": 1.3926681280136108, "learning_rate": 4.633601429848079e-05, "loss": 0.7977, "num_input_tokens_seen": 7220832, "step": 12445 }, { "epoch": 1.8543342269883825, "grad_norm": 1.0116342306137085, "learning_rate": 4.635463211200477e-05, "loss": 0.6835, "num_input_tokens_seen": 7223680, "step": 12450 }, { "epoch": 1.8550789395293417, "grad_norm": 0.9595067501068115, "learning_rate": 4.637324992552875e-05, "loss": 0.7287, "num_input_tokens_seen": 7226368, "step": 12455 }, { "epoch": 1.8558236520703009, "grad_norm": 0.8287784457206726, "learning_rate": 4.639186773905273e-05, "loss": 0.6863, "num_input_tokens_seen": 7229120, "step": 12460 }, { "epoch": 1.85656836461126, "grad_norm": 0.7753048539161682, "learning_rate": 4.641048555257671e-05, "loss": 0.7073, "num_input_tokens_seen": 7232000, "step": 12465 }, { "epoch": 1.8573130771522193, "grad_norm": 1.0776258707046509, "learning_rate": 4.6429103366100685e-05, "loss": 0.7145, "num_input_tokens_seen": 7234784, "step": 12470 }, { "epoch": 1.8580577896931785, "grad_norm": 1.7774971723556519, "learning_rate": 4.644772117962466e-05, "loss": 0.7753, "num_input_tokens_seen": 7237632, "step": 12475 }, { "epoch": 1.8588025022341377, "grad_norm": 0.6508864164352417, "learning_rate": 4.646633899314865e-05, "loss": 0.8742, "num_input_tokens_seen": 7240608, "step": 12480 }, { "epoch": 1.8595472147750969, "grad_norm": 1.0727119445800781, "learning_rate": 4.6484956806672626e-05, "loss": 0.6684, "num_input_tokens_seen": 7243392, "step": 12485 }, { "epoch": 1.860291927316056, "grad_norm": 0.9367815256118774, "learning_rate": 4.6503574620196604e-05, "loss": 0.7816, "num_input_tokens_seen": 7246432, "step": 12490 }, { "epoch": 1.8610366398570153, "grad_norm": 0.8881452083587646, "learning_rate": 4.652219243372058e-05, "loss": 0.7003, "num_input_tokens_seen": 7249120, "step": 12495 }, { "epoch": 1.8617813523979745, "grad_norm": 0.9602224826812744, "learning_rate": 4.654081024724457e-05, "loss": 0.6121, "num_input_tokens_seen": 7251872, "step": 12500 }, { "epoch": 1.8625260649389337, "grad_norm": 1.6457324028015137, "learning_rate": 4.6559428060768546e-05, "loss": 0.7009, "num_input_tokens_seen": 7254752, "step": 12505 }, { "epoch": 1.863270777479893, "grad_norm": 1.1445468664169312, "learning_rate": 4.6578045874292524e-05, "loss": 0.7452, "num_input_tokens_seen": 7257536, "step": 12510 }, { "epoch": 1.864015490020852, "grad_norm": 1.1517760753631592, "learning_rate": 4.65966636878165e-05, "loss": 0.8186, "num_input_tokens_seen": 7260576, "step": 12515 }, { "epoch": 1.8647602025618113, "grad_norm": 2.2648212909698486, "learning_rate": 4.661528150134049e-05, "loss": 0.7689, "num_input_tokens_seen": 7263424, "step": 12520 }, { "epoch": 1.8655049151027703, "grad_norm": 1.0418720245361328, "learning_rate": 4.6633899314864466e-05, "loss": 0.7527, "num_input_tokens_seen": 7266208, "step": 12525 }, { "epoch": 1.8662496276437295, "grad_norm": 0.9994491934776306, "learning_rate": 4.6652517128388444e-05, "loss": 0.7654, "num_input_tokens_seen": 7269248, "step": 12530 }, { "epoch": 1.8669943401846887, "grad_norm": 0.7716771364212036, "learning_rate": 4.667113494191243e-05, "loss": 0.6923, "num_input_tokens_seen": 7272320, "step": 12535 }, { "epoch": 1.8677390527256479, "grad_norm": 1.1706199645996094, "learning_rate": 4.668975275543641e-05, "loss": 0.7332, "num_input_tokens_seen": 7275552, "step": 12540 }, { "epoch": 1.868483765266607, "grad_norm": 0.8682060837745667, "learning_rate": 4.6708370568960385e-05, "loss": 0.8264, "num_input_tokens_seen": 7278432, "step": 12545 }, { "epoch": 1.8692284778075663, "grad_norm": 0.891865611076355, "learning_rate": 4.6726988382484364e-05, "loss": 0.8124, "num_input_tokens_seen": 7281024, "step": 12550 }, { "epoch": 1.8699731903485255, "grad_norm": 0.9358736276626587, "learning_rate": 4.674560619600835e-05, "loss": 0.6974, "num_input_tokens_seen": 7284032, "step": 12555 }, { "epoch": 1.8707179028894847, "grad_norm": 1.2675795555114746, "learning_rate": 4.676422400953232e-05, "loss": 0.5692, "num_input_tokens_seen": 7287424, "step": 12560 }, { "epoch": 1.871462615430444, "grad_norm": 0.7295622825622559, "learning_rate": 4.67828418230563e-05, "loss": 0.6798, "num_input_tokens_seen": 7290272, "step": 12565 }, { "epoch": 1.8722073279714029, "grad_norm": 0.7299432754516602, "learning_rate": 4.680145963658028e-05, "loss": 0.5263, "num_input_tokens_seen": 7292864, "step": 12570 }, { "epoch": 1.872952040512362, "grad_norm": 1.4334185123443604, "learning_rate": 4.682007745010426e-05, "loss": 0.7215, "num_input_tokens_seen": 7295680, "step": 12575 }, { "epoch": 1.8736967530533213, "grad_norm": 0.8163166642189026, "learning_rate": 4.683869526362824e-05, "loss": 0.7268, "num_input_tokens_seen": 7298656, "step": 12580 }, { "epoch": 1.8744414655942805, "grad_norm": 1.080801248550415, "learning_rate": 4.685731307715222e-05, "loss": 0.7807, "num_input_tokens_seen": 7301408, "step": 12585 }, { "epoch": 1.8751861781352397, "grad_norm": 0.9119265675544739, "learning_rate": 4.68759308906762e-05, "loss": 0.7133, "num_input_tokens_seen": 7304128, "step": 12590 }, { "epoch": 1.875930890676199, "grad_norm": 0.6557957530021667, "learning_rate": 4.689454870420018e-05, "loss": 0.7717, "num_input_tokens_seen": 7306976, "step": 12595 }, { "epoch": 1.876675603217158, "grad_norm": 0.6299301385879517, "learning_rate": 4.691316651772416e-05, "loss": 0.458, "num_input_tokens_seen": 7309504, "step": 12600 }, { "epoch": 1.8774203157581173, "grad_norm": 1.5607342720031738, "learning_rate": 4.693178433124814e-05, "loss": 0.6849, "num_input_tokens_seen": 7312256, "step": 12605 }, { "epoch": 1.8781650282990765, "grad_norm": 0.8622545599937439, "learning_rate": 4.695040214477212e-05, "loss": 0.5746, "num_input_tokens_seen": 7315040, "step": 12610 }, { "epoch": 1.8789097408400357, "grad_norm": 0.9875202178955078, "learning_rate": 4.69690199582961e-05, "loss": 0.718, "num_input_tokens_seen": 7318016, "step": 12615 }, { "epoch": 1.879654453380995, "grad_norm": 0.835345983505249, "learning_rate": 4.698763777182008e-05, "loss": 0.7192, "num_input_tokens_seen": 7320608, "step": 12620 }, { "epoch": 1.880399165921954, "grad_norm": 0.6988547444343567, "learning_rate": 4.7006255585344064e-05, "loss": 0.7498, "num_input_tokens_seen": 7323520, "step": 12625 }, { "epoch": 1.8811438784629133, "grad_norm": 0.8316541910171509, "learning_rate": 4.702487339886804e-05, "loss": 0.6702, "num_input_tokens_seen": 7326816, "step": 12630 }, { "epoch": 1.8818885910038725, "grad_norm": 0.9641820192337036, "learning_rate": 4.704349121239202e-05, "loss": 0.6547, "num_input_tokens_seen": 7329824, "step": 12635 }, { "epoch": 1.8826333035448317, "grad_norm": 0.981949508190155, "learning_rate": 4.7062109025916e-05, "loss": 0.7256, "num_input_tokens_seen": 7332736, "step": 12640 }, { "epoch": 1.883378016085791, "grad_norm": 1.8227678537368774, "learning_rate": 4.708072683943998e-05, "loss": 0.7349, "num_input_tokens_seen": 7335584, "step": 12645 }, { "epoch": 1.8841227286267501, "grad_norm": 0.8059225082397461, "learning_rate": 4.7099344652963955e-05, "loss": 0.6879, "num_input_tokens_seen": 7338432, "step": 12650 }, { "epoch": 1.8848674411677093, "grad_norm": 0.9345542192459106, "learning_rate": 4.7117962466487934e-05, "loss": 0.5697, "num_input_tokens_seen": 7341344, "step": 12655 }, { "epoch": 1.8856121537086685, "grad_norm": 0.8949432373046875, "learning_rate": 4.713658028001192e-05, "loss": 0.7183, "num_input_tokens_seen": 7344512, "step": 12660 }, { "epoch": 1.8863568662496277, "grad_norm": 0.9980221390724182, "learning_rate": 4.71551980935359e-05, "loss": 0.8255, "num_input_tokens_seen": 7347264, "step": 12665 }, { "epoch": 1.887101578790587, "grad_norm": 0.6309382915496826, "learning_rate": 4.7173815907059875e-05, "loss": 0.684, "num_input_tokens_seen": 7350336, "step": 12670 }, { "epoch": 1.8878462913315461, "grad_norm": 0.8006239533424377, "learning_rate": 4.719243372058385e-05, "loss": 0.7273, "num_input_tokens_seen": 7353216, "step": 12675 }, { "epoch": 1.8885910038725053, "grad_norm": 0.7686233520507812, "learning_rate": 4.721105153410784e-05, "loss": 0.5756, "num_input_tokens_seen": 7355968, "step": 12680 }, { "epoch": 1.8893357164134645, "grad_norm": 0.9029635787010193, "learning_rate": 4.7229669347631816e-05, "loss": 0.7195, "num_input_tokens_seen": 7358912, "step": 12685 }, { "epoch": 1.8900804289544237, "grad_norm": 0.7976394295692444, "learning_rate": 4.7248287161155795e-05, "loss": 0.8386, "num_input_tokens_seen": 7361664, "step": 12690 }, { "epoch": 1.890825141495383, "grad_norm": 0.9768561124801636, "learning_rate": 4.726690497467978e-05, "loss": 0.7494, "num_input_tokens_seen": 7364704, "step": 12695 }, { "epoch": 1.891569854036342, "grad_norm": 0.7660055756568909, "learning_rate": 4.728552278820376e-05, "loss": 0.7881, "num_input_tokens_seen": 7367424, "step": 12700 }, { "epoch": 1.8923145665773011, "grad_norm": 0.8261312246322632, "learning_rate": 4.7304140601727736e-05, "loss": 0.778, "num_input_tokens_seen": 7370432, "step": 12705 }, { "epoch": 1.8930592791182603, "grad_norm": 1.1734493970870972, "learning_rate": 4.7322758415251714e-05, "loss": 0.7077, "num_input_tokens_seen": 7373152, "step": 12710 }, { "epoch": 1.8938039916592195, "grad_norm": 0.8612189888954163, "learning_rate": 4.73413762287757e-05, "loss": 0.6475, "num_input_tokens_seen": 7375872, "step": 12715 }, { "epoch": 1.8945487042001787, "grad_norm": 0.9793077707290649, "learning_rate": 4.735999404229968e-05, "loss": 0.7448, "num_input_tokens_seen": 7378624, "step": 12720 }, { "epoch": 1.895293416741138, "grad_norm": 0.7390219569206238, "learning_rate": 4.7378611855823656e-05, "loss": 0.7497, "num_input_tokens_seen": 7381632, "step": 12725 }, { "epoch": 1.8960381292820971, "grad_norm": 0.8885179162025452, "learning_rate": 4.7397229669347634e-05, "loss": 0.7574, "num_input_tokens_seen": 7384832, "step": 12730 }, { "epoch": 1.8967828418230563, "grad_norm": 0.6751461029052734, "learning_rate": 4.741584748287161e-05, "loss": 0.6149, "num_input_tokens_seen": 7387808, "step": 12735 }, { "epoch": 1.8975275543640155, "grad_norm": 0.8557964563369751, "learning_rate": 4.743446529639559e-05, "loss": 0.6784, "num_input_tokens_seen": 7390752, "step": 12740 }, { "epoch": 1.8982722669049745, "grad_norm": 0.7901074290275574, "learning_rate": 4.745308310991957e-05, "loss": 0.5811, "num_input_tokens_seen": 7393408, "step": 12745 }, { "epoch": 1.8990169794459337, "grad_norm": 1.1402249336242676, "learning_rate": 4.7471700923443554e-05, "loss": 0.6514, "num_input_tokens_seen": 7396448, "step": 12750 }, { "epoch": 1.899761691986893, "grad_norm": 0.5890709161758423, "learning_rate": 4.749031873696753e-05, "loss": 0.7769, "num_input_tokens_seen": 7399584, "step": 12755 }, { "epoch": 1.9005064045278521, "grad_norm": 0.8991829752922058, "learning_rate": 4.750893655049151e-05, "loss": 0.7577, "num_input_tokens_seen": 7402400, "step": 12760 }, { "epoch": 1.9012511170688113, "grad_norm": 0.7484419941902161, "learning_rate": 4.752755436401549e-05, "loss": 0.6467, "num_input_tokens_seen": 7405280, "step": 12765 }, { "epoch": 1.9019958296097705, "grad_norm": 0.9092236757278442, "learning_rate": 4.7546172177539474e-05, "loss": 0.784, "num_input_tokens_seen": 7408160, "step": 12770 }, { "epoch": 1.9027405421507297, "grad_norm": 1.0463697910308838, "learning_rate": 4.756478999106345e-05, "loss": 0.815, "num_input_tokens_seen": 7411136, "step": 12775 }, { "epoch": 1.903485254691689, "grad_norm": 0.9562498927116394, "learning_rate": 4.758340780458743e-05, "loss": 0.534, "num_input_tokens_seen": 7413984, "step": 12780 }, { "epoch": 1.9042299672326481, "grad_norm": 1.0864017009735107, "learning_rate": 4.7602025618111415e-05, "loss": 0.7391, "num_input_tokens_seen": 7416960, "step": 12785 }, { "epoch": 1.9049746797736073, "grad_norm": 0.9218994975090027, "learning_rate": 4.762064343163539e-05, "loss": 0.6308, "num_input_tokens_seen": 7419776, "step": 12790 }, { "epoch": 1.9057193923145666, "grad_norm": 0.7449970245361328, "learning_rate": 4.763926124515937e-05, "loss": 0.8136, "num_input_tokens_seen": 7422752, "step": 12795 }, { "epoch": 1.9064641048555258, "grad_norm": 0.6413530707359314, "learning_rate": 4.765787905868335e-05, "loss": 0.599, "num_input_tokens_seen": 7425664, "step": 12800 }, { "epoch": 1.907208817396485, "grad_norm": 0.7456724047660828, "learning_rate": 4.7676496872207335e-05, "loss": 0.7326, "num_input_tokens_seen": 7428448, "step": 12805 }, { "epoch": 1.9079535299374442, "grad_norm": 0.7000278830528259, "learning_rate": 4.769511468573131e-05, "loss": 0.6212, "num_input_tokens_seen": 7431200, "step": 12810 }, { "epoch": 1.9086982424784034, "grad_norm": 0.867206871509552, "learning_rate": 4.7713732499255284e-05, "loss": 0.5922, "num_input_tokens_seen": 7434144, "step": 12815 }, { "epoch": 1.9094429550193626, "grad_norm": 0.8733346462249756, "learning_rate": 4.773235031277927e-05, "loss": 0.7897, "num_input_tokens_seen": 7437056, "step": 12820 }, { "epoch": 1.9101876675603218, "grad_norm": 0.8449758887290955, "learning_rate": 4.775096812630325e-05, "loss": 0.7015, "num_input_tokens_seen": 7439648, "step": 12825 }, { "epoch": 1.910932380101281, "grad_norm": 0.7884416580200195, "learning_rate": 4.7769585939827226e-05, "loss": 0.7582, "num_input_tokens_seen": 7442272, "step": 12830 }, { "epoch": 1.9116770926422402, "grad_norm": 1.0369923114776611, "learning_rate": 4.7788203753351204e-05, "loss": 0.7277, "num_input_tokens_seen": 7445152, "step": 12835 }, { "epoch": 1.9124218051831994, "grad_norm": 1.2022593021392822, "learning_rate": 4.780682156687519e-05, "loss": 0.6898, "num_input_tokens_seen": 7448192, "step": 12840 }, { "epoch": 1.9131665177241586, "grad_norm": 1.1473653316497803, "learning_rate": 4.782543938039917e-05, "loss": 0.6909, "num_input_tokens_seen": 7450976, "step": 12845 }, { "epoch": 1.9139112302651178, "grad_norm": 1.2394509315490723, "learning_rate": 4.7844057193923146e-05, "loss": 0.7456, "num_input_tokens_seen": 7454048, "step": 12850 }, { "epoch": 1.914655942806077, "grad_norm": 0.578166127204895, "learning_rate": 4.786267500744713e-05, "loss": 0.4917, "num_input_tokens_seen": 7457248, "step": 12855 }, { "epoch": 1.9154006553470362, "grad_norm": 0.8551971316337585, "learning_rate": 4.788129282097111e-05, "loss": 0.7059, "num_input_tokens_seen": 7460192, "step": 12860 }, { "epoch": 1.9161453678879954, "grad_norm": 0.8354623317718506, "learning_rate": 4.789991063449509e-05, "loss": 0.6855, "num_input_tokens_seen": 7463008, "step": 12865 }, { "epoch": 1.9168900804289544, "grad_norm": 0.9834239482879639, "learning_rate": 4.7918528448019065e-05, "loss": 0.6057, "num_input_tokens_seen": 7465952, "step": 12870 }, { "epoch": 1.9176347929699136, "grad_norm": 1.019208312034607, "learning_rate": 4.793714626154305e-05, "loss": 0.6701, "num_input_tokens_seen": 7468864, "step": 12875 }, { "epoch": 1.9183795055108728, "grad_norm": 1.246898889541626, "learning_rate": 4.795576407506703e-05, "loss": 0.7759, "num_input_tokens_seen": 7472032, "step": 12880 }, { "epoch": 1.919124218051832, "grad_norm": 1.3877652883529663, "learning_rate": 4.797438188859101e-05, "loss": 0.674, "num_input_tokens_seen": 7474912, "step": 12885 }, { "epoch": 1.9198689305927912, "grad_norm": 1.0178009271621704, "learning_rate": 4.7992999702114985e-05, "loss": 0.7227, "num_input_tokens_seen": 7477568, "step": 12890 }, { "epoch": 1.9206136431337504, "grad_norm": 0.6288526654243469, "learning_rate": 4.801161751563897e-05, "loss": 0.7162, "num_input_tokens_seen": 7480256, "step": 12895 }, { "epoch": 1.9213583556747096, "grad_norm": 0.6615798473358154, "learning_rate": 4.803023532916295e-05, "loss": 0.5813, "num_input_tokens_seen": 7483072, "step": 12900 }, { "epoch": 1.9221030682156688, "grad_norm": 1.2011891603469849, "learning_rate": 4.804885314268692e-05, "loss": 0.8241, "num_input_tokens_seen": 7486016, "step": 12905 }, { "epoch": 1.922847780756628, "grad_norm": 0.9854448437690735, "learning_rate": 4.8067470956210905e-05, "loss": 0.7079, "num_input_tokens_seen": 7488864, "step": 12910 }, { "epoch": 1.923592493297587, "grad_norm": 0.797900915145874, "learning_rate": 4.808608876973488e-05, "loss": 0.5877, "num_input_tokens_seen": 7491808, "step": 12915 }, { "epoch": 1.9243372058385462, "grad_norm": 0.8189070224761963, "learning_rate": 4.810470658325886e-05, "loss": 0.6527, "num_input_tokens_seen": 7495072, "step": 12920 }, { "epoch": 1.9250819183795054, "grad_norm": 1.1699292659759521, "learning_rate": 4.8123324396782846e-05, "loss": 0.8023, "num_input_tokens_seen": 7497984, "step": 12925 }, { "epoch": 1.9258266309204646, "grad_norm": 0.6605051159858704, "learning_rate": 4.8141942210306824e-05, "loss": 0.7329, "num_input_tokens_seen": 7500736, "step": 12930 }, { "epoch": 1.9265713434614238, "grad_norm": 0.8068364262580872, "learning_rate": 4.81605600238308e-05, "loss": 0.8584, "num_input_tokens_seen": 7503520, "step": 12935 }, { "epoch": 1.927316056002383, "grad_norm": 1.2180078029632568, "learning_rate": 4.817917783735478e-05, "loss": 0.7116, "num_input_tokens_seen": 7506304, "step": 12940 }, { "epoch": 1.9280607685433422, "grad_norm": 0.9437804818153381, "learning_rate": 4.8197795650878766e-05, "loss": 0.6362, "num_input_tokens_seen": 7509248, "step": 12945 }, { "epoch": 1.9288054810843014, "grad_norm": 0.8607161641120911, "learning_rate": 4.8216413464402744e-05, "loss": 0.8678, "num_input_tokens_seen": 7512224, "step": 12950 }, { "epoch": 1.9295501936252606, "grad_norm": 0.9511206150054932, "learning_rate": 4.823503127792672e-05, "loss": 0.7803, "num_input_tokens_seen": 7515328, "step": 12955 }, { "epoch": 1.9302949061662198, "grad_norm": 0.6946478486061096, "learning_rate": 4.82536490914507e-05, "loss": 0.5573, "num_input_tokens_seen": 7517984, "step": 12960 }, { "epoch": 1.931039618707179, "grad_norm": 0.9083018898963928, "learning_rate": 4.8272266904974686e-05, "loss": 0.5843, "num_input_tokens_seen": 7520864, "step": 12965 }, { "epoch": 1.9317843312481382, "grad_norm": 1.7669565677642822, "learning_rate": 4.8290884718498664e-05, "loss": 0.7382, "num_input_tokens_seen": 7524000, "step": 12970 }, { "epoch": 1.9325290437890974, "grad_norm": 1.8113945722579956, "learning_rate": 4.830950253202264e-05, "loss": 0.8348, "num_input_tokens_seen": 7526688, "step": 12975 }, { "epoch": 1.9332737563300566, "grad_norm": 0.6187591552734375, "learning_rate": 4.832812034554662e-05, "loss": 0.791, "num_input_tokens_seen": 7529440, "step": 12980 }, { "epoch": 1.9340184688710158, "grad_norm": 0.5293627977371216, "learning_rate": 4.8346738159070605e-05, "loss": 0.7549, "num_input_tokens_seen": 7532448, "step": 12985 }, { "epoch": 1.934763181411975, "grad_norm": 0.5262786746025085, "learning_rate": 4.836535597259458e-05, "loss": 0.7593, "num_input_tokens_seen": 7535328, "step": 12990 }, { "epoch": 1.9355078939529342, "grad_norm": 0.6614857912063599, "learning_rate": 4.8383973786118555e-05, "loss": 0.78, "num_input_tokens_seen": 7537984, "step": 12995 }, { "epoch": 1.9362526064938934, "grad_norm": 0.7007845640182495, "learning_rate": 4.840259159964254e-05, "loss": 0.5586, "num_input_tokens_seen": 7540928, "step": 13000 }, { "epoch": 1.9369973190348526, "grad_norm": 0.6588127613067627, "learning_rate": 4.842120941316652e-05, "loss": 0.747, "num_input_tokens_seen": 7543680, "step": 13005 }, { "epoch": 1.9377420315758118, "grad_norm": 0.9259496331214905, "learning_rate": 4.8439827226690496e-05, "loss": 0.6895, "num_input_tokens_seen": 7546720, "step": 13010 }, { "epoch": 1.938486744116771, "grad_norm": 0.6415332555770874, "learning_rate": 4.845844504021448e-05, "loss": 0.7065, "num_input_tokens_seen": 7549920, "step": 13015 }, { "epoch": 1.9392314566577302, "grad_norm": 0.7739807963371277, "learning_rate": 4.847706285373846e-05, "loss": 0.8263, "num_input_tokens_seen": 7552960, "step": 13020 }, { "epoch": 1.9399761691986894, "grad_norm": 1.0940546989440918, "learning_rate": 4.849568066726244e-05, "loss": 0.5693, "num_input_tokens_seen": 7555584, "step": 13025 }, { "epoch": 1.9407208817396486, "grad_norm": 0.9591498374938965, "learning_rate": 4.8514298480786416e-05, "loss": 0.6245, "num_input_tokens_seen": 7558688, "step": 13030 }, { "epoch": 1.9414655942806078, "grad_norm": 0.779329240322113, "learning_rate": 4.85329162943104e-05, "loss": 0.7069, "num_input_tokens_seen": 7561344, "step": 13035 }, { "epoch": 1.942210306821567, "grad_norm": 0.7060886025428772, "learning_rate": 4.855153410783438e-05, "loss": 0.7698, "num_input_tokens_seen": 7564544, "step": 13040 }, { "epoch": 1.942955019362526, "grad_norm": 1.0914361476898193, "learning_rate": 4.857015192135836e-05, "loss": 0.6686, "num_input_tokens_seen": 7567488, "step": 13045 }, { "epoch": 1.9436997319034852, "grad_norm": 0.8300396800041199, "learning_rate": 4.8588769734882336e-05, "loss": 0.7571, "num_input_tokens_seen": 7570048, "step": 13050 }, { "epoch": 1.9444444444444444, "grad_norm": 0.9385725259780884, "learning_rate": 4.860738754840632e-05, "loss": 0.6768, "num_input_tokens_seen": 7572704, "step": 13055 }, { "epoch": 1.9451891569854036, "grad_norm": 0.6740312576293945, "learning_rate": 4.86260053619303e-05, "loss": 0.621, "num_input_tokens_seen": 7575712, "step": 13060 }, { "epoch": 1.9459338695263628, "grad_norm": 1.017906904220581, "learning_rate": 4.864462317545428e-05, "loss": 0.6848, "num_input_tokens_seen": 7578496, "step": 13065 }, { "epoch": 1.946678582067322, "grad_norm": 0.6809269785881042, "learning_rate": 4.866324098897826e-05, "loss": 0.7293, "num_input_tokens_seen": 7581248, "step": 13070 }, { "epoch": 1.9474232946082812, "grad_norm": 0.7586163282394409, "learning_rate": 4.8681858802502234e-05, "loss": 0.6706, "num_input_tokens_seen": 7583712, "step": 13075 }, { "epoch": 1.9481680071492404, "grad_norm": 1.170938491821289, "learning_rate": 4.870047661602621e-05, "loss": 0.7051, "num_input_tokens_seen": 7586336, "step": 13080 }, { "epoch": 1.9489127196901996, "grad_norm": 0.8840734958648682, "learning_rate": 4.87190944295502e-05, "loss": 0.7251, "num_input_tokens_seen": 7589216, "step": 13085 }, { "epoch": 1.9496574322311586, "grad_norm": 1.1164273023605347, "learning_rate": 4.8737712243074175e-05, "loss": 0.7643, "num_input_tokens_seen": 7592128, "step": 13090 }, { "epoch": 1.9504021447721178, "grad_norm": 0.7973006963729858, "learning_rate": 4.8756330056598153e-05, "loss": 0.643, "num_input_tokens_seen": 7594880, "step": 13095 }, { "epoch": 1.951146857313077, "grad_norm": 3.1827754974365234, "learning_rate": 4.877494787012213e-05, "loss": 0.8885, "num_input_tokens_seen": 7597632, "step": 13100 }, { "epoch": 1.9518915698540362, "grad_norm": 1.0221800804138184, "learning_rate": 4.879356568364612e-05, "loss": 0.6363, "num_input_tokens_seen": 7600672, "step": 13105 }, { "epoch": 1.9526362823949954, "grad_norm": 0.7353776693344116, "learning_rate": 4.8812183497170095e-05, "loss": 0.5863, "num_input_tokens_seen": 7603680, "step": 13110 }, { "epoch": 1.9533809949359546, "grad_norm": 1.02915620803833, "learning_rate": 4.883080131069407e-05, "loss": 0.8055, "num_input_tokens_seen": 7606464, "step": 13115 }, { "epoch": 1.9541257074769138, "grad_norm": 0.667409360408783, "learning_rate": 4.884941912421805e-05, "loss": 0.5906, "num_input_tokens_seen": 7609408, "step": 13120 }, { "epoch": 1.954870420017873, "grad_norm": 0.9365502595901489, "learning_rate": 4.8868036937742036e-05, "loss": 0.8487, "num_input_tokens_seen": 7612960, "step": 13125 }, { "epoch": 1.9556151325588322, "grad_norm": 0.8645147681236267, "learning_rate": 4.8886654751266015e-05, "loss": 0.8294, "num_input_tokens_seen": 7615680, "step": 13130 }, { "epoch": 1.9563598450997914, "grad_norm": 0.9164861440658569, "learning_rate": 4.890527256478999e-05, "loss": 0.6995, "num_input_tokens_seen": 7618752, "step": 13135 }, { "epoch": 1.9571045576407506, "grad_norm": 0.8361726999282837, "learning_rate": 4.892389037831398e-05, "loss": 0.7172, "num_input_tokens_seen": 7621472, "step": 13140 }, { "epoch": 1.9578492701817098, "grad_norm": 0.9917513132095337, "learning_rate": 4.8942508191837956e-05, "loss": 0.6718, "num_input_tokens_seen": 7624480, "step": 13145 }, { "epoch": 1.958593982722669, "grad_norm": 0.6749933362007141, "learning_rate": 4.8961126005361934e-05, "loss": 0.7445, "num_input_tokens_seen": 7627360, "step": 13150 }, { "epoch": 1.9593386952636282, "grad_norm": 0.822227418422699, "learning_rate": 4.897974381888591e-05, "loss": 0.5941, "num_input_tokens_seen": 7630336, "step": 13155 }, { "epoch": 1.9600834078045875, "grad_norm": 0.9468845725059509, "learning_rate": 4.89983616324099e-05, "loss": 0.6849, "num_input_tokens_seen": 7632992, "step": 13160 }, { "epoch": 1.9608281203455467, "grad_norm": 0.6110829710960388, "learning_rate": 4.901697944593387e-05, "loss": 0.7288, "num_input_tokens_seen": 7635648, "step": 13165 }, { "epoch": 1.9615728328865059, "grad_norm": 0.7968673706054688, "learning_rate": 4.903559725945785e-05, "loss": 0.457, "num_input_tokens_seen": 7638656, "step": 13170 }, { "epoch": 1.962317545427465, "grad_norm": 0.7092122435569763, "learning_rate": 4.905421507298183e-05, "loss": 0.6215, "num_input_tokens_seen": 7641760, "step": 13175 }, { "epoch": 1.9630622579684243, "grad_norm": 1.0514004230499268, "learning_rate": 4.907283288650581e-05, "loss": 0.6985, "num_input_tokens_seen": 7644448, "step": 13180 }, { "epoch": 1.9638069705093835, "grad_norm": 0.9494042992591858, "learning_rate": 4.909145070002979e-05, "loss": 0.6609, "num_input_tokens_seen": 7647392, "step": 13185 }, { "epoch": 1.9645516830503427, "grad_norm": 0.8077470064163208, "learning_rate": 4.911006851355377e-05, "loss": 0.7159, "num_input_tokens_seen": 7650304, "step": 13190 }, { "epoch": 1.9652963955913019, "grad_norm": 0.6982101202011108, "learning_rate": 4.912868632707775e-05, "loss": 0.599, "num_input_tokens_seen": 7652960, "step": 13195 }, { "epoch": 1.966041108132261, "grad_norm": 0.9678599834442139, "learning_rate": 4.914730414060173e-05, "loss": 0.6697, "num_input_tokens_seen": 7655808, "step": 13200 }, { "epoch": 1.9667858206732203, "grad_norm": 0.9198148846626282, "learning_rate": 4.916592195412571e-05, "loss": 0.6595, "num_input_tokens_seen": 7658560, "step": 13205 }, { "epoch": 1.9675305332141795, "grad_norm": 1.011609435081482, "learning_rate": 4.918453976764969e-05, "loss": 0.688, "num_input_tokens_seen": 7661568, "step": 13210 }, { "epoch": 1.9682752457551387, "grad_norm": 0.7950023412704468, "learning_rate": 4.920315758117367e-05, "loss": 0.6898, "num_input_tokens_seen": 7664352, "step": 13215 }, { "epoch": 1.9690199582960977, "grad_norm": 0.6118741631507874, "learning_rate": 4.922177539469765e-05, "loss": 0.6372, "num_input_tokens_seen": 7667392, "step": 13220 }, { "epoch": 1.9697646708370569, "grad_norm": 0.9153499603271484, "learning_rate": 4.924039320822163e-05, "loss": 0.6853, "num_input_tokens_seen": 7670240, "step": 13225 }, { "epoch": 1.970509383378016, "grad_norm": 0.8265352845191956, "learning_rate": 4.925901102174561e-05, "loss": 0.8685, "num_input_tokens_seen": 7673312, "step": 13230 }, { "epoch": 1.9712540959189753, "grad_norm": 0.9599980115890503, "learning_rate": 4.927762883526959e-05, "loss": 0.7124, "num_input_tokens_seen": 7676448, "step": 13235 }, { "epoch": 1.9719988084599345, "grad_norm": 1.2943845987319946, "learning_rate": 4.929624664879357e-05, "loss": 0.8543, "num_input_tokens_seen": 7679360, "step": 13240 }, { "epoch": 1.9727435210008937, "grad_norm": 0.820090115070343, "learning_rate": 4.931486446231755e-05, "loss": 0.6833, "num_input_tokens_seen": 7682272, "step": 13245 }, { "epoch": 1.9734882335418529, "grad_norm": 0.939619779586792, "learning_rate": 4.9333482275841526e-05, "loss": 0.6756, "num_input_tokens_seen": 7685280, "step": 13250 }, { "epoch": 1.974232946082812, "grad_norm": 1.3748308420181274, "learning_rate": 4.9352100089365504e-05, "loss": 0.6877, "num_input_tokens_seen": 7687968, "step": 13255 }, { "epoch": 1.974977658623771, "grad_norm": 0.934374988079071, "learning_rate": 4.937071790288948e-05, "loss": 0.7409, "num_input_tokens_seen": 7690976, "step": 13260 }, { "epoch": 1.9757223711647303, "grad_norm": 0.7337211966514587, "learning_rate": 4.938933571641347e-05, "loss": 0.6484, "num_input_tokens_seen": 7693856, "step": 13265 }, { "epoch": 1.9764670837056895, "grad_norm": 1.0087651014328003, "learning_rate": 4.9407953529937446e-05, "loss": 0.7267, "num_input_tokens_seen": 7696672, "step": 13270 }, { "epoch": 1.9772117962466487, "grad_norm": 1.1909979581832886, "learning_rate": 4.9426571343461424e-05, "loss": 0.6135, "num_input_tokens_seen": 7699808, "step": 13275 }, { "epoch": 1.9779565087876079, "grad_norm": 1.3709378242492676, "learning_rate": 4.94451891569854e-05, "loss": 0.7358, "num_input_tokens_seen": 7702720, "step": 13280 }, { "epoch": 1.978701221328567, "grad_norm": 0.6265214085578918, "learning_rate": 4.946380697050939e-05, "loss": 0.6828, "num_input_tokens_seen": 7705504, "step": 13285 }, { "epoch": 1.9794459338695263, "grad_norm": 0.6401516795158386, "learning_rate": 4.9482424784033366e-05, "loss": 0.6238, "num_input_tokens_seen": 7708672, "step": 13290 }, { "epoch": 1.9801906464104855, "grad_norm": 1.0909950733184814, "learning_rate": 4.9501042597557344e-05, "loss": 0.7273, "num_input_tokens_seen": 7711744, "step": 13295 }, { "epoch": 1.9809353589514447, "grad_norm": 0.7156492471694946, "learning_rate": 4.951966041108133e-05, "loss": 0.594, "num_input_tokens_seen": 7714656, "step": 13300 }, { "epoch": 1.9816800714924039, "grad_norm": 0.7294518947601318, "learning_rate": 4.953827822460531e-05, "loss": 0.7312, "num_input_tokens_seen": 7717344, "step": 13305 }, { "epoch": 1.982424784033363, "grad_norm": 0.6857102513313293, "learning_rate": 4.9556896038129285e-05, "loss": 0.6987, "num_input_tokens_seen": 7720384, "step": 13310 }, { "epoch": 1.9831694965743223, "grad_norm": 0.9175675511360168, "learning_rate": 4.9575513851653263e-05, "loss": 0.5804, "num_input_tokens_seen": 7723040, "step": 13315 }, { "epoch": 1.9839142091152815, "grad_norm": 1.2215479612350464, "learning_rate": 4.959413166517725e-05, "loss": 0.678, "num_input_tokens_seen": 7725920, "step": 13320 }, { "epoch": 1.9846589216562407, "grad_norm": 0.9749146699905396, "learning_rate": 4.961274947870123e-05, "loss": 0.7636, "num_input_tokens_seen": 7728960, "step": 13325 }, { "epoch": 1.9854036341972, "grad_norm": 0.7741090059280396, "learning_rate": 4.9631367292225205e-05, "loss": 0.601, "num_input_tokens_seen": 7731616, "step": 13330 }, { "epoch": 1.986148346738159, "grad_norm": 0.6016538143157959, "learning_rate": 4.964998510574918e-05, "loss": 0.6458, "num_input_tokens_seen": 7734688, "step": 13335 }, { "epoch": 1.9868930592791183, "grad_norm": 0.8365470170974731, "learning_rate": 4.966860291927316e-05, "loss": 0.5681, "num_input_tokens_seen": 7737376, "step": 13340 }, { "epoch": 1.9876377718200775, "grad_norm": 1.6762467622756958, "learning_rate": 4.968722073279714e-05, "loss": 0.916, "num_input_tokens_seen": 7740352, "step": 13345 }, { "epoch": 1.9883824843610367, "grad_norm": 0.843856930732727, "learning_rate": 4.970583854632112e-05, "loss": 0.6544, "num_input_tokens_seen": 7743200, "step": 13350 }, { "epoch": 1.989127196901996, "grad_norm": 1.205116868019104, "learning_rate": 4.97244563598451e-05, "loss": 0.7485, "num_input_tokens_seen": 7746144, "step": 13355 }, { "epoch": 1.9898719094429551, "grad_norm": 0.6201145648956299, "learning_rate": 4.974307417336908e-05, "loss": 0.6861, "num_input_tokens_seen": 7749024, "step": 13360 }, { "epoch": 1.9906166219839143, "grad_norm": 0.6050704121589661, "learning_rate": 4.976169198689306e-05, "loss": 0.6745, "num_input_tokens_seen": 7752224, "step": 13365 }, { "epoch": 1.9913613345248735, "grad_norm": 0.9399620294570923, "learning_rate": 4.978030980041704e-05, "loss": 0.5962, "num_input_tokens_seen": 7755008, "step": 13370 }, { "epoch": 1.9921060470658327, "grad_norm": 0.7874840497970581, "learning_rate": 4.979892761394102e-05, "loss": 0.6498, "num_input_tokens_seen": 7757728, "step": 13375 }, { "epoch": 1.992850759606792, "grad_norm": 0.8696102499961853, "learning_rate": 4.9817545427465e-05, "loss": 0.7277, "num_input_tokens_seen": 7760640, "step": 13380 }, { "epoch": 1.9935954721477511, "grad_norm": 0.7981179356575012, "learning_rate": 4.983616324098898e-05, "loss": 0.9428, "num_input_tokens_seen": 7763552, "step": 13385 }, { "epoch": 1.99434018468871, "grad_norm": 2.2925567626953125, "learning_rate": 4.9854781054512964e-05, "loss": 0.7492, "num_input_tokens_seen": 7766368, "step": 13390 }, { "epoch": 1.9950848972296693, "grad_norm": 0.8000440001487732, "learning_rate": 4.987339886803694e-05, "loss": 0.7332, "num_input_tokens_seen": 7769664, "step": 13395 }, { "epoch": 1.9958296097706285, "grad_norm": 1.0336772203445435, "learning_rate": 4.989201668156092e-05, "loss": 0.7126, "num_input_tokens_seen": 7772704, "step": 13400 }, { "epoch": 1.9965743223115877, "grad_norm": 0.5263153314590454, "learning_rate": 4.99106344950849e-05, "loss": 0.618, "num_input_tokens_seen": 7776032, "step": 13405 }, { "epoch": 1.997319034852547, "grad_norm": 1.4588227272033691, "learning_rate": 4.9929252308608884e-05, "loss": 0.6663, "num_input_tokens_seen": 7779008, "step": 13410 }, { "epoch": 1.9980637473935061, "grad_norm": 1.049464225769043, "learning_rate": 4.994787012213286e-05, "loss": 0.7277, "num_input_tokens_seen": 7781888, "step": 13415 }, { "epoch": 1.9988084599344653, "grad_norm": 0.8828349113464355, "learning_rate": 4.996648793565684e-05, "loss": 0.5905, "num_input_tokens_seen": 7785120, "step": 13420 }, { "epoch": 1.9995531724754245, "grad_norm": 0.7980069518089294, "learning_rate": 4.998510574918082e-05, "loss": 0.7764, "num_input_tokens_seen": 7788096, "step": 13425 }, { "epoch": 2.0, "eval_loss": 0.6897483468055725, "eval_runtime": 47.2048, "eval_samples_per_second": 63.214, "eval_steps_per_second": 15.803, "num_input_tokens_seen": 7789256, "step": 13428 }, { "epoch": 2.0002978850163835, "grad_norm": 0.781358003616333, "learning_rate": 4.999999999155301e-05, "loss": 0.753, "num_input_tokens_seen": 7790536, "step": 13430 }, { "epoch": 2.0010425975573427, "grad_norm": 0.7936080098152161, "learning_rate": 4.9999999695908296e-05, "loss": 0.7116, "num_input_tokens_seen": 7793416, "step": 13435 }, { "epoch": 2.001787310098302, "grad_norm": 1.1606546640396118, "learning_rate": 4.9999998977913995e-05, "loss": 0.6721, "num_input_tokens_seen": 7796296, "step": 13440 }, { "epoch": 2.002532022639261, "grad_norm": 0.7618687152862549, "learning_rate": 4.999999783757012e-05, "loss": 0.7339, "num_input_tokens_seen": 7799144, "step": 13445 }, { "epoch": 2.0032767351802203, "grad_norm": 0.9057413339614868, "learning_rate": 4.999999627487669e-05, "loss": 0.6384, "num_input_tokens_seen": 7802088, "step": 13450 }, { "epoch": 2.0040214477211795, "grad_norm": 1.0917373895645142, "learning_rate": 4.999999428983374e-05, "loss": 0.6638, "num_input_tokens_seen": 7805128, "step": 13455 }, { "epoch": 2.0047661602621387, "grad_norm": 0.752208411693573, "learning_rate": 4.999999188244129e-05, "loss": 0.6618, "num_input_tokens_seen": 7807880, "step": 13460 }, { "epoch": 2.005510872803098, "grad_norm": 0.9600043892860413, "learning_rate": 4.999998905269938e-05, "loss": 0.7168, "num_input_tokens_seen": 7811080, "step": 13465 }, { "epoch": 2.006255585344057, "grad_norm": 0.9677248597145081, "learning_rate": 4.9999985800608076e-05, "loss": 0.6505, "num_input_tokens_seen": 7814120, "step": 13470 }, { "epoch": 2.0070002978850163, "grad_norm": 1.476273536682129, "learning_rate": 4.9999982126167414e-05, "loss": 0.6499, "num_input_tokens_seen": 7816872, "step": 13475 }, { "epoch": 2.0077450104259755, "grad_norm": 2.308891773223877, "learning_rate": 4.9999978029377456e-05, "loss": 0.7721, "num_input_tokens_seen": 7819464, "step": 13480 }, { "epoch": 2.0084897229669347, "grad_norm": 0.6427616477012634, "learning_rate": 4.9999973510238284e-05, "loss": 0.8607, "num_input_tokens_seen": 7822216, "step": 13485 }, { "epoch": 2.009234435507894, "grad_norm": 0.7887071371078491, "learning_rate": 4.999996856874997e-05, "loss": 0.6364, "num_input_tokens_seen": 7825128, "step": 13490 }, { "epoch": 2.009979148048853, "grad_norm": 0.7256363034248352, "learning_rate": 4.99999632049126e-05, "loss": 0.7869, "num_input_tokens_seen": 7828264, "step": 13495 }, { "epoch": 2.0107238605898123, "grad_norm": 0.8430116176605225, "learning_rate": 4.999995741872625e-05, "loss": 0.5821, "num_input_tokens_seen": 7831016, "step": 13500 }, { "epoch": 2.0114685731307715, "grad_norm": 0.8059836030006409, "learning_rate": 4.999995121019103e-05, "loss": 0.6779, "num_input_tokens_seen": 7834056, "step": 13505 }, { "epoch": 2.0122132856717307, "grad_norm": 0.8756328225135803, "learning_rate": 4.999994457930705e-05, "loss": 0.7474, "num_input_tokens_seen": 7836648, "step": 13510 }, { "epoch": 2.01295799821269, "grad_norm": 0.9095890522003174, "learning_rate": 4.999993752607441e-05, "loss": 0.7179, "num_input_tokens_seen": 7839720, "step": 13515 }, { "epoch": 2.013702710753649, "grad_norm": 0.8727270364761353, "learning_rate": 4.999993005049324e-05, "loss": 0.6558, "num_input_tokens_seen": 7842696, "step": 13520 }, { "epoch": 2.0144474232946084, "grad_norm": 0.8035591244697571, "learning_rate": 4.999992215256365e-05, "loss": 0.7071, "num_input_tokens_seen": 7845576, "step": 13525 }, { "epoch": 2.0151921358355676, "grad_norm": 0.779927670955658, "learning_rate": 4.9999913832285796e-05, "loss": 0.6215, "num_input_tokens_seen": 7848200, "step": 13530 }, { "epoch": 2.0159368483765268, "grad_norm": 1.081659197807312, "learning_rate": 4.99999050896598e-05, "loss": 0.6671, "num_input_tokens_seen": 7850856, "step": 13535 }, { "epoch": 2.016681560917486, "grad_norm": 1.0089919567108154, "learning_rate": 4.999989592468582e-05, "loss": 0.802, "num_input_tokens_seen": 7853608, "step": 13540 }, { "epoch": 2.017426273458445, "grad_norm": 1.0821715593338013, "learning_rate": 4.9999886337364004e-05, "loss": 0.7804, "num_input_tokens_seen": 7856552, "step": 13545 }, { "epoch": 2.0181709859994044, "grad_norm": 0.666327953338623, "learning_rate": 4.999987632769452e-05, "loss": 0.6158, "num_input_tokens_seen": 7859496, "step": 13550 }, { "epoch": 2.0189156985403636, "grad_norm": 1.07124662399292, "learning_rate": 4.9999865895677534e-05, "loss": 0.7069, "num_input_tokens_seen": 7862632, "step": 13555 }, { "epoch": 2.0196604110813228, "grad_norm": 2.919970750808716, "learning_rate": 4.999985504131322e-05, "loss": 0.5721, "num_input_tokens_seen": 7865480, "step": 13560 }, { "epoch": 2.020405123622282, "grad_norm": 0.7246355414390564, "learning_rate": 4.999984376460176e-05, "loss": 0.7022, "num_input_tokens_seen": 7868392, "step": 13565 }, { "epoch": 2.021149836163241, "grad_norm": 1.5556658506393433, "learning_rate": 4.999983206554335e-05, "loss": 0.6747, "num_input_tokens_seen": 7871208, "step": 13570 }, { "epoch": 2.0218945487042004, "grad_norm": 0.6773921847343445, "learning_rate": 4.9999819944138194e-05, "loss": 0.6935, "num_input_tokens_seen": 7873896, "step": 13575 }, { "epoch": 2.0226392612451596, "grad_norm": 1.055832028388977, "learning_rate": 4.999980740038648e-05, "loss": 0.7023, "num_input_tokens_seen": 7876616, "step": 13580 }, { "epoch": 2.0233839737861183, "grad_norm": 0.8002442717552185, "learning_rate": 4.9999794434288434e-05, "loss": 0.7902, "num_input_tokens_seen": 7879592, "step": 13585 }, { "epoch": 2.0241286863270775, "grad_norm": 1.2057828903198242, "learning_rate": 4.9999781045844266e-05, "loss": 0.6809, "num_input_tokens_seen": 7882120, "step": 13590 }, { "epoch": 2.0248733988680367, "grad_norm": 0.9021010398864746, "learning_rate": 4.999976723505421e-05, "loss": 0.6807, "num_input_tokens_seen": 7884808, "step": 13595 }, { "epoch": 2.025618111408996, "grad_norm": 1.1989681720733643, "learning_rate": 4.999975300191849e-05, "loss": 0.6476, "num_input_tokens_seen": 7887560, "step": 13600 }, { "epoch": 2.026362823949955, "grad_norm": 0.8136505484580994, "learning_rate": 4.9999738346437355e-05, "loss": 0.5484, "num_input_tokens_seen": 7890248, "step": 13605 }, { "epoch": 2.0271075364909144, "grad_norm": 0.6722564101219177, "learning_rate": 4.9999723268611046e-05, "loss": 0.6456, "num_input_tokens_seen": 7893352, "step": 13610 }, { "epoch": 2.0278522490318736, "grad_norm": 0.6414330005645752, "learning_rate": 4.9999707768439824e-05, "loss": 0.6003, "num_input_tokens_seen": 7896136, "step": 13615 }, { "epoch": 2.0285969615728328, "grad_norm": 0.7381497025489807, "learning_rate": 4.999969184592395e-05, "loss": 0.7316, "num_input_tokens_seen": 7899112, "step": 13620 }, { "epoch": 2.029341674113792, "grad_norm": 0.7764710187911987, "learning_rate": 4.999967550106368e-05, "loss": 0.717, "num_input_tokens_seen": 7902120, "step": 13625 }, { "epoch": 2.030086386654751, "grad_norm": 0.7216230630874634, "learning_rate": 4.999965873385931e-05, "loss": 0.7592, "num_input_tokens_seen": 7905384, "step": 13630 }, { "epoch": 2.0308310991957104, "grad_norm": 0.8301064372062683, "learning_rate": 4.999964154431112e-05, "loss": 0.6658, "num_input_tokens_seen": 7908616, "step": 13635 }, { "epoch": 2.0315758117366696, "grad_norm": 0.9956339597702026, "learning_rate": 4.999962393241938e-05, "loss": 0.5606, "num_input_tokens_seen": 7911688, "step": 13640 }, { "epoch": 2.0323205242776288, "grad_norm": 0.8741719722747803, "learning_rate": 4.999960589818441e-05, "loss": 0.6696, "num_input_tokens_seen": 7914664, "step": 13645 }, { "epoch": 2.033065236818588, "grad_norm": 0.8949389457702637, "learning_rate": 4.999958744160651e-05, "loss": 0.6723, "num_input_tokens_seen": 7917800, "step": 13650 }, { "epoch": 2.033809949359547, "grad_norm": 0.9853014349937439, "learning_rate": 4.9999568562685986e-05, "loss": 0.6055, "num_input_tokens_seen": 7920776, "step": 13655 }, { "epoch": 2.0345546619005064, "grad_norm": 0.8853422403335571, "learning_rate": 4.999954926142316e-05, "loss": 0.5986, "num_input_tokens_seen": 7923592, "step": 13660 }, { "epoch": 2.0352993744414656, "grad_norm": 0.788967490196228, "learning_rate": 4.999952953781836e-05, "loss": 0.596, "num_input_tokens_seen": 7926632, "step": 13665 }, { "epoch": 2.036044086982425, "grad_norm": 0.9945287704467773, "learning_rate": 4.9999509391871905e-05, "loss": 0.7038, "num_input_tokens_seen": 7929384, "step": 13670 }, { "epoch": 2.036788799523384, "grad_norm": 0.7898186445236206, "learning_rate": 4.999948882358416e-05, "loss": 0.656, "num_input_tokens_seen": 7932200, "step": 13675 }, { "epoch": 2.037533512064343, "grad_norm": 1.0582208633422852, "learning_rate": 4.9999467832955454e-05, "loss": 0.5045, "num_input_tokens_seen": 7935080, "step": 13680 }, { "epoch": 2.0382782246053024, "grad_norm": 2.5242855548858643, "learning_rate": 4.999944641998615e-05, "loss": 0.7574, "num_input_tokens_seen": 7937960, "step": 13685 }, { "epoch": 2.0390229371462616, "grad_norm": 0.8086226582527161, "learning_rate": 4.99994245846766e-05, "loss": 0.6287, "num_input_tokens_seen": 7941032, "step": 13690 }, { "epoch": 2.039767649687221, "grad_norm": 0.8474138379096985, "learning_rate": 4.999940232702719e-05, "loss": 0.6742, "num_input_tokens_seen": 7943976, "step": 13695 }, { "epoch": 2.04051236222818, "grad_norm": 1.2865244150161743, "learning_rate": 4.999937964703828e-05, "loss": 0.7215, "num_input_tokens_seen": 7946952, "step": 13700 }, { "epoch": 2.041257074769139, "grad_norm": 1.1076037883758545, "learning_rate": 4.999935654471026e-05, "loss": 0.7691, "num_input_tokens_seen": 7949736, "step": 13705 }, { "epoch": 2.0420017873100984, "grad_norm": 0.8727818131446838, "learning_rate": 4.999933302004352e-05, "loss": 0.6565, "num_input_tokens_seen": 7952648, "step": 13710 }, { "epoch": 2.0427464998510576, "grad_norm": 0.6897568702697754, "learning_rate": 4.999930907303846e-05, "loss": 0.7725, "num_input_tokens_seen": 7955912, "step": 13715 }, { "epoch": 2.043491212392017, "grad_norm": 1.0472431182861328, "learning_rate": 4.9999284703695474e-05, "loss": 0.7242, "num_input_tokens_seen": 7958856, "step": 13720 }, { "epoch": 2.044235924932976, "grad_norm": 1.2237650156021118, "learning_rate": 4.9999259912014986e-05, "loss": 0.7045, "num_input_tokens_seen": 7961992, "step": 13725 }, { "epoch": 2.044980637473935, "grad_norm": 1.1827303171157837, "learning_rate": 4.999923469799741e-05, "loss": 0.7627, "num_input_tokens_seen": 7964712, "step": 13730 }, { "epoch": 2.0457253500148944, "grad_norm": 1.2068946361541748, "learning_rate": 4.9999209061643174e-05, "loss": 0.7632, "num_input_tokens_seen": 7967656, "step": 13735 }, { "epoch": 2.0464700625558536, "grad_norm": 0.6935456991195679, "learning_rate": 4.99991830029527e-05, "loss": 0.6563, "num_input_tokens_seen": 7970696, "step": 13740 }, { "epoch": 2.047214775096813, "grad_norm": 0.9436448216438293, "learning_rate": 4.999915652192645e-05, "loss": 0.55, "num_input_tokens_seen": 7973192, "step": 13745 }, { "epoch": 2.047959487637772, "grad_norm": 0.7706229090690613, "learning_rate": 4.9999129618564844e-05, "loss": 0.7755, "num_input_tokens_seen": 7976168, "step": 13750 }, { "epoch": 2.0487042001787312, "grad_norm": 1.0508173704147339, "learning_rate": 4.999910229286836e-05, "loss": 0.765, "num_input_tokens_seen": 7979240, "step": 13755 }, { "epoch": 2.04944891271969, "grad_norm": 1.2686840295791626, "learning_rate": 4.999907454483745e-05, "loss": 0.7583, "num_input_tokens_seen": 7982088, "step": 13760 }, { "epoch": 2.050193625260649, "grad_norm": 0.9222975373268127, "learning_rate": 4.999904637447258e-05, "loss": 0.6151, "num_input_tokens_seen": 7984904, "step": 13765 }, { "epoch": 2.0509383378016084, "grad_norm": 0.8530503511428833, "learning_rate": 4.9999017781774236e-05, "loss": 0.7099, "num_input_tokens_seen": 7987976, "step": 13770 }, { "epoch": 2.0516830503425676, "grad_norm": 0.7590308785438538, "learning_rate": 4.999898876674289e-05, "loss": 0.6828, "num_input_tokens_seen": 7990664, "step": 13775 }, { "epoch": 2.052427762883527, "grad_norm": 1.1951844692230225, "learning_rate": 4.9998959329379036e-05, "loss": 0.6037, "num_input_tokens_seen": 7994856, "step": 13780 }, { "epoch": 2.053172475424486, "grad_norm": 1.209998607635498, "learning_rate": 4.999892946968318e-05, "loss": 0.7826, "num_input_tokens_seen": 7997608, "step": 13785 }, { "epoch": 2.053917187965445, "grad_norm": 1.084031581878662, "learning_rate": 4.999889918765581e-05, "loss": 0.6693, "num_input_tokens_seen": 8000488, "step": 13790 }, { "epoch": 2.0546619005064044, "grad_norm": 1.038774847984314, "learning_rate": 4.999886848329744e-05, "loss": 0.7338, "num_input_tokens_seen": 8003368, "step": 13795 }, { "epoch": 2.0554066130473636, "grad_norm": 0.9456515312194824, "learning_rate": 4.999883735660861e-05, "loss": 0.721, "num_input_tokens_seen": 8006280, "step": 13800 }, { "epoch": 2.056151325588323, "grad_norm": 0.8091086745262146, "learning_rate": 4.999880580758982e-05, "loss": 0.5658, "num_input_tokens_seen": 8009160, "step": 13805 }, { "epoch": 2.056896038129282, "grad_norm": 0.8996577262878418, "learning_rate": 4.999877383624162e-05, "loss": 0.5193, "num_input_tokens_seen": 8012168, "step": 13810 }, { "epoch": 2.057640750670241, "grad_norm": 1.0514745712280273, "learning_rate": 4.9998741442564535e-05, "loss": 0.5741, "num_input_tokens_seen": 8015016, "step": 13815 }, { "epoch": 2.0583854632112004, "grad_norm": 1.2382721900939941, "learning_rate": 4.999870862655913e-05, "loss": 0.786, "num_input_tokens_seen": 8017864, "step": 13820 }, { "epoch": 2.0591301757521596, "grad_norm": 0.8680140972137451, "learning_rate": 4.999867538822595e-05, "loss": 0.5746, "num_input_tokens_seen": 8020936, "step": 13825 }, { "epoch": 2.059874888293119, "grad_norm": 2.186542510986328, "learning_rate": 4.999864172756554e-05, "loss": 0.6949, "num_input_tokens_seen": 8023592, "step": 13830 }, { "epoch": 2.060619600834078, "grad_norm": 1.2783092260360718, "learning_rate": 4.9998607644578505e-05, "loss": 0.6773, "num_input_tokens_seen": 8026792, "step": 13835 }, { "epoch": 2.0613643133750372, "grad_norm": 1.1631189584732056, "learning_rate": 4.9998573139265395e-05, "loss": 0.7374, "num_input_tokens_seen": 8029448, "step": 13840 }, { "epoch": 2.0621090259159964, "grad_norm": 0.985467255115509, "learning_rate": 4.99985382116268e-05, "loss": 0.5964, "num_input_tokens_seen": 8032424, "step": 13845 }, { "epoch": 2.0628537384569556, "grad_norm": 0.7660601735115051, "learning_rate": 4.999850286166331e-05, "loss": 0.7819, "num_input_tokens_seen": 8035080, "step": 13850 }, { "epoch": 2.063598450997915, "grad_norm": 1.0745368003845215, "learning_rate": 4.999846708937552e-05, "loss": 0.6737, "num_input_tokens_seen": 8038056, "step": 13855 }, { "epoch": 2.064343163538874, "grad_norm": 1.0692787170410156, "learning_rate": 4.9998430894764034e-05, "loss": 0.6759, "num_input_tokens_seen": 8040840, "step": 13860 }, { "epoch": 2.0650878760798332, "grad_norm": 1.0525692701339722, "learning_rate": 4.9998394277829466e-05, "loss": 0.5589, "num_input_tokens_seen": 8043912, "step": 13865 }, { "epoch": 2.0658325886207924, "grad_norm": 0.9888657331466675, "learning_rate": 4.9998357238572435e-05, "loss": 0.6816, "num_input_tokens_seen": 8046824, "step": 13870 }, { "epoch": 2.0665773011617516, "grad_norm": 2.015977621078491, "learning_rate": 4.9998319776993566e-05, "loss": 0.5856, "num_input_tokens_seen": 8049704, "step": 13875 }, { "epoch": 2.067322013702711, "grad_norm": 2.4703564643859863, "learning_rate": 4.999828189309349e-05, "loss": 0.8314, "num_input_tokens_seen": 8052680, "step": 13880 }, { "epoch": 2.06806672624367, "grad_norm": 0.9708427786827087, "learning_rate": 4.999824358687285e-05, "loss": 0.6957, "num_input_tokens_seen": 8055400, "step": 13885 }, { "epoch": 2.0688114387846293, "grad_norm": 0.8818408250808716, "learning_rate": 4.9998204858332295e-05, "loss": 0.7082, "num_input_tokens_seen": 8058312, "step": 13890 }, { "epoch": 2.0695561513255885, "grad_norm": 0.6480093598365784, "learning_rate": 4.999816570747247e-05, "loss": 0.6261, "num_input_tokens_seen": 8061160, "step": 13895 }, { "epoch": 2.0703008638665477, "grad_norm": 0.8973095417022705, "learning_rate": 4.999812613429404e-05, "loss": 0.7212, "num_input_tokens_seen": 8064424, "step": 13900 }, { "epoch": 2.071045576407507, "grad_norm": 1.2148207426071167, "learning_rate": 4.9998086138797685e-05, "loss": 0.6108, "num_input_tokens_seen": 8067336, "step": 13905 }, { "epoch": 2.071790288948466, "grad_norm": 0.6152958273887634, "learning_rate": 4.9998045720984065e-05, "loss": 0.7156, "num_input_tokens_seen": 8070600, "step": 13910 }, { "epoch": 2.0725350014894253, "grad_norm": 1.0945188999176025, "learning_rate": 4.999800488085388e-05, "loss": 0.6015, "num_input_tokens_seen": 8073352, "step": 13915 }, { "epoch": 2.0732797140303845, "grad_norm": 0.8340425491333008, "learning_rate": 4.9997963618407794e-05, "loss": 0.5602, "num_input_tokens_seen": 8076456, "step": 13920 }, { "epoch": 2.0740244265713437, "grad_norm": 0.6213623285293579, "learning_rate": 4.999792193364653e-05, "loss": 0.6627, "num_input_tokens_seen": 8079208, "step": 13925 }, { "epoch": 2.074769139112303, "grad_norm": 1.2120047807693481, "learning_rate": 4.999787982657077e-05, "loss": 0.6518, "num_input_tokens_seen": 8082280, "step": 13930 }, { "epoch": 2.0755138516532616, "grad_norm": 1.033565878868103, "learning_rate": 4.999783729718125e-05, "loss": 0.7132, "num_input_tokens_seen": 8084968, "step": 13935 }, { "epoch": 2.076258564194221, "grad_norm": 2.477494478225708, "learning_rate": 4.999779434547867e-05, "loss": 0.6583, "num_input_tokens_seen": 8088008, "step": 13940 }, { "epoch": 2.07700327673518, "grad_norm": 1.2260037660598755, "learning_rate": 4.999775097146376e-05, "loss": 0.6914, "num_input_tokens_seen": 8091048, "step": 13945 }, { "epoch": 2.0777479892761392, "grad_norm": 0.6763800382614136, "learning_rate": 4.999770717513726e-05, "loss": 0.555, "num_input_tokens_seen": 8093992, "step": 13950 }, { "epoch": 2.0784927018170984, "grad_norm": 1.0301458835601807, "learning_rate": 4.99976629564999e-05, "loss": 0.694, "num_input_tokens_seen": 8096648, "step": 13955 }, { "epoch": 2.0792374143580576, "grad_norm": 0.6932746767997742, "learning_rate": 4.999761831555243e-05, "loss": 0.7456, "num_input_tokens_seen": 8099624, "step": 13960 }, { "epoch": 2.079982126899017, "grad_norm": 1.443286657333374, "learning_rate": 4.9997573252295604e-05, "loss": 0.7452, "num_input_tokens_seen": 8102504, "step": 13965 }, { "epoch": 2.080726839439976, "grad_norm": 1.014744520187378, "learning_rate": 4.999752776673018e-05, "loss": 0.7994, "num_input_tokens_seen": 8105224, "step": 13970 }, { "epoch": 2.0814715519809353, "grad_norm": 0.848796546459198, "learning_rate": 4.999748185885694e-05, "loss": 0.7342, "num_input_tokens_seen": 8108040, "step": 13975 }, { "epoch": 2.0822162645218945, "grad_norm": 0.78908771276474, "learning_rate": 4.999743552867665e-05, "loss": 0.7017, "num_input_tokens_seen": 8111144, "step": 13980 }, { "epoch": 2.0829609770628537, "grad_norm": 1.618872046470642, "learning_rate": 4.999738877619009e-05, "loss": 0.7237, "num_input_tokens_seen": 8114056, "step": 13985 }, { "epoch": 2.083705689603813, "grad_norm": 1.3259631395339966, "learning_rate": 4.999734160139805e-05, "loss": 0.6987, "num_input_tokens_seen": 8116744, "step": 13990 }, { "epoch": 2.084450402144772, "grad_norm": 0.9047940373420715, "learning_rate": 4.999729400430133e-05, "loss": 0.6725, "num_input_tokens_seen": 8120008, "step": 13995 }, { "epoch": 2.0851951146857313, "grad_norm": 1.0768945217132568, "learning_rate": 4.9997245984900745e-05, "loss": 0.7714, "num_input_tokens_seen": 8122664, "step": 14000 }, { "epoch": 2.0859398272266905, "grad_norm": 0.7673446536064148, "learning_rate": 4.999719754319708e-05, "loss": 0.6101, "num_input_tokens_seen": 8125832, "step": 14005 }, { "epoch": 2.0866845397676497, "grad_norm": 0.8249602913856506, "learning_rate": 4.9997148679191174e-05, "loss": 0.7543, "num_input_tokens_seen": 8129032, "step": 14010 }, { "epoch": 2.087429252308609, "grad_norm": 0.6622809171676636, "learning_rate": 4.999709939288385e-05, "loss": 0.7104, "num_input_tokens_seen": 8131880, "step": 14015 }, { "epoch": 2.088173964849568, "grad_norm": 2.3044543266296387, "learning_rate": 4.9997049684275936e-05, "loss": 0.8863, "num_input_tokens_seen": 8134632, "step": 14020 }, { "epoch": 2.0889186773905273, "grad_norm": 1.1743719577789307, "learning_rate": 4.999699955336827e-05, "loss": 0.778, "num_input_tokens_seen": 8137512, "step": 14025 }, { "epoch": 2.0896633899314865, "grad_norm": 0.6285878419876099, "learning_rate": 4.9996949000161705e-05, "loss": 0.7038, "num_input_tokens_seen": 8140424, "step": 14030 }, { "epoch": 2.0904081024724457, "grad_norm": 0.7569646835327148, "learning_rate": 4.99968980246571e-05, "loss": 0.5985, "num_input_tokens_seen": 8143272, "step": 14035 }, { "epoch": 2.091152815013405, "grad_norm": 1.1024630069732666, "learning_rate": 4.99968466268553e-05, "loss": 0.8972, "num_input_tokens_seen": 8146376, "step": 14040 }, { "epoch": 2.091897527554364, "grad_norm": 0.8918119668960571, "learning_rate": 4.999679480675719e-05, "loss": 0.7203, "num_input_tokens_seen": 8149224, "step": 14045 }, { "epoch": 2.0926422400953233, "grad_norm": 0.8781414031982422, "learning_rate": 4.9996742564363616e-05, "loss": 0.6096, "num_input_tokens_seen": 8152040, "step": 14050 }, { "epoch": 2.0933869526362825, "grad_norm": 0.895294189453125, "learning_rate": 4.99966898996755e-05, "loss": 0.5712, "num_input_tokens_seen": 8154856, "step": 14055 }, { "epoch": 2.0941316651772417, "grad_norm": 2.6895227432250977, "learning_rate": 4.999663681269372e-05, "loss": 0.6962, "num_input_tokens_seen": 8157512, "step": 14060 }, { "epoch": 2.094876377718201, "grad_norm": 0.7176175713539124, "learning_rate": 4.999658330341915e-05, "loss": 0.7323, "num_input_tokens_seen": 8160584, "step": 14065 }, { "epoch": 2.09562109025916, "grad_norm": 1.235487461090088, "learning_rate": 4.9996529371852716e-05, "loss": 0.774, "num_input_tokens_seen": 8163624, "step": 14070 }, { "epoch": 2.0963658028001193, "grad_norm": 1.1662235260009766, "learning_rate": 4.999647501799532e-05, "loss": 0.7323, "num_input_tokens_seen": 8166280, "step": 14075 }, { "epoch": 2.0971105153410785, "grad_norm": 0.9634692072868347, "learning_rate": 4.99964202418479e-05, "loss": 0.7149, "num_input_tokens_seen": 8169096, "step": 14080 }, { "epoch": 2.0978552278820377, "grad_norm": 1.124605655670166, "learning_rate": 4.999636504341135e-05, "loss": 0.586, "num_input_tokens_seen": 8171944, "step": 14085 }, { "epoch": 2.098599940422997, "grad_norm": 1.0600569248199463, "learning_rate": 4.9996309422686624e-05, "loss": 0.6382, "num_input_tokens_seen": 8175240, "step": 14090 }, { "epoch": 2.099344652963956, "grad_norm": 0.7742431163787842, "learning_rate": 4.999625337967465e-05, "loss": 0.735, "num_input_tokens_seen": 8177896, "step": 14095 }, { "epoch": 2.1000893655049153, "grad_norm": 0.9868897199630737, "learning_rate": 4.999619691437638e-05, "loss": 0.6816, "num_input_tokens_seen": 8180456, "step": 14100 }, { "epoch": 2.1008340780458745, "grad_norm": 0.7563786506652832, "learning_rate": 4.9996140026792774e-05, "loss": 0.7606, "num_input_tokens_seen": 8183272, "step": 14105 }, { "epoch": 2.1015787905868333, "grad_norm": 1.0451757907867432, "learning_rate": 4.999608271692479e-05, "loss": 0.6441, "num_input_tokens_seen": 8186280, "step": 14110 }, { "epoch": 2.1023235031277925, "grad_norm": 0.7352535724639893, "learning_rate": 4.999602498477338e-05, "loss": 0.7504, "num_input_tokens_seen": 8189288, "step": 14115 }, { "epoch": 2.1030682156687517, "grad_norm": 1.0997343063354492, "learning_rate": 4.999596683033955e-05, "loss": 0.7172, "num_input_tokens_seen": 8192136, "step": 14120 }, { "epoch": 2.103812928209711, "grad_norm": 0.8686883449554443, "learning_rate": 4.999590825362425e-05, "loss": 0.7749, "num_input_tokens_seen": 8194952, "step": 14125 }, { "epoch": 2.10455764075067, "grad_norm": 1.3895186185836792, "learning_rate": 4.999584925462849e-05, "loss": 0.7309, "num_input_tokens_seen": 8197736, "step": 14130 }, { "epoch": 2.1053023532916293, "grad_norm": 0.584004282951355, "learning_rate": 4.999578983335327e-05, "loss": 0.5866, "num_input_tokens_seen": 8200584, "step": 14135 }, { "epoch": 2.1060470658325885, "grad_norm": 1.1239488124847412, "learning_rate": 4.999572998979957e-05, "loss": 0.7618, "num_input_tokens_seen": 8203560, "step": 14140 }, { "epoch": 2.1067917783735477, "grad_norm": 1.4468508958816528, "learning_rate": 4.9995669723968426e-05, "loss": 0.7171, "num_input_tokens_seen": 8206344, "step": 14145 }, { "epoch": 2.107536490914507, "grad_norm": 0.844158947467804, "learning_rate": 4.9995609035860845e-05, "loss": 0.7242, "num_input_tokens_seen": 8209128, "step": 14150 }, { "epoch": 2.108281203455466, "grad_norm": 0.7111420631408691, "learning_rate": 4.9995547925477856e-05, "loss": 0.5842, "num_input_tokens_seen": 8211976, "step": 14155 }, { "epoch": 2.1090259159964253, "grad_norm": 0.6787667870521545, "learning_rate": 4.999548639282048e-05, "loss": 0.6105, "num_input_tokens_seen": 8214920, "step": 14160 }, { "epoch": 2.1097706285373845, "grad_norm": 0.7597256898880005, "learning_rate": 4.9995424437889774e-05, "loss": 0.781, "num_input_tokens_seen": 8217992, "step": 14165 }, { "epoch": 2.1105153410783437, "grad_norm": 1.4111273288726807, "learning_rate": 4.999536206068678e-05, "loss": 0.728, "num_input_tokens_seen": 8221000, "step": 14170 }, { "epoch": 2.111260053619303, "grad_norm": 0.647269070148468, "learning_rate": 4.9995299261212536e-05, "loss": 0.8033, "num_input_tokens_seen": 8224008, "step": 14175 }, { "epoch": 2.112004766160262, "grad_norm": 0.8460150957107544, "learning_rate": 4.999523603946812e-05, "loss": 0.6737, "num_input_tokens_seen": 8226760, "step": 14180 }, { "epoch": 2.1127494787012213, "grad_norm": 1.3106358051300049, "learning_rate": 4.9995172395454606e-05, "loss": 0.8135, "num_input_tokens_seen": 8229768, "step": 14185 }, { "epoch": 2.1134941912421805, "grad_norm": 0.8694080710411072, "learning_rate": 4.999510832917304e-05, "loss": 0.6545, "num_input_tokens_seen": 8232744, "step": 14190 }, { "epoch": 2.1142389037831397, "grad_norm": 0.6571841239929199, "learning_rate": 4.9995043840624536e-05, "loss": 0.7478, "num_input_tokens_seen": 8235880, "step": 14195 }, { "epoch": 2.114983616324099, "grad_norm": 1.3609753847122192, "learning_rate": 4.999497892981017e-05, "loss": 0.6518, "num_input_tokens_seen": 8238920, "step": 14200 }, { "epoch": 2.115728328865058, "grad_norm": 0.750363290309906, "learning_rate": 4.999491359673103e-05, "loss": 0.6832, "num_input_tokens_seen": 8241896, "step": 14205 }, { "epoch": 2.1164730414060173, "grad_norm": 0.9253898859024048, "learning_rate": 4.999484784138823e-05, "loss": 0.779, "num_input_tokens_seen": 8244744, "step": 14210 }, { "epoch": 2.1172177539469765, "grad_norm": 1.0285403728485107, "learning_rate": 4.9994781663782884e-05, "loss": 0.69, "num_input_tokens_seen": 8247752, "step": 14215 }, { "epoch": 2.1179624664879357, "grad_norm": 1.498414158821106, "learning_rate": 4.99947150639161e-05, "loss": 0.7056, "num_input_tokens_seen": 8250536, "step": 14220 }, { "epoch": 2.118707179028895, "grad_norm": 1.0385723114013672, "learning_rate": 4.9994648041789016e-05, "loss": 0.6602, "num_input_tokens_seen": 8253480, "step": 14225 }, { "epoch": 2.119451891569854, "grad_norm": 0.6499761939048767, "learning_rate": 4.999458059740275e-05, "loss": 0.7881, "num_input_tokens_seen": 8256328, "step": 14230 }, { "epoch": 2.1201966041108133, "grad_norm": 2.061556577682495, "learning_rate": 4.9994512730758454e-05, "loss": 0.7745, "num_input_tokens_seen": 8259176, "step": 14235 }, { "epoch": 2.1209413166517725, "grad_norm": 0.7147626876831055, "learning_rate": 4.999444444185727e-05, "loss": 0.7454, "num_input_tokens_seen": 8261992, "step": 14240 }, { "epoch": 2.1216860291927317, "grad_norm": 0.8630296587944031, "learning_rate": 4.999437573070034e-05, "loss": 0.711, "num_input_tokens_seen": 8265032, "step": 14245 }, { "epoch": 2.122430741733691, "grad_norm": 1.2488863468170166, "learning_rate": 4.999430659728884e-05, "loss": 0.6978, "num_input_tokens_seen": 8268168, "step": 14250 }, { "epoch": 2.12317545427465, "grad_norm": 0.6861488223075867, "learning_rate": 4.9994237041623935e-05, "loss": 0.6227, "num_input_tokens_seen": 8271208, "step": 14255 }, { "epoch": 2.1239201668156094, "grad_norm": 1.1814433336257935, "learning_rate": 4.99941670637068e-05, "loss": 0.6377, "num_input_tokens_seen": 8274184, "step": 14260 }, { "epoch": 2.1246648793565686, "grad_norm": 1.6040265560150146, "learning_rate": 4.999409666353861e-05, "loss": 0.5931, "num_input_tokens_seen": 8277064, "step": 14265 }, { "epoch": 2.1254095918975278, "grad_norm": 0.8232289552688599, "learning_rate": 4.999402584112057e-05, "loss": 0.6249, "num_input_tokens_seen": 8280200, "step": 14270 }, { "epoch": 2.1261543044384865, "grad_norm": 0.5857277512550354, "learning_rate": 4.999395459645385e-05, "loss": 0.7724, "num_input_tokens_seen": 8282888, "step": 14275 }, { "epoch": 2.126899016979446, "grad_norm": 1.1899693012237549, "learning_rate": 4.999388292953968e-05, "loss": 0.7177, "num_input_tokens_seen": 8285704, "step": 14280 }, { "epoch": 2.127643729520405, "grad_norm": 1.005703091621399, "learning_rate": 4.999381084037926e-05, "loss": 0.6832, "num_input_tokens_seen": 8288520, "step": 14285 }, { "epoch": 2.128388442061364, "grad_norm": 1.058830976486206, "learning_rate": 4.999373832897381e-05, "loss": 0.7517, "num_input_tokens_seen": 8291304, "step": 14290 }, { "epoch": 2.1291331546023233, "grad_norm": 0.83868408203125, "learning_rate": 4.9993665395324554e-05, "loss": 0.6623, "num_input_tokens_seen": 8294120, "step": 14295 }, { "epoch": 2.1298778671432825, "grad_norm": 0.794121503829956, "learning_rate": 4.999359203943272e-05, "loss": 0.6209, "num_input_tokens_seen": 8297160, "step": 14300 }, { "epoch": 2.1306225796842417, "grad_norm": 0.8683567047119141, "learning_rate": 4.999351826129955e-05, "loss": 0.6569, "num_input_tokens_seen": 8299944, "step": 14305 }, { "epoch": 2.131367292225201, "grad_norm": 1.1989372968673706, "learning_rate": 4.9993444060926296e-05, "loss": 0.7823, "num_input_tokens_seen": 8302920, "step": 14310 }, { "epoch": 2.13211200476616, "grad_norm": 1.2710000276565552, "learning_rate": 4.9993369438314204e-05, "loss": 0.7838, "num_input_tokens_seen": 8306024, "step": 14315 }, { "epoch": 2.1328567173071193, "grad_norm": 1.9234744310379028, "learning_rate": 4.9993294393464536e-05, "loss": 0.8416, "num_input_tokens_seen": 8308840, "step": 14320 }, { "epoch": 2.1336014298480785, "grad_norm": 1.1849192380905151, "learning_rate": 4.999321892637856e-05, "loss": 0.7815, "num_input_tokens_seen": 8312008, "step": 14325 }, { "epoch": 2.1343461423890377, "grad_norm": 0.5828664302825928, "learning_rate": 4.9993143037057554e-05, "loss": 0.6155, "num_input_tokens_seen": 8314856, "step": 14330 }, { "epoch": 2.135090854929997, "grad_norm": 0.7262634038925171, "learning_rate": 4.99930667255028e-05, "loss": 0.7593, "num_input_tokens_seen": 8317608, "step": 14335 }, { "epoch": 2.135835567470956, "grad_norm": 0.9389261603355408, "learning_rate": 4.999298999171559e-05, "loss": 0.7045, "num_input_tokens_seen": 8320520, "step": 14340 }, { "epoch": 2.1365802800119154, "grad_norm": 1.2700247764587402, "learning_rate": 4.99929128356972e-05, "loss": 0.5704, "num_input_tokens_seen": 8323272, "step": 14345 }, { "epoch": 2.1373249925528746, "grad_norm": 0.7133320569992065, "learning_rate": 4.9992835257448965e-05, "loss": 0.7649, "num_input_tokens_seen": 8326184, "step": 14350 }, { "epoch": 2.1380697050938338, "grad_norm": 1.293096661567688, "learning_rate": 4.999275725697218e-05, "loss": 0.7875, "num_input_tokens_seen": 8329192, "step": 14355 }, { "epoch": 2.138814417634793, "grad_norm": 0.832874059677124, "learning_rate": 4.9992678834268154e-05, "loss": 0.6716, "num_input_tokens_seen": 8332392, "step": 14360 }, { "epoch": 2.139559130175752, "grad_norm": 0.8597717881202698, "learning_rate": 4.999259998933822e-05, "loss": 0.7322, "num_input_tokens_seen": 8335432, "step": 14365 }, { "epoch": 2.1403038427167114, "grad_norm": 0.9701625108718872, "learning_rate": 4.9992520722183714e-05, "loss": 0.8456, "num_input_tokens_seen": 8338216, "step": 14370 }, { "epoch": 2.1410485552576706, "grad_norm": 0.7049578428268433, "learning_rate": 4.999244103280597e-05, "loss": 0.6585, "num_input_tokens_seen": 8341224, "step": 14375 }, { "epoch": 2.1417932677986298, "grad_norm": 1.3437488079071045, "learning_rate": 4.999236092120634e-05, "loss": 0.7828, "num_input_tokens_seen": 8344104, "step": 14380 }, { "epoch": 2.142537980339589, "grad_norm": 0.800451934337616, "learning_rate": 4.999228038738617e-05, "loss": 0.5841, "num_input_tokens_seen": 8346792, "step": 14385 }, { "epoch": 2.143282692880548, "grad_norm": 0.6869890689849854, "learning_rate": 4.999219943134683e-05, "loss": 0.7671, "num_input_tokens_seen": 8349864, "step": 14390 }, { "epoch": 2.1440274054215074, "grad_norm": 1.0786978006362915, "learning_rate": 4.9992118053089675e-05, "loss": 0.7348, "num_input_tokens_seen": 8352904, "step": 14395 }, { "epoch": 2.1447721179624666, "grad_norm": 1.0329140424728394, "learning_rate": 4.999203625261609e-05, "loss": 0.5943, "num_input_tokens_seen": 8355848, "step": 14400 }, { "epoch": 2.145516830503426, "grad_norm": 1.252211332321167, "learning_rate": 4.999195402992745e-05, "loss": 0.7926, "num_input_tokens_seen": 8358792, "step": 14405 }, { "epoch": 2.146261543044385, "grad_norm": 0.740251362323761, "learning_rate": 4.999187138502515e-05, "loss": 0.4988, "num_input_tokens_seen": 8361224, "step": 14410 }, { "epoch": 2.147006255585344, "grad_norm": 1.3409589529037476, "learning_rate": 4.999178831791058e-05, "loss": 0.7879, "num_input_tokens_seen": 8364360, "step": 14415 }, { "epoch": 2.1477509681263034, "grad_norm": 1.1778138875961304, "learning_rate": 4.999170482858515e-05, "loss": 0.5626, "num_input_tokens_seen": 8367016, "step": 14420 }, { "epoch": 2.1484956806672626, "grad_norm": 0.9166885018348694, "learning_rate": 4.999162091705026e-05, "loss": 0.6645, "num_input_tokens_seen": 8369928, "step": 14425 }, { "epoch": 2.149240393208222, "grad_norm": 3.9862568378448486, "learning_rate": 4.9991536583307344e-05, "loss": 0.7481, "num_input_tokens_seen": 8372712, "step": 14430 }, { "epoch": 2.149985105749181, "grad_norm": 1.2745671272277832, "learning_rate": 4.999145182735782e-05, "loss": 0.5461, "num_input_tokens_seen": 8375560, "step": 14435 }, { "epoch": 2.15072981829014, "grad_norm": 0.7754364609718323, "learning_rate": 4.999136664920311e-05, "loss": 0.6014, "num_input_tokens_seen": 8378536, "step": 14440 }, { "epoch": 2.1514745308310994, "grad_norm": 1.1618456840515137, "learning_rate": 4.999128104884466e-05, "loss": 0.5395, "num_input_tokens_seen": 8381288, "step": 14445 }, { "epoch": 2.152219243372058, "grad_norm": 0.7332903146743774, "learning_rate": 4.999119502628392e-05, "loss": 0.673, "num_input_tokens_seen": 8384200, "step": 14450 }, { "epoch": 2.1529639559130174, "grad_norm": 1.066094994544983, "learning_rate": 4.999110858152234e-05, "loss": 0.7204, "num_input_tokens_seen": 8387208, "step": 14455 }, { "epoch": 2.1537086684539766, "grad_norm": 1.6935265064239502, "learning_rate": 4.999102171456138e-05, "loss": 0.7511, "num_input_tokens_seen": 8390280, "step": 14460 }, { "epoch": 2.1544533809949358, "grad_norm": 0.9583768248558044, "learning_rate": 4.999093442540251e-05, "loss": 0.6099, "num_input_tokens_seen": 8393128, "step": 14465 }, { "epoch": 2.155198093535895, "grad_norm": 2.037836790084839, "learning_rate": 4.9990846714047204e-05, "loss": 0.7766, "num_input_tokens_seen": 8396136, "step": 14470 }, { "epoch": 2.155942806076854, "grad_norm": 1.1688644886016846, "learning_rate": 4.9990758580496935e-05, "loss": 0.8235, "num_input_tokens_seen": 8399208, "step": 14475 }, { "epoch": 2.1566875186178134, "grad_norm": 1.1605265140533447, "learning_rate": 4.99906700247532e-05, "loss": 0.7165, "num_input_tokens_seen": 8402152, "step": 14480 }, { "epoch": 2.1574322311587726, "grad_norm": 1.00770103931427, "learning_rate": 4.99905810468175e-05, "loss": 0.6244, "num_input_tokens_seen": 8405064, "step": 14485 }, { "epoch": 2.158176943699732, "grad_norm": 0.8000138998031616, "learning_rate": 4.999049164669133e-05, "loss": 0.7091, "num_input_tokens_seen": 8408808, "step": 14490 }, { "epoch": 2.158921656240691, "grad_norm": 0.8249644041061401, "learning_rate": 4.9990401824376196e-05, "loss": 0.6074, "num_input_tokens_seen": 8411880, "step": 14495 }, { "epoch": 2.15966636878165, "grad_norm": 0.6932563185691833, "learning_rate": 4.999031157987364e-05, "loss": 0.5909, "num_input_tokens_seen": 8414568, "step": 14500 }, { "epoch": 2.1604110813226094, "grad_norm": 0.8147468566894531, "learning_rate": 4.9990220913185146e-05, "loss": 0.635, "num_input_tokens_seen": 8417192, "step": 14505 }, { "epoch": 2.1611557938635686, "grad_norm": 0.8611400127410889, "learning_rate": 4.9990129824312285e-05, "loss": 0.8084, "num_input_tokens_seen": 8420200, "step": 14510 }, { "epoch": 2.161900506404528, "grad_norm": 0.7767787575721741, "learning_rate": 4.9990038313256573e-05, "loss": 0.6508, "num_input_tokens_seen": 8422952, "step": 14515 }, { "epoch": 2.162645218945487, "grad_norm": 1.0069217681884766, "learning_rate": 4.998994638001957e-05, "loss": 0.7127, "num_input_tokens_seen": 8425544, "step": 14520 }, { "epoch": 2.163389931486446, "grad_norm": 1.1843883991241455, "learning_rate": 4.998985402460281e-05, "loss": 0.6881, "num_input_tokens_seen": 8428616, "step": 14525 }, { "epoch": 2.1641346440274054, "grad_norm": 0.8070662617683411, "learning_rate": 4.998976124700787e-05, "loss": 0.6619, "num_input_tokens_seen": 8431592, "step": 14530 }, { "epoch": 2.1648793565683646, "grad_norm": 1.0348213911056519, "learning_rate": 4.9989668047236316e-05, "loss": 0.6252, "num_input_tokens_seen": 8434568, "step": 14535 }, { "epoch": 2.165624069109324, "grad_norm": 1.1013821363449097, "learning_rate": 4.998957442528972e-05, "loss": 0.5875, "num_input_tokens_seen": 8437288, "step": 14540 }, { "epoch": 2.166368781650283, "grad_norm": 0.6135959625244141, "learning_rate": 4.998948038116965e-05, "loss": 0.7547, "num_input_tokens_seen": 8439976, "step": 14545 }, { "epoch": 2.167113494191242, "grad_norm": 1.409737229347229, "learning_rate": 4.9989385914877717e-05, "loss": 0.6623, "num_input_tokens_seen": 8442728, "step": 14550 }, { "epoch": 2.1678582067322014, "grad_norm": 1.3930541276931763, "learning_rate": 4.998929102641551e-05, "loss": 0.5376, "num_input_tokens_seen": 8445608, "step": 14555 }, { "epoch": 2.1686029192731606, "grad_norm": 0.7521124482154846, "learning_rate": 4.998919571578462e-05, "loss": 0.6948, "num_input_tokens_seen": 8448360, "step": 14560 }, { "epoch": 2.16934763181412, "grad_norm": 0.8555070757865906, "learning_rate": 4.998909998298668e-05, "loss": 0.5793, "num_input_tokens_seen": 8451368, "step": 14565 }, { "epoch": 2.170092344355079, "grad_norm": 0.7065128087997437, "learning_rate": 4.998900382802327e-05, "loss": 0.6203, "num_input_tokens_seen": 8453960, "step": 14570 }, { "epoch": 2.1708370568960382, "grad_norm": 0.920613706111908, "learning_rate": 4.9988907250896056e-05, "loss": 0.6438, "num_input_tokens_seen": 8456872, "step": 14575 }, { "epoch": 2.1715817694369974, "grad_norm": 0.7569342851638794, "learning_rate": 4.998881025160665e-05, "loss": 0.719, "num_input_tokens_seen": 8459656, "step": 14580 }, { "epoch": 2.1723264819779566, "grad_norm": 0.5824496150016785, "learning_rate": 4.9988712830156694e-05, "loss": 0.6991, "num_input_tokens_seen": 8462536, "step": 14585 }, { "epoch": 2.173071194518916, "grad_norm": 0.9455119967460632, "learning_rate": 4.998861498654782e-05, "loss": 0.7614, "num_input_tokens_seen": 8465544, "step": 14590 }, { "epoch": 2.173815907059875, "grad_norm": 0.8363032937049866, "learning_rate": 4.9988516720781705e-05, "loss": 0.6081, "num_input_tokens_seen": 8468424, "step": 14595 }, { "epoch": 2.1745606196008342, "grad_norm": 0.8872883319854736, "learning_rate": 4.998841803286e-05, "loss": 0.7391, "num_input_tokens_seen": 8471496, "step": 14600 }, { "epoch": 2.1753053321417934, "grad_norm": 2.1024351119995117, "learning_rate": 4.9988318922784364e-05, "loss": 0.7287, "num_input_tokens_seen": 8474728, "step": 14605 }, { "epoch": 2.1760500446827526, "grad_norm": 0.7932949066162109, "learning_rate": 4.9988219390556466e-05, "loss": 0.7756, "num_input_tokens_seen": 8477320, "step": 14610 }, { "epoch": 2.176794757223712, "grad_norm": 0.7659575939178467, "learning_rate": 4.998811943617801e-05, "loss": 0.6808, "num_input_tokens_seen": 8480136, "step": 14615 }, { "epoch": 2.177539469764671, "grad_norm": 0.6688268780708313, "learning_rate": 4.998801905965067e-05, "loss": 0.6738, "num_input_tokens_seen": 8482952, "step": 14620 }, { "epoch": 2.17828418230563, "grad_norm": 0.6901483535766602, "learning_rate": 4.998791826097615e-05, "loss": 0.7091, "num_input_tokens_seen": 8485736, "step": 14625 }, { "epoch": 2.179028894846589, "grad_norm": 1.0594390630722046, "learning_rate": 4.998781704015614e-05, "loss": 0.6597, "num_input_tokens_seen": 8488360, "step": 14630 }, { "epoch": 2.179773607387548, "grad_norm": 0.830043375492096, "learning_rate": 4.998771539719236e-05, "loss": 0.7512, "num_input_tokens_seen": 8491368, "step": 14635 }, { "epoch": 2.1805183199285074, "grad_norm": 0.9015718698501587, "learning_rate": 4.998761333208652e-05, "loss": 0.7577, "num_input_tokens_seen": 8494120, "step": 14640 }, { "epoch": 2.1812630324694666, "grad_norm": 1.310734510421753, "learning_rate": 4.9987510844840354e-05, "loss": 0.7818, "num_input_tokens_seen": 8497000, "step": 14645 }, { "epoch": 2.182007745010426, "grad_norm": 0.7372516393661499, "learning_rate": 4.998740793545559e-05, "loss": 0.6773, "num_input_tokens_seen": 8499784, "step": 14650 }, { "epoch": 2.182752457551385, "grad_norm": 1.2750040292739868, "learning_rate": 4.998730460393397e-05, "loss": 0.6588, "num_input_tokens_seen": 8502632, "step": 14655 }, { "epoch": 2.1834971700923442, "grad_norm": 0.9834742546081543, "learning_rate": 4.998720085027723e-05, "loss": 0.8546, "num_input_tokens_seen": 8505352, "step": 14660 }, { "epoch": 2.1842418826333034, "grad_norm": 1.160148024559021, "learning_rate": 4.998709667448712e-05, "loss": 0.6152, "num_input_tokens_seen": 8508264, "step": 14665 }, { "epoch": 2.1849865951742626, "grad_norm": 1.245895504951477, "learning_rate": 4.998699207656542e-05, "loss": 0.6302, "num_input_tokens_seen": 8510952, "step": 14670 }, { "epoch": 2.185731307715222, "grad_norm": 1.8426918983459473, "learning_rate": 4.9986887056513874e-05, "loss": 0.75, "num_input_tokens_seen": 8514312, "step": 14675 }, { "epoch": 2.186476020256181, "grad_norm": 1.1644251346588135, "learning_rate": 4.998678161433427e-05, "loss": 0.7555, "num_input_tokens_seen": 8517224, "step": 14680 }, { "epoch": 2.1872207327971402, "grad_norm": 1.3907244205474854, "learning_rate": 4.998667575002839e-05, "loss": 0.6713, "num_input_tokens_seen": 8520008, "step": 14685 }, { "epoch": 2.1879654453380994, "grad_norm": 1.3498294353485107, "learning_rate": 4.998656946359801e-05, "loss": 0.7186, "num_input_tokens_seen": 8522952, "step": 14690 }, { "epoch": 2.1887101578790586, "grad_norm": 0.9029098749160767, "learning_rate": 4.998646275504494e-05, "loss": 0.6973, "num_input_tokens_seen": 8526088, "step": 14695 }, { "epoch": 2.189454870420018, "grad_norm": 1.272087574005127, "learning_rate": 4.998635562437098e-05, "loss": 0.7769, "num_input_tokens_seen": 8529128, "step": 14700 }, { "epoch": 2.190199582960977, "grad_norm": 1.0582362413406372, "learning_rate": 4.9986248071577934e-05, "loss": 0.6481, "num_input_tokens_seen": 8531880, "step": 14705 }, { "epoch": 2.1909442955019363, "grad_norm": 1.0015127658843994, "learning_rate": 4.998614009666762e-05, "loss": 0.6138, "num_input_tokens_seen": 8534888, "step": 14710 }, { "epoch": 2.1916890080428955, "grad_norm": 1.1854511499404907, "learning_rate": 4.9986031699641866e-05, "loss": 0.5748, "num_input_tokens_seen": 8537992, "step": 14715 }, { "epoch": 2.1924337205838547, "grad_norm": 1.2610445022583008, "learning_rate": 4.99859228805025e-05, "loss": 0.6101, "num_input_tokens_seen": 8541032, "step": 14720 }, { "epoch": 2.193178433124814, "grad_norm": 0.9868835210800171, "learning_rate": 4.9985813639251355e-05, "loss": 0.6074, "num_input_tokens_seen": 8543944, "step": 14725 }, { "epoch": 2.193923145665773, "grad_norm": 1.055099606513977, "learning_rate": 4.9985703975890294e-05, "loss": 0.7694, "num_input_tokens_seen": 8546824, "step": 14730 }, { "epoch": 2.1946678582067323, "grad_norm": 1.2075724601745605, "learning_rate": 4.998559389042115e-05, "loss": 0.6372, "num_input_tokens_seen": 8549704, "step": 14735 }, { "epoch": 2.1954125707476915, "grad_norm": 0.6258065700531006, "learning_rate": 4.99854833828458e-05, "loss": 0.7964, "num_input_tokens_seen": 8552488, "step": 14740 }, { "epoch": 2.1961572832886507, "grad_norm": 0.6582643389701843, "learning_rate": 4.998537245316609e-05, "loss": 0.6401, "num_input_tokens_seen": 8555560, "step": 14745 }, { "epoch": 2.19690199582961, "grad_norm": 1.0313637256622314, "learning_rate": 4.998526110138392e-05, "loss": 0.674, "num_input_tokens_seen": 8558760, "step": 14750 }, { "epoch": 2.197646708370569, "grad_norm": 0.7535772323608398, "learning_rate": 4.9985149327501146e-05, "loss": 0.5581, "num_input_tokens_seen": 8561544, "step": 14755 }, { "epoch": 2.1983914209115283, "grad_norm": 0.911817193031311, "learning_rate": 4.998503713151967e-05, "loss": 0.6835, "num_input_tokens_seen": 8564456, "step": 14760 }, { "epoch": 2.1991361334524875, "grad_norm": 0.9574499726295471, "learning_rate": 4.9984924513441397e-05, "loss": 0.6556, "num_input_tokens_seen": 8567016, "step": 14765 }, { "epoch": 2.1998808459934467, "grad_norm": 0.8791858553886414, "learning_rate": 4.9984811473268214e-05, "loss": 0.6841, "num_input_tokens_seen": 8569736, "step": 14770 }, { "epoch": 2.200625558534406, "grad_norm": 1.135452389717102, "learning_rate": 4.998469801100203e-05, "loss": 0.5986, "num_input_tokens_seen": 8572680, "step": 14775 }, { "epoch": 2.201370271075365, "grad_norm": 0.9908974766731262, "learning_rate": 4.998458412664476e-05, "loss": 0.8349, "num_input_tokens_seen": 8575656, "step": 14780 }, { "epoch": 2.2021149836163243, "grad_norm": 1.370729923248291, "learning_rate": 4.9984469820198345e-05, "loss": 0.7895, "num_input_tokens_seen": 8578632, "step": 14785 }, { "epoch": 2.202859696157283, "grad_norm": 1.8412494659423828, "learning_rate": 4.9984355091664705e-05, "loss": 0.6141, "num_input_tokens_seen": 8581512, "step": 14790 }, { "epoch": 2.2036044086982427, "grad_norm": 0.9193658828735352, "learning_rate": 4.9984239941045766e-05, "loss": 0.7526, "num_input_tokens_seen": 8584584, "step": 14795 }, { "epoch": 2.2043491212392015, "grad_norm": 0.9414641261100769, "learning_rate": 4.99841243683435e-05, "loss": 0.6684, "num_input_tokens_seen": 8587688, "step": 14800 }, { "epoch": 2.2050938337801607, "grad_norm": 1.4740227460861206, "learning_rate": 4.998400837355984e-05, "loss": 0.7722, "num_input_tokens_seen": 8590664, "step": 14805 }, { "epoch": 2.20583854632112, "grad_norm": 1.1959967613220215, "learning_rate": 4.998389195669675e-05, "loss": 0.5906, "num_input_tokens_seen": 8593480, "step": 14810 }, { "epoch": 2.206583258862079, "grad_norm": 1.1566054821014404, "learning_rate": 4.998377511775621e-05, "loss": 0.6939, "num_input_tokens_seen": 8596168, "step": 14815 }, { "epoch": 2.2073279714030383, "grad_norm": 0.6998699307441711, "learning_rate": 4.9983657856740165e-05, "loss": 0.5702, "num_input_tokens_seen": 8599048, "step": 14820 }, { "epoch": 2.2080726839439975, "grad_norm": 1.4495024681091309, "learning_rate": 4.9983540173650614e-05, "loss": 0.657, "num_input_tokens_seen": 8601864, "step": 14825 }, { "epoch": 2.2088173964849567, "grad_norm": 0.7621778845787048, "learning_rate": 4.9983422068489546e-05, "loss": 0.641, "num_input_tokens_seen": 8604776, "step": 14830 }, { "epoch": 2.209562109025916, "grad_norm": 0.7907098531723022, "learning_rate": 4.998330354125896e-05, "loss": 0.6871, "num_input_tokens_seen": 8607816, "step": 14835 }, { "epoch": 2.210306821566875, "grad_norm": 0.7958239316940308, "learning_rate": 4.998318459196085e-05, "loss": 0.7329, "num_input_tokens_seen": 8610536, "step": 14840 }, { "epoch": 2.2110515341078343, "grad_norm": 0.9249851703643799, "learning_rate": 4.998306522059723e-05, "loss": 0.6823, "num_input_tokens_seen": 8613480, "step": 14845 }, { "epoch": 2.2117962466487935, "grad_norm": 1.3801376819610596, "learning_rate": 4.9982945427170115e-05, "loss": 0.6729, "num_input_tokens_seen": 8616296, "step": 14850 }, { "epoch": 2.2125409591897527, "grad_norm": 0.9049334526062012, "learning_rate": 4.998282521168153e-05, "loss": 0.8286, "num_input_tokens_seen": 8619464, "step": 14855 }, { "epoch": 2.213285671730712, "grad_norm": 1.0175561904907227, "learning_rate": 4.9982704574133497e-05, "loss": 0.7579, "num_input_tokens_seen": 8622312, "step": 14860 }, { "epoch": 2.214030384271671, "grad_norm": 0.9577460289001465, "learning_rate": 4.998258351452806e-05, "loss": 0.7378, "num_input_tokens_seen": 8625000, "step": 14865 }, { "epoch": 2.2147750968126303, "grad_norm": 1.051164984703064, "learning_rate": 4.998246203286727e-05, "loss": 0.6834, "num_input_tokens_seen": 8628040, "step": 14870 }, { "epoch": 2.2155198093535895, "grad_norm": 1.1066831350326538, "learning_rate": 4.9982340129153185e-05, "loss": 0.819, "num_input_tokens_seen": 8630792, "step": 14875 }, { "epoch": 2.2162645218945487, "grad_norm": 0.7519941926002502, "learning_rate": 4.9982217803387844e-05, "loss": 0.6788, "num_input_tokens_seen": 8633736, "step": 14880 }, { "epoch": 2.217009234435508, "grad_norm": 0.8086631298065186, "learning_rate": 4.998209505557333e-05, "loss": 0.8473, "num_input_tokens_seen": 8636840, "step": 14885 }, { "epoch": 2.217753946976467, "grad_norm": 0.8311619758605957, "learning_rate": 4.99819718857117e-05, "loss": 0.818, "num_input_tokens_seen": 8639816, "step": 14890 }, { "epoch": 2.2184986595174263, "grad_norm": 0.7648004293441772, "learning_rate": 4.998184829380505e-05, "loss": 0.6803, "num_input_tokens_seen": 8642536, "step": 14895 }, { "epoch": 2.2192433720583855, "grad_norm": 0.9153485298156738, "learning_rate": 4.9981724279855466e-05, "loss": 0.61, "num_input_tokens_seen": 8645576, "step": 14900 }, { "epoch": 2.2199880845993447, "grad_norm": 0.7323988676071167, "learning_rate": 4.998159984386504e-05, "loss": 0.6929, "num_input_tokens_seen": 8648712, "step": 14905 }, { "epoch": 2.220732797140304, "grad_norm": 0.9013568758964539, "learning_rate": 4.9981474985835875e-05, "loss": 0.659, "num_input_tokens_seen": 8651976, "step": 14910 }, { "epoch": 2.221477509681263, "grad_norm": 0.7049149870872498, "learning_rate": 4.9981349705770074e-05, "loss": 0.7457, "num_input_tokens_seen": 8654952, "step": 14915 }, { "epoch": 2.2222222222222223, "grad_norm": 0.7562841773033142, "learning_rate": 4.998122400366977e-05, "loss": 0.6798, "num_input_tokens_seen": 8658056, "step": 14920 }, { "epoch": 2.2229669347631815, "grad_norm": 0.7467185258865356, "learning_rate": 4.998109787953708e-05, "loss": 0.662, "num_input_tokens_seen": 8661160, "step": 14925 }, { "epoch": 2.2237116473041407, "grad_norm": 0.8742703795433044, "learning_rate": 4.998097133337412e-05, "loss": 0.6339, "num_input_tokens_seen": 8664040, "step": 14930 }, { "epoch": 2.2244563598451, "grad_norm": 0.8304799795150757, "learning_rate": 4.998084436518303e-05, "loss": 0.7357, "num_input_tokens_seen": 8667016, "step": 14935 }, { "epoch": 2.225201072386059, "grad_norm": 0.48412206768989563, "learning_rate": 4.998071697496598e-05, "loss": 0.5844, "num_input_tokens_seen": 8669928, "step": 14940 }, { "epoch": 2.2259457849270183, "grad_norm": 0.785354733467102, "learning_rate": 4.99805891627251e-05, "loss": 0.5917, "num_input_tokens_seen": 8673032, "step": 14945 }, { "epoch": 2.2266904974679775, "grad_norm": 0.8764787912368774, "learning_rate": 4.998046092846256e-05, "loss": 0.6682, "num_input_tokens_seen": 8675976, "step": 14950 }, { "epoch": 2.2274352100089367, "grad_norm": 0.6960400938987732, "learning_rate": 4.998033227218052e-05, "loss": 0.7684, "num_input_tokens_seen": 8678856, "step": 14955 }, { "epoch": 2.228179922549896, "grad_norm": 1.0618388652801514, "learning_rate": 4.998020319388115e-05, "loss": 0.6305, "num_input_tokens_seen": 8681896, "step": 14960 }, { "epoch": 2.2289246350908547, "grad_norm": 1.7648406028747559, "learning_rate": 4.998007369356664e-05, "loss": 0.778, "num_input_tokens_seen": 8684968, "step": 14965 }, { "epoch": 2.2296693476318143, "grad_norm": 0.719908595085144, "learning_rate": 4.997994377123917e-05, "loss": 0.6366, "num_input_tokens_seen": 8687912, "step": 14970 }, { "epoch": 2.230414060172773, "grad_norm": 0.6907609701156616, "learning_rate": 4.997981342690095e-05, "loss": 0.6694, "num_input_tokens_seen": 8691176, "step": 14975 }, { "epoch": 2.2311587727137323, "grad_norm": 0.6481912732124329, "learning_rate": 4.9979682660554154e-05, "loss": 0.8121, "num_input_tokens_seen": 8694248, "step": 14980 }, { "epoch": 2.2319034852546915, "grad_norm": 0.8719246983528137, "learning_rate": 4.997955147220101e-05, "loss": 0.7036, "num_input_tokens_seen": 8697096, "step": 14985 }, { "epoch": 2.2326481977956507, "grad_norm": 0.8819616436958313, "learning_rate": 4.997941986184375e-05, "loss": 0.5709, "num_input_tokens_seen": 8699880, "step": 14990 }, { "epoch": 2.23339291033661, "grad_norm": 1.8975417613983154, "learning_rate": 4.9979287829484555e-05, "loss": 0.7289, "num_input_tokens_seen": 8702728, "step": 14995 }, { "epoch": 2.234137622877569, "grad_norm": 1.5343226194381714, "learning_rate": 4.99791553751257e-05, "loss": 0.7649, "num_input_tokens_seen": 8705608, "step": 15000 }, { "epoch": 2.2348823354185283, "grad_norm": 0.746708869934082, "learning_rate": 4.997902249876939e-05, "loss": 0.6784, "num_input_tokens_seen": 8708296, "step": 15005 }, { "epoch": 2.2356270479594875, "grad_norm": 1.0421878099441528, "learning_rate": 4.997888920041789e-05, "loss": 0.6405, "num_input_tokens_seen": 8711080, "step": 15010 }, { "epoch": 2.2363717605004467, "grad_norm": 1.663635492324829, "learning_rate": 4.997875548007343e-05, "loss": 0.6382, "num_input_tokens_seen": 8713896, "step": 15015 }, { "epoch": 2.237116473041406, "grad_norm": 0.9139364957809448, "learning_rate": 4.99786213377383e-05, "loss": 0.9024, "num_input_tokens_seen": 8717000, "step": 15020 }, { "epoch": 2.237861185582365, "grad_norm": 1.2832187414169312, "learning_rate": 4.997848677341474e-05, "loss": 0.6929, "num_input_tokens_seen": 8720264, "step": 15025 }, { "epoch": 2.2386058981233243, "grad_norm": 0.6213014721870422, "learning_rate": 4.997835178710504e-05, "loss": 0.6594, "num_input_tokens_seen": 8723272, "step": 15030 }, { "epoch": 2.2393506106642835, "grad_norm": 0.7015401721000671, "learning_rate": 4.997821637881147e-05, "loss": 0.6642, "num_input_tokens_seen": 8725960, "step": 15035 }, { "epoch": 2.2400953232052427, "grad_norm": 0.6075615286827087, "learning_rate": 4.997808054853632e-05, "loss": 0.7895, "num_input_tokens_seen": 8729064, "step": 15040 }, { "epoch": 2.240840035746202, "grad_norm": 0.6196640729904175, "learning_rate": 4.9977944296281895e-05, "loss": 0.7646, "num_input_tokens_seen": 8732040, "step": 15045 }, { "epoch": 2.241584748287161, "grad_norm": 0.9027499556541443, "learning_rate": 4.997780762205047e-05, "loss": 0.5995, "num_input_tokens_seen": 8735080, "step": 15050 }, { "epoch": 2.2423294608281203, "grad_norm": 0.8758822679519653, "learning_rate": 4.997767052584439e-05, "loss": 0.8297, "num_input_tokens_seen": 8737800, "step": 15055 }, { "epoch": 2.2430741733690795, "grad_norm": 0.6708607077598572, "learning_rate": 4.9977533007665944e-05, "loss": 0.7499, "num_input_tokens_seen": 8740648, "step": 15060 }, { "epoch": 2.2438188859100388, "grad_norm": 0.6712687611579895, "learning_rate": 4.9977395067517464e-05, "loss": 0.6341, "num_input_tokens_seen": 8743560, "step": 15065 }, { "epoch": 2.244563598450998, "grad_norm": 0.5039013028144836, "learning_rate": 4.997725670540128e-05, "loss": 0.738, "num_input_tokens_seen": 8746824, "step": 15070 }, { "epoch": 2.245308310991957, "grad_norm": 1.0706274509429932, "learning_rate": 4.997711792131973e-05, "loss": 0.6962, "num_input_tokens_seen": 8749672, "step": 15075 }, { "epoch": 2.2460530235329164, "grad_norm": 1.0912907123565674, "learning_rate": 4.9976978715275155e-05, "loss": 0.6644, "num_input_tokens_seen": 8752392, "step": 15080 }, { "epoch": 2.2467977360738756, "grad_norm": 1.3788915872573853, "learning_rate": 4.997683908726991e-05, "loss": 0.739, "num_input_tokens_seen": 8755080, "step": 15085 }, { "epoch": 2.2475424486148348, "grad_norm": 0.9442691206932068, "learning_rate": 4.9976699037306356e-05, "loss": 0.7191, "num_input_tokens_seen": 8758152, "step": 15090 }, { "epoch": 2.248287161155794, "grad_norm": 0.678512454032898, "learning_rate": 4.997655856538686e-05, "loss": 0.6725, "num_input_tokens_seen": 8761160, "step": 15095 }, { "epoch": 2.249031873696753, "grad_norm": 0.9722458720207214, "learning_rate": 4.9976417671513787e-05, "loss": 0.5962, "num_input_tokens_seen": 8764104, "step": 15100 }, { "epoch": 2.2497765862377124, "grad_norm": 0.9771321415901184, "learning_rate": 4.997627635568953e-05, "loss": 0.6764, "num_input_tokens_seen": 8766952, "step": 15105 }, { "epoch": 2.2505212987786716, "grad_norm": 0.8314884305000305, "learning_rate": 4.997613461791646e-05, "loss": 0.7842, "num_input_tokens_seen": 8769800, "step": 15110 }, { "epoch": 2.2512660113196308, "grad_norm": 0.5970529913902283, "learning_rate": 4.9975992458196986e-05, "loss": 0.7215, "num_input_tokens_seen": 8773224, "step": 15115 }, { "epoch": 2.25201072386059, "grad_norm": 0.8546632528305054, "learning_rate": 4.99758498765335e-05, "loss": 0.7473, "num_input_tokens_seen": 8775848, "step": 15120 }, { "epoch": 2.252755436401549, "grad_norm": 0.9567497968673706, "learning_rate": 4.997570687292842e-05, "loss": 0.7317, "num_input_tokens_seen": 8778952, "step": 15125 }, { "epoch": 2.2535001489425084, "grad_norm": 0.7692732214927673, "learning_rate": 4.9975563447384156e-05, "loss": 0.6606, "num_input_tokens_seen": 8782280, "step": 15130 }, { "epoch": 2.2542448614834676, "grad_norm": 0.6804118156433105, "learning_rate": 4.997541959990313e-05, "loss": 0.7263, "num_input_tokens_seen": 8785416, "step": 15135 }, { "epoch": 2.2549895740244263, "grad_norm": 1.2620434761047363, "learning_rate": 4.997527533048777e-05, "loss": 0.7157, "num_input_tokens_seen": 8788328, "step": 15140 }, { "epoch": 2.255734286565386, "grad_norm": 0.8302449584007263, "learning_rate": 4.997513063914052e-05, "loss": 0.6671, "num_input_tokens_seen": 8791240, "step": 15145 }, { "epoch": 2.2564789991063448, "grad_norm": 0.8257783651351929, "learning_rate": 4.997498552586382e-05, "loss": 0.6683, "num_input_tokens_seen": 8794344, "step": 15150 }, { "epoch": 2.257223711647304, "grad_norm": 0.6382755637168884, "learning_rate": 4.9974839990660124e-05, "loss": 0.6325, "num_input_tokens_seen": 8797544, "step": 15155 }, { "epoch": 2.257968424188263, "grad_norm": 0.6760281920433044, "learning_rate": 4.997469403353189e-05, "loss": 0.7279, "num_input_tokens_seen": 8800680, "step": 15160 }, { "epoch": 2.2587131367292224, "grad_norm": 1.228190541267395, "learning_rate": 4.9974547654481585e-05, "loss": 0.7012, "num_input_tokens_seen": 8803592, "step": 15165 }, { "epoch": 2.2594578492701816, "grad_norm": 0.6326884031295776, "learning_rate": 4.997440085351168e-05, "loss": 0.6351, "num_input_tokens_seen": 8806472, "step": 15170 }, { "epoch": 2.2602025618111408, "grad_norm": 0.793841540813446, "learning_rate": 4.9974253630624654e-05, "loss": 0.7278, "num_input_tokens_seen": 8809224, "step": 15175 }, { "epoch": 2.2609472743521, "grad_norm": 1.170523762702942, "learning_rate": 4.9974105985822996e-05, "loss": 0.6256, "num_input_tokens_seen": 8811976, "step": 15180 }, { "epoch": 2.261691986893059, "grad_norm": 0.734605073928833, "learning_rate": 4.997395791910919e-05, "loss": 0.7246, "num_input_tokens_seen": 8814824, "step": 15185 }, { "epoch": 2.2624366994340184, "grad_norm": 0.6026498675346375, "learning_rate": 4.997380943048576e-05, "loss": 0.6135, "num_input_tokens_seen": 8817832, "step": 15190 }, { "epoch": 2.2631814119749776, "grad_norm": 0.6835633516311646, "learning_rate": 4.99736605199552e-05, "loss": 0.7201, "num_input_tokens_seen": 8820968, "step": 15195 }, { "epoch": 2.2639261245159368, "grad_norm": 1.5965768098831177, "learning_rate": 4.9973511187520025e-05, "loss": 0.8336, "num_input_tokens_seen": 8823656, "step": 15200 }, { "epoch": 2.264670837056896, "grad_norm": 1.2549240589141846, "learning_rate": 4.9973361433182764e-05, "loss": 0.7967, "num_input_tokens_seen": 8826408, "step": 15205 }, { "epoch": 2.265415549597855, "grad_norm": 1.5755130052566528, "learning_rate": 4.997321125694594e-05, "loss": 0.7543, "num_input_tokens_seen": 8829224, "step": 15210 }, { "epoch": 2.2661602621388144, "grad_norm": 0.9478142261505127, "learning_rate": 4.99730606588121e-05, "loss": 0.7577, "num_input_tokens_seen": 8831976, "step": 15215 }, { "epoch": 2.2669049746797736, "grad_norm": 0.7451637983322144, "learning_rate": 4.997290963878377e-05, "loss": 0.7373, "num_input_tokens_seen": 8835048, "step": 15220 }, { "epoch": 2.267649687220733, "grad_norm": 1.756896734237671, "learning_rate": 4.9972758196863524e-05, "loss": 0.8398, "num_input_tokens_seen": 8838024, "step": 15225 }, { "epoch": 2.268394399761692, "grad_norm": 0.8250395059585571, "learning_rate": 4.9972606333053903e-05, "loss": 0.6116, "num_input_tokens_seen": 8841032, "step": 15230 }, { "epoch": 2.269139112302651, "grad_norm": 0.8837265372276306, "learning_rate": 4.997245404735748e-05, "loss": 0.7532, "num_input_tokens_seen": 8844008, "step": 15235 }, { "epoch": 2.2698838248436104, "grad_norm": 1.360256314277649, "learning_rate": 4.997230133977683e-05, "loss": 0.7882, "num_input_tokens_seen": 8846856, "step": 15240 }, { "epoch": 2.2706285373845696, "grad_norm": 0.6556782722473145, "learning_rate": 4.997214821031453e-05, "loss": 0.7533, "num_input_tokens_seen": 8849512, "step": 15245 }, { "epoch": 2.271373249925529, "grad_norm": 0.6609854102134705, "learning_rate": 4.997199465897316e-05, "loss": 0.6964, "num_input_tokens_seen": 8852552, "step": 15250 }, { "epoch": 2.272117962466488, "grad_norm": 1.4960988759994507, "learning_rate": 4.9971840685755324e-05, "loss": 0.6712, "num_input_tokens_seen": 8855720, "step": 15255 }, { "epoch": 2.272862675007447, "grad_norm": 0.736858606338501, "learning_rate": 4.997168629066362e-05, "loss": 0.7142, "num_input_tokens_seen": 8858536, "step": 15260 }, { "epoch": 2.2736073875484064, "grad_norm": 0.9833472371101379, "learning_rate": 4.9971531473700654e-05, "loss": 0.7038, "num_input_tokens_seen": 8861352, "step": 15265 }, { "epoch": 2.2743521000893656, "grad_norm": 1.1057496070861816, "learning_rate": 4.997137623486905e-05, "loss": 0.7103, "num_input_tokens_seen": 8864392, "step": 15270 }, { "epoch": 2.275096812630325, "grad_norm": 1.0593645572662354, "learning_rate": 4.9971220574171415e-05, "loss": 0.6706, "num_input_tokens_seen": 8867368, "step": 15275 }, { "epoch": 2.275841525171284, "grad_norm": 0.7337110638618469, "learning_rate": 4.9971064491610396e-05, "loss": 0.558, "num_input_tokens_seen": 8870184, "step": 15280 }, { "epoch": 2.276586237712243, "grad_norm": 0.7827690839767456, "learning_rate": 4.997090798718862e-05, "loss": 0.6433, "num_input_tokens_seen": 8873160, "step": 15285 }, { "epoch": 2.2773309502532024, "grad_norm": 1.6046078205108643, "learning_rate": 4.9970751060908735e-05, "loss": 0.7956, "num_input_tokens_seen": 8875880, "step": 15290 }, { "epoch": 2.2780756627941616, "grad_norm": 1.0951099395751953, "learning_rate": 4.997059371277339e-05, "loss": 0.6863, "num_input_tokens_seen": 8878952, "step": 15295 }, { "epoch": 2.278820375335121, "grad_norm": 1.3588510751724243, "learning_rate": 4.997043594278523e-05, "loss": 0.7285, "num_input_tokens_seen": 8881672, "step": 15300 }, { "epoch": 2.2795650878760796, "grad_norm": 0.7022063136100769, "learning_rate": 4.997027775094695e-05, "loss": 0.774, "num_input_tokens_seen": 8884936, "step": 15305 }, { "epoch": 2.2803098004170392, "grad_norm": 0.5959123373031616, "learning_rate": 4.99701191372612e-05, "loss": 0.6018, "num_input_tokens_seen": 8887720, "step": 15310 }, { "epoch": 2.281054512957998, "grad_norm": 0.8123289942741394, "learning_rate": 4.9969960101730664e-05, "loss": 0.7422, "num_input_tokens_seen": 8891240, "step": 15315 }, { "epoch": 2.2817992254989576, "grad_norm": 0.8586064577102661, "learning_rate": 4.996980064435803e-05, "loss": 0.7541, "num_input_tokens_seen": 8894120, "step": 15320 }, { "epoch": 2.2825439380399164, "grad_norm": 0.7072954177856445, "learning_rate": 4.9969640765145996e-05, "loss": 0.7848, "num_input_tokens_seen": 8897032, "step": 15325 }, { "epoch": 2.2832886505808756, "grad_norm": 0.8127656579017639, "learning_rate": 4.9969480464097255e-05, "loss": 0.7778, "num_input_tokens_seen": 8899688, "step": 15330 }, { "epoch": 2.284033363121835, "grad_norm": 0.8880068063735962, "learning_rate": 4.9969319741214525e-05, "loss": 0.5873, "num_input_tokens_seen": 8902504, "step": 15335 }, { "epoch": 2.284778075662794, "grad_norm": 1.1959381103515625, "learning_rate": 4.996915859650051e-05, "loss": 0.7649, "num_input_tokens_seen": 8905224, "step": 15340 }, { "epoch": 2.285522788203753, "grad_norm": 0.952654242515564, "learning_rate": 4.996899702995794e-05, "loss": 0.784, "num_input_tokens_seen": 8908040, "step": 15345 }, { "epoch": 2.2862675007447124, "grad_norm": 0.5731956958770752, "learning_rate": 4.9968835041589546e-05, "loss": 0.7248, "num_input_tokens_seen": 8911208, "step": 15350 }, { "epoch": 2.2870122132856716, "grad_norm": 0.8128458857536316, "learning_rate": 4.996867263139806e-05, "loss": 0.753, "num_input_tokens_seen": 8914472, "step": 15355 }, { "epoch": 2.287756925826631, "grad_norm": 1.2540029287338257, "learning_rate": 4.996850979938622e-05, "loss": 0.719, "num_input_tokens_seen": 8917160, "step": 15360 }, { "epoch": 2.28850163836759, "grad_norm": 0.6557199358940125, "learning_rate": 4.996834654555679e-05, "loss": 0.6376, "num_input_tokens_seen": 8920008, "step": 15365 }, { "epoch": 2.289246350908549, "grad_norm": 0.7249545454978943, "learning_rate": 4.9968182869912525e-05, "loss": 0.7151, "num_input_tokens_seen": 8922888, "step": 15370 }, { "epoch": 2.2899910634495084, "grad_norm": 0.8974847197532654, "learning_rate": 4.9968018772456185e-05, "loss": 0.5834, "num_input_tokens_seen": 8925576, "step": 15375 }, { "epoch": 2.2907357759904676, "grad_norm": 0.7424496412277222, "learning_rate": 4.9967854253190536e-05, "loss": 0.6349, "num_input_tokens_seen": 8928328, "step": 15380 }, { "epoch": 2.291480488531427, "grad_norm": 0.8239037990570068, "learning_rate": 4.996768931211837e-05, "loss": 0.7311, "num_input_tokens_seen": 8931208, "step": 15385 }, { "epoch": 2.292225201072386, "grad_norm": 0.8403071165084839, "learning_rate": 4.996752394924247e-05, "loss": 0.639, "num_input_tokens_seen": 8933896, "step": 15390 }, { "epoch": 2.2929699136133452, "grad_norm": 0.9087637066841125, "learning_rate": 4.996735816456564e-05, "loss": 0.635, "num_input_tokens_seen": 8936712, "step": 15395 }, { "epoch": 2.2937146261543044, "grad_norm": 1.1245685815811157, "learning_rate": 4.9967191958090656e-05, "loss": 0.6429, "num_input_tokens_seen": 8939304, "step": 15400 }, { "epoch": 2.2944593386952636, "grad_norm": 1.0295830965042114, "learning_rate": 4.996702532982034e-05, "loss": 0.6949, "num_input_tokens_seen": 8942120, "step": 15405 }, { "epoch": 2.295204051236223, "grad_norm": 1.1219489574432373, "learning_rate": 4.99668582797575e-05, "loss": 0.6556, "num_input_tokens_seen": 8945192, "step": 15410 }, { "epoch": 2.295948763777182, "grad_norm": 0.837700605392456, "learning_rate": 4.996669080790498e-05, "loss": 0.7003, "num_input_tokens_seen": 8948392, "step": 15415 }, { "epoch": 2.2966934763181412, "grad_norm": 0.7288514971733093, "learning_rate": 4.996652291426559e-05, "loss": 0.7606, "num_input_tokens_seen": 8951016, "step": 15420 }, { "epoch": 2.2974381888591004, "grad_norm": 1.0974048376083374, "learning_rate": 4.996635459884216e-05, "loss": 0.6653, "num_input_tokens_seen": 8953800, "step": 15425 }, { "epoch": 2.2981829014000597, "grad_norm": 0.756811261177063, "learning_rate": 4.996618586163755e-05, "loss": 0.6158, "num_input_tokens_seen": 8956808, "step": 15430 }, { "epoch": 2.298927613941019, "grad_norm": 0.5819084048271179, "learning_rate": 4.996601670265461e-05, "loss": 0.7449, "num_input_tokens_seen": 8959592, "step": 15435 }, { "epoch": 2.299672326481978, "grad_norm": 1.346657156944275, "learning_rate": 4.996584712189618e-05, "loss": 0.6392, "num_input_tokens_seen": 8962504, "step": 15440 }, { "epoch": 2.3004170390229373, "grad_norm": 0.6048394441604614, "learning_rate": 4.996567711936515e-05, "loss": 0.7804, "num_input_tokens_seen": 8965416, "step": 15445 }, { "epoch": 2.3011617515638965, "grad_norm": 0.9076882600784302, "learning_rate": 4.996550669506438e-05, "loss": 0.6988, "num_input_tokens_seen": 8968264, "step": 15450 }, { "epoch": 2.3019064641048557, "grad_norm": 0.5936505198478699, "learning_rate": 4.996533584899674e-05, "loss": 0.6905, "num_input_tokens_seen": 8970888, "step": 15455 }, { "epoch": 2.302651176645815, "grad_norm": 0.6539987325668335, "learning_rate": 4.996516458116512e-05, "loss": 0.6586, "num_input_tokens_seen": 8973896, "step": 15460 }, { "epoch": 2.303395889186774, "grad_norm": 1.0020772218704224, "learning_rate": 4.9964992891572425e-05, "loss": 0.71, "num_input_tokens_seen": 8976712, "step": 15465 }, { "epoch": 2.3041406017277333, "grad_norm": 0.9187766909599304, "learning_rate": 4.996482078022155e-05, "loss": 0.6365, "num_input_tokens_seen": 8979784, "step": 15470 }, { "epoch": 2.3048853142686925, "grad_norm": 0.747460126876831, "learning_rate": 4.9964648247115395e-05, "loss": 0.8162, "num_input_tokens_seen": 8982760, "step": 15475 }, { "epoch": 2.3056300268096512, "grad_norm": 1.3874001502990723, "learning_rate": 4.9964475292256884e-05, "loss": 0.7093, "num_input_tokens_seen": 8985448, "step": 15480 }, { "epoch": 2.306374739350611, "grad_norm": 0.8095577359199524, "learning_rate": 4.996430191564894e-05, "loss": 0.7598, "num_input_tokens_seen": 8988904, "step": 15485 }, { "epoch": 2.3071194518915696, "grad_norm": 0.6542168855667114, "learning_rate": 4.996412811729448e-05, "loss": 0.6968, "num_input_tokens_seen": 8991368, "step": 15490 }, { "epoch": 2.3078641644325293, "grad_norm": 0.9712705612182617, "learning_rate": 4.996395389719646e-05, "loss": 0.6097, "num_input_tokens_seen": 8994152, "step": 15495 }, { "epoch": 2.308608876973488, "grad_norm": 0.7426332235336304, "learning_rate": 4.99637792553578e-05, "loss": 0.6203, "num_input_tokens_seen": 8997000, "step": 15500 }, { "epoch": 2.3093535895144472, "grad_norm": 0.7078371047973633, "learning_rate": 4.996360419178147e-05, "loss": 0.6656, "num_input_tokens_seen": 8999912, "step": 15505 }, { "epoch": 2.3100983020554064, "grad_norm": 0.7695983052253723, "learning_rate": 4.9963428706470405e-05, "loss": 0.4872, "num_input_tokens_seen": 9002760, "step": 15510 }, { "epoch": 2.3108430145963657, "grad_norm": 0.6691104173660278, "learning_rate": 4.9963252799427594e-05, "loss": 0.7038, "num_input_tokens_seen": 9005672, "step": 15515 }, { "epoch": 2.311587727137325, "grad_norm": 0.8050328493118286, "learning_rate": 4.9963076470655995e-05, "loss": 0.6077, "num_input_tokens_seen": 9008648, "step": 15520 }, { "epoch": 2.312332439678284, "grad_norm": 0.801732063293457, "learning_rate": 4.996289972015859e-05, "loss": 0.619, "num_input_tokens_seen": 9011464, "step": 15525 }, { "epoch": 2.3130771522192433, "grad_norm": 1.2994840145111084, "learning_rate": 4.9962722547938365e-05, "loss": 0.6611, "num_input_tokens_seen": 9014280, "step": 15530 }, { "epoch": 2.3138218647602025, "grad_norm": 0.76220703125, "learning_rate": 4.9962544953998316e-05, "loss": 0.5729, "num_input_tokens_seen": 9017032, "step": 15535 }, { "epoch": 2.3145665773011617, "grad_norm": 0.7000508308410645, "learning_rate": 4.996236693834144e-05, "loss": 0.6209, "num_input_tokens_seen": 9020072, "step": 15540 }, { "epoch": 2.315311289842121, "grad_norm": 0.7686893343925476, "learning_rate": 4.996218850097075e-05, "loss": 0.7337, "num_input_tokens_seen": 9022856, "step": 15545 }, { "epoch": 2.31605600238308, "grad_norm": 0.9208924770355225, "learning_rate": 4.996200964188925e-05, "loss": 0.5233, "num_input_tokens_seen": 9025736, "step": 15550 }, { "epoch": 2.3168007149240393, "grad_norm": 0.8850501179695129, "learning_rate": 4.996183036109997e-05, "loss": 0.6124, "num_input_tokens_seen": 9029000, "step": 15555 }, { "epoch": 2.3175454274649985, "grad_norm": 0.6422055959701538, "learning_rate": 4.996165065860594e-05, "loss": 0.7675, "num_input_tokens_seen": 9031816, "step": 15560 }, { "epoch": 2.3182901400059577, "grad_norm": 0.8277701139450073, "learning_rate": 4.996147053441018e-05, "loss": 0.5644, "num_input_tokens_seen": 9034792, "step": 15565 }, { "epoch": 2.319034852546917, "grad_norm": 0.7808788418769836, "learning_rate": 4.996128998851575e-05, "loss": 0.6585, "num_input_tokens_seen": 9037608, "step": 15570 }, { "epoch": 2.319779565087876, "grad_norm": 1.002318263053894, "learning_rate": 4.99611090209257e-05, "loss": 0.5506, "num_input_tokens_seen": 9040392, "step": 15575 }, { "epoch": 2.3205242776288353, "grad_norm": 0.6704215407371521, "learning_rate": 4.9960927631643086e-05, "loss": 0.7763, "num_input_tokens_seen": 9043272, "step": 15580 }, { "epoch": 2.3212689901697945, "grad_norm": 0.7402247190475464, "learning_rate": 4.996074582067096e-05, "loss": 0.6991, "num_input_tokens_seen": 9046376, "step": 15585 }, { "epoch": 2.3220137027107537, "grad_norm": 1.5828033685684204, "learning_rate": 4.9960563588012396e-05, "loss": 0.6799, "num_input_tokens_seen": 9049128, "step": 15590 }, { "epoch": 2.322758415251713, "grad_norm": 0.8857554793357849, "learning_rate": 4.9960380933670495e-05, "loss": 0.7324, "num_input_tokens_seen": 9051944, "step": 15595 }, { "epoch": 2.323503127792672, "grad_norm": 1.069651484489441, "learning_rate": 4.996019785764832e-05, "loss": 0.5511, "num_input_tokens_seen": 9054600, "step": 15600 }, { "epoch": 2.3242478403336313, "grad_norm": 0.5983690619468689, "learning_rate": 4.996001435994897e-05, "loss": 0.6646, "num_input_tokens_seen": 9057352, "step": 15605 }, { "epoch": 2.3249925528745905, "grad_norm": 0.7014415264129639, "learning_rate": 4.995983044057554e-05, "loss": 0.6322, "num_input_tokens_seen": 9060040, "step": 15610 }, { "epoch": 2.3257372654155497, "grad_norm": 1.051482081413269, "learning_rate": 4.9959646099531156e-05, "loss": 0.6699, "num_input_tokens_seen": 9062984, "step": 15615 }, { "epoch": 2.326481977956509, "grad_norm": 1.3490008115768433, "learning_rate": 4.99594613368189e-05, "loss": 0.6843, "num_input_tokens_seen": 9065928, "step": 15620 }, { "epoch": 2.327226690497468, "grad_norm": 1.4418708086013794, "learning_rate": 4.995927615244193e-05, "loss": 0.7638, "num_input_tokens_seen": 9068872, "step": 15625 }, { "epoch": 2.3279714030384273, "grad_norm": 1.0896326303482056, "learning_rate": 4.9959090546403356e-05, "loss": 0.7203, "num_input_tokens_seen": 9071432, "step": 15630 }, { "epoch": 2.3287161155793865, "grad_norm": 0.8775752782821655, "learning_rate": 4.9958904518706305e-05, "loss": 0.6171, "num_input_tokens_seen": 9074312, "step": 15635 }, { "epoch": 2.3294608281203457, "grad_norm": 1.2177172899246216, "learning_rate": 4.9958718069353935e-05, "loss": 0.6536, "num_input_tokens_seen": 9077128, "step": 15640 }, { "epoch": 2.330205540661305, "grad_norm": 0.89166659116745, "learning_rate": 4.9958531198349384e-05, "loss": 0.6835, "num_input_tokens_seen": 9080008, "step": 15645 }, { "epoch": 2.330950253202264, "grad_norm": 1.2821182012557983, "learning_rate": 4.9958343905695823e-05, "loss": 0.6996, "num_input_tokens_seen": 9083016, "step": 15650 }, { "epoch": 2.331694965743223, "grad_norm": 1.1103148460388184, "learning_rate": 4.99581561913964e-05, "loss": 0.547, "num_input_tokens_seen": 9085672, "step": 15655 }, { "epoch": 2.3324396782841825, "grad_norm": 0.8832236528396606, "learning_rate": 4.99579680554543e-05, "loss": 0.7387, "num_input_tokens_seen": 9088392, "step": 15660 }, { "epoch": 2.3331843908251413, "grad_norm": 0.8277170062065125, "learning_rate": 4.99577794978727e-05, "loss": 0.5935, "num_input_tokens_seen": 9091368, "step": 15665 }, { "epoch": 2.333929103366101, "grad_norm": 0.7028926610946655, "learning_rate": 4.995759051865477e-05, "loss": 0.6973, "num_input_tokens_seen": 9094216, "step": 15670 }, { "epoch": 2.3346738159070597, "grad_norm": 0.8623197078704834, "learning_rate": 4.995740111780372e-05, "loss": 0.6565, "num_input_tokens_seen": 9096968, "step": 15675 }, { "epoch": 2.335418528448019, "grad_norm": 1.0188350677490234, "learning_rate": 4.995721129532275e-05, "loss": 0.8493, "num_input_tokens_seen": 9099976, "step": 15680 }, { "epoch": 2.336163240988978, "grad_norm": 1.0680471658706665, "learning_rate": 4.9957021051215055e-05, "loss": 0.6259, "num_input_tokens_seen": 9102856, "step": 15685 }, { "epoch": 2.3369079535299373, "grad_norm": 1.078245759010315, "learning_rate": 4.995683038548385e-05, "loss": 0.6989, "num_input_tokens_seen": 9106088, "step": 15690 }, { "epoch": 2.3376526660708965, "grad_norm": 0.9079275131225586, "learning_rate": 4.995663929813237e-05, "loss": 0.574, "num_input_tokens_seen": 9109096, "step": 15695 }, { "epoch": 2.3383973786118557, "grad_norm": 1.3194650411605835, "learning_rate": 4.995644778916383e-05, "loss": 0.7602, "num_input_tokens_seen": 9111848, "step": 15700 }, { "epoch": 2.339142091152815, "grad_norm": 0.688653290271759, "learning_rate": 4.995625585858146e-05, "loss": 0.6571, "num_input_tokens_seen": 9114504, "step": 15705 }, { "epoch": 2.339886803693774, "grad_norm": 1.2580605745315552, "learning_rate": 4.9956063506388524e-05, "loss": 0.8483, "num_input_tokens_seen": 9117256, "step": 15710 }, { "epoch": 2.3406315162347333, "grad_norm": 1.169318675994873, "learning_rate": 4.995587073258825e-05, "loss": 0.6189, "num_input_tokens_seen": 9120552, "step": 15715 }, { "epoch": 2.3413762287756925, "grad_norm": 0.9303237795829773, "learning_rate": 4.995567753718391e-05, "loss": 0.6964, "num_input_tokens_seen": 9123144, "step": 15720 }, { "epoch": 2.3421209413166517, "grad_norm": 0.6118993759155273, "learning_rate": 4.995548392017876e-05, "loss": 0.514, "num_input_tokens_seen": 9126120, "step": 15725 }, { "epoch": 2.342865653857611, "grad_norm": 1.6824731826782227, "learning_rate": 4.995528988157608e-05, "loss": 0.693, "num_input_tokens_seen": 9129064, "step": 15730 }, { "epoch": 2.34361036639857, "grad_norm": 0.8536350131034851, "learning_rate": 4.995509542137913e-05, "loss": 0.536, "num_input_tokens_seen": 9131880, "step": 15735 }, { "epoch": 2.3443550789395293, "grad_norm": 0.7929524183273315, "learning_rate": 4.995490053959121e-05, "loss": 0.6309, "num_input_tokens_seen": 9134600, "step": 15740 }, { "epoch": 2.3450997914804885, "grad_norm": 0.8827084302902222, "learning_rate": 4.995470523621561e-05, "loss": 0.7729, "num_input_tokens_seen": 9137448, "step": 15745 }, { "epoch": 2.3458445040214477, "grad_norm": 0.7125347852706909, "learning_rate": 4.9954509511255625e-05, "loss": 0.7753, "num_input_tokens_seen": 9140072, "step": 15750 }, { "epoch": 2.346589216562407, "grad_norm": 1.089101791381836, "learning_rate": 4.9954313364714565e-05, "loss": 0.6792, "num_input_tokens_seen": 9142728, "step": 15755 }, { "epoch": 2.347333929103366, "grad_norm": 0.6675214171409607, "learning_rate": 4.9954116796595754e-05, "loss": 0.7304, "num_input_tokens_seen": 9145448, "step": 15760 }, { "epoch": 2.3480786416443253, "grad_norm": 1.556225299835205, "learning_rate": 4.9953919806902486e-05, "loss": 0.616, "num_input_tokens_seen": 9148232, "step": 15765 }, { "epoch": 2.3488233541852845, "grad_norm": 0.9089418649673462, "learning_rate": 4.9953722395638115e-05, "loss": 0.6141, "num_input_tokens_seen": 9151400, "step": 15770 }, { "epoch": 2.3495680667262437, "grad_norm": 1.3497910499572754, "learning_rate": 4.995352456280596e-05, "loss": 0.6688, "num_input_tokens_seen": 9154440, "step": 15775 }, { "epoch": 2.350312779267203, "grad_norm": 0.9066838026046753, "learning_rate": 4.9953326308409364e-05, "loss": 0.5871, "num_input_tokens_seen": 9157352, "step": 15780 }, { "epoch": 2.351057491808162, "grad_norm": 1.1335474252700806, "learning_rate": 4.9953127632451694e-05, "loss": 0.6622, "num_input_tokens_seen": 9160200, "step": 15785 }, { "epoch": 2.3518022043491214, "grad_norm": 0.850776731967926, "learning_rate": 4.995292853493629e-05, "loss": 0.742, "num_input_tokens_seen": 9163144, "step": 15790 }, { "epoch": 2.3525469168900806, "grad_norm": 2.0621042251586914, "learning_rate": 4.995272901586652e-05, "loss": 0.7328, "num_input_tokens_seen": 9166344, "step": 15795 }, { "epoch": 2.3532916294310398, "grad_norm": 0.7498461008071899, "learning_rate": 4.9952529075245744e-05, "loss": 0.6452, "num_input_tokens_seen": 9169032, "step": 15800 }, { "epoch": 2.354036341971999, "grad_norm": 1.2421597242355347, "learning_rate": 4.995232871307736e-05, "loss": 0.6977, "num_input_tokens_seen": 9172104, "step": 15805 }, { "epoch": 2.354781054512958, "grad_norm": 0.8200435638427734, "learning_rate": 4.9952127929364746e-05, "loss": 0.7252, "num_input_tokens_seen": 9174920, "step": 15810 }, { "epoch": 2.3555257670539174, "grad_norm": 0.8054433465003967, "learning_rate": 4.995192672411128e-05, "loss": 0.7826, "num_input_tokens_seen": 9177864, "step": 15815 }, { "epoch": 2.3562704795948766, "grad_norm": 0.6488363146781921, "learning_rate": 4.995172509732038e-05, "loss": 0.6997, "num_input_tokens_seen": 9180552, "step": 15820 }, { "epoch": 2.3570151921358358, "grad_norm": 1.106152892112732, "learning_rate": 4.995152304899544e-05, "loss": 0.6331, "num_input_tokens_seen": 9183304, "step": 15825 }, { "epoch": 2.3577599046767945, "grad_norm": 0.8581089973449707, "learning_rate": 4.9951320579139884e-05, "loss": 0.6537, "num_input_tokens_seen": 9186088, "step": 15830 }, { "epoch": 2.358504617217754, "grad_norm": 0.896824061870575, "learning_rate": 4.995111768775712e-05, "loss": 0.7169, "num_input_tokens_seen": 9188968, "step": 15835 }, { "epoch": 2.359249329758713, "grad_norm": 0.8577812910079956, "learning_rate": 4.995091437485058e-05, "loss": 0.7434, "num_input_tokens_seen": 9191592, "step": 15840 }, { "epoch": 2.359994042299672, "grad_norm": 1.0645153522491455, "learning_rate": 4.9950710640423705e-05, "loss": 0.7814, "num_input_tokens_seen": 9194504, "step": 15845 }, { "epoch": 2.3607387548406313, "grad_norm": 0.8556469678878784, "learning_rate": 4.9950506484479934e-05, "loss": 0.6243, "num_input_tokens_seen": 9197224, "step": 15850 }, { "epoch": 2.3614834673815905, "grad_norm": 0.5774482488632202, "learning_rate": 4.995030190702271e-05, "loss": 0.7037, "num_input_tokens_seen": 9200264, "step": 15855 }, { "epoch": 2.3622281799225497, "grad_norm": 1.1251870393753052, "learning_rate": 4.99500969080555e-05, "loss": 0.6845, "num_input_tokens_seen": 9203112, "step": 15860 }, { "epoch": 2.362972892463509, "grad_norm": 1.1590559482574463, "learning_rate": 4.994989148758176e-05, "loss": 0.7253, "num_input_tokens_seen": 9205928, "step": 15865 }, { "epoch": 2.363717605004468, "grad_norm": 0.5779231190681458, "learning_rate": 4.994968564560495e-05, "loss": 0.7179, "num_input_tokens_seen": 9208584, "step": 15870 }, { "epoch": 2.3644623175454274, "grad_norm": 0.6599013805389404, "learning_rate": 4.994947938212857e-05, "loss": 0.5297, "num_input_tokens_seen": 9211240, "step": 15875 }, { "epoch": 2.3652070300863866, "grad_norm": 1.128706693649292, "learning_rate": 4.994927269715609e-05, "loss": 0.8811, "num_input_tokens_seen": 9214568, "step": 15880 }, { "epoch": 2.3659517426273458, "grad_norm": 0.5807055830955505, "learning_rate": 4.9949065590691e-05, "loss": 0.542, "num_input_tokens_seen": 9217384, "step": 15885 }, { "epoch": 2.366696455168305, "grad_norm": 0.5621842741966248, "learning_rate": 4.9948858062736814e-05, "loss": 0.7432, "num_input_tokens_seen": 9220136, "step": 15890 }, { "epoch": 2.367441167709264, "grad_norm": 0.7836078405380249, "learning_rate": 4.994865011329702e-05, "loss": 0.533, "num_input_tokens_seen": 9222984, "step": 15895 }, { "epoch": 2.3681858802502234, "grad_norm": 0.93854820728302, "learning_rate": 4.994844174237514e-05, "loss": 0.6062, "num_input_tokens_seen": 9225896, "step": 15900 }, { "epoch": 2.3689305927911826, "grad_norm": 0.9369468092918396, "learning_rate": 4.99482329499747e-05, "loss": 0.7457, "num_input_tokens_seen": 9228776, "step": 15905 }, { "epoch": 2.3696753053321418, "grad_norm": 0.9601718783378601, "learning_rate": 4.994802373609922e-05, "loss": 0.7911, "num_input_tokens_seen": 9231720, "step": 15910 }, { "epoch": 2.370420017873101, "grad_norm": 1.0044877529144287, "learning_rate": 4.9947814100752226e-05, "loss": 0.6892, "num_input_tokens_seen": 9234600, "step": 15915 }, { "epoch": 2.37116473041406, "grad_norm": 1.5148725509643555, "learning_rate": 4.994760404393727e-05, "loss": 0.7501, "num_input_tokens_seen": 9237288, "step": 15920 }, { "epoch": 2.3719094429550194, "grad_norm": 0.9517968893051147, "learning_rate": 4.994739356565791e-05, "loss": 0.6325, "num_input_tokens_seen": 9240104, "step": 15925 }, { "epoch": 2.3726541554959786, "grad_norm": 1.1516207456588745, "learning_rate": 4.994718266591768e-05, "loss": 0.6296, "num_input_tokens_seen": 9242888, "step": 15930 }, { "epoch": 2.373398868036938, "grad_norm": 0.947480320930481, "learning_rate": 4.994697134472016e-05, "loss": 0.6279, "num_input_tokens_seen": 9246024, "step": 15935 }, { "epoch": 2.374143580577897, "grad_norm": 1.3632111549377441, "learning_rate": 4.994675960206891e-05, "loss": 0.7232, "num_input_tokens_seen": 9248936, "step": 15940 }, { "epoch": 2.374888293118856, "grad_norm": 0.732226550579071, "learning_rate": 4.9946547437967515e-05, "loss": 0.6854, "num_input_tokens_seen": 9251912, "step": 15945 }, { "epoch": 2.3756330056598154, "grad_norm": 0.8231885433197021, "learning_rate": 4.9946334852419555e-05, "loss": 0.6653, "num_input_tokens_seen": 9254600, "step": 15950 }, { "epoch": 2.3763777182007746, "grad_norm": 0.7679610252380371, "learning_rate": 4.9946121845428616e-05, "loss": 0.6677, "num_input_tokens_seen": 9257864, "step": 15955 }, { "epoch": 2.377122430741734, "grad_norm": 0.643146276473999, "learning_rate": 4.994590841699831e-05, "loss": 0.7893, "num_input_tokens_seen": 9260680, "step": 15960 }, { "epoch": 2.377867143282693, "grad_norm": 0.6798906922340393, "learning_rate": 4.9945694567132227e-05, "loss": 0.5955, "num_input_tokens_seen": 9263432, "step": 15965 }, { "epoch": 2.378611855823652, "grad_norm": 0.7810048460960388, "learning_rate": 4.9945480295834e-05, "loss": 0.6389, "num_input_tokens_seen": 9266056, "step": 15970 }, { "epoch": 2.3793565683646114, "grad_norm": 0.8188327550888062, "learning_rate": 4.994526560310723e-05, "loss": 0.7401, "num_input_tokens_seen": 9269224, "step": 15975 }, { "epoch": 2.3801012809055706, "grad_norm": 0.7393456697463989, "learning_rate": 4.994505048895555e-05, "loss": 0.6668, "num_input_tokens_seen": 9272136, "step": 15980 }, { "epoch": 2.38084599344653, "grad_norm": 1.23038649559021, "learning_rate": 4.99448349533826e-05, "loss": 0.6583, "num_input_tokens_seen": 9275048, "step": 15985 }, { "epoch": 2.381590705987489, "grad_norm": 1.0550782680511475, "learning_rate": 4.9944618996392014e-05, "loss": 0.7224, "num_input_tokens_seen": 9277768, "step": 15990 }, { "epoch": 2.382335418528448, "grad_norm": 0.8990289568901062, "learning_rate": 4.994440261798743e-05, "loss": 0.6449, "num_input_tokens_seen": 9280584, "step": 15995 }, { "epoch": 2.3830801310694074, "grad_norm": 1.4922373294830322, "learning_rate": 4.994418581817254e-05, "loss": 0.7004, "num_input_tokens_seen": 9283624, "step": 16000 }, { "epoch": 2.383824843610366, "grad_norm": 1.0076247453689575, "learning_rate": 4.994396859695096e-05, "loss": 0.6412, "num_input_tokens_seen": 9286472, "step": 16005 }, { "epoch": 2.384569556151326, "grad_norm": 0.7993988990783691, "learning_rate": 4.99437509543264e-05, "loss": 0.492, "num_input_tokens_seen": 9289416, "step": 16010 }, { "epoch": 2.3853142686922846, "grad_norm": 1.2483943700790405, "learning_rate": 4.994353289030251e-05, "loss": 0.7115, "num_input_tokens_seen": 9292488, "step": 16015 }, { "epoch": 2.386058981233244, "grad_norm": 0.880840539932251, "learning_rate": 4.994331440488298e-05, "loss": 0.6692, "num_input_tokens_seen": 9295528, "step": 16020 }, { "epoch": 2.386803693774203, "grad_norm": 1.0256422758102417, "learning_rate": 4.994309549807151e-05, "loss": 0.7523, "num_input_tokens_seen": 9298664, "step": 16025 }, { "epoch": 2.387548406315162, "grad_norm": 0.541448712348938, "learning_rate": 4.9942876169871794e-05, "loss": 0.79, "num_input_tokens_seen": 9301352, "step": 16030 }, { "epoch": 2.3882931188561214, "grad_norm": 2.73785138130188, "learning_rate": 4.9942656420287535e-05, "loss": 0.6669, "num_input_tokens_seen": 9304232, "step": 16035 }, { "epoch": 2.3890378313970806, "grad_norm": 0.8963459134101868, "learning_rate": 4.9942436249322444e-05, "loss": 0.5402, "num_input_tokens_seen": 9307176, "step": 16040 }, { "epoch": 2.38978254393804, "grad_norm": 1.1517508029937744, "learning_rate": 4.994221565698025e-05, "loss": 0.6973, "num_input_tokens_seen": 9309960, "step": 16045 }, { "epoch": 2.390527256478999, "grad_norm": 0.9807510375976562, "learning_rate": 4.9941994643264665e-05, "loss": 0.7594, "num_input_tokens_seen": 9312968, "step": 16050 }, { "epoch": 2.391271969019958, "grad_norm": 0.9914181232452393, "learning_rate": 4.994177320817943e-05, "loss": 0.7741, "num_input_tokens_seen": 9315912, "step": 16055 }, { "epoch": 2.3920166815609174, "grad_norm": 0.901323676109314, "learning_rate": 4.9941551351728286e-05, "loss": 0.6116, "num_input_tokens_seen": 9318984, "step": 16060 }, { "epoch": 2.3927613941018766, "grad_norm": 0.6796950697898865, "learning_rate": 4.994132907391499e-05, "loss": 0.7081, "num_input_tokens_seen": 9321832, "step": 16065 }, { "epoch": 2.393506106642836, "grad_norm": 0.7950719594955444, "learning_rate": 4.994110637474327e-05, "loss": 0.673, "num_input_tokens_seen": 9324968, "step": 16070 }, { "epoch": 2.394250819183795, "grad_norm": 1.9700289964675903, "learning_rate": 4.994088325421693e-05, "loss": 0.6769, "num_input_tokens_seen": 9327816, "step": 16075 }, { "epoch": 2.394995531724754, "grad_norm": 2.04266095161438, "learning_rate": 4.99406597123397e-05, "loss": 0.6546, "num_input_tokens_seen": 9330472, "step": 16080 }, { "epoch": 2.3957402442657134, "grad_norm": 1.2257814407348633, "learning_rate": 4.994043574911538e-05, "loss": 0.6611, "num_input_tokens_seen": 9333544, "step": 16085 }, { "epoch": 2.3964849568066726, "grad_norm": 1.1195937395095825, "learning_rate": 4.9940211364547744e-05, "loss": 0.7017, "num_input_tokens_seen": 9336424, "step": 16090 }, { "epoch": 2.397229669347632, "grad_norm": 0.656643271446228, "learning_rate": 4.9939986558640585e-05, "loss": 0.7017, "num_input_tokens_seen": 9339464, "step": 16095 }, { "epoch": 2.397974381888591, "grad_norm": 0.9278087615966797, "learning_rate": 4.99397613313977e-05, "loss": 0.7227, "num_input_tokens_seen": 9342344, "step": 16100 }, { "epoch": 2.3987190944295502, "grad_norm": 1.6759072542190552, "learning_rate": 4.99395356828229e-05, "loss": 0.6016, "num_input_tokens_seen": 9345160, "step": 16105 }, { "epoch": 2.3994638069705094, "grad_norm": 1.0760599374771118, "learning_rate": 4.993930961291999e-05, "loss": 0.4808, "num_input_tokens_seen": 9348200, "step": 16110 }, { "epoch": 2.4002085195114686, "grad_norm": 0.6852087378501892, "learning_rate": 4.993908312169279e-05, "loss": 0.602, "num_input_tokens_seen": 9350824, "step": 16115 }, { "epoch": 2.400953232052428, "grad_norm": 0.9271185994148254, "learning_rate": 4.9938856209145123e-05, "loss": 0.6369, "num_input_tokens_seen": 9353832, "step": 16120 }, { "epoch": 2.401697944593387, "grad_norm": 0.6161490678787231, "learning_rate": 4.993862887528083e-05, "loss": 0.5608, "num_input_tokens_seen": 9356456, "step": 16125 }, { "epoch": 2.4024426571343462, "grad_norm": 0.8395327925682068, "learning_rate": 4.9938401120103755e-05, "loss": 0.5943, "num_input_tokens_seen": 9359464, "step": 16130 }, { "epoch": 2.4031873696753054, "grad_norm": 0.8497305512428284, "learning_rate": 4.9938172943617735e-05, "loss": 0.7843, "num_input_tokens_seen": 9362600, "step": 16135 }, { "epoch": 2.4039320822162646, "grad_norm": 1.1055223941802979, "learning_rate": 4.993794434582663e-05, "loss": 0.6458, "num_input_tokens_seen": 9365352, "step": 16140 }, { "epoch": 2.404676794757224, "grad_norm": 0.7820340991020203, "learning_rate": 4.99377153267343e-05, "loss": 0.7569, "num_input_tokens_seen": 9368104, "step": 16145 }, { "epoch": 2.405421507298183, "grad_norm": 1.3259350061416626, "learning_rate": 4.9937485886344614e-05, "loss": 0.6487, "num_input_tokens_seen": 9370952, "step": 16150 }, { "epoch": 2.4061662198391423, "grad_norm": 0.7054709792137146, "learning_rate": 4.993725602466145e-05, "loss": 0.7024, "num_input_tokens_seen": 9373832, "step": 16155 }, { "epoch": 2.4069109323801015, "grad_norm": 2.9032931327819824, "learning_rate": 4.9937025741688694e-05, "loss": 0.6706, "num_input_tokens_seen": 9376808, "step": 16160 }, { "epoch": 2.4076556449210607, "grad_norm": 0.8510529398918152, "learning_rate": 4.993679503743023e-05, "loss": 0.5796, "num_input_tokens_seen": 9379400, "step": 16165 }, { "epoch": 2.4084003574620194, "grad_norm": 0.7614185810089111, "learning_rate": 4.993656391188995e-05, "loss": 0.6425, "num_input_tokens_seen": 9382024, "step": 16170 }, { "epoch": 2.409145070002979, "grad_norm": 0.6865399479866028, "learning_rate": 4.993633236507178e-05, "loss": 0.6139, "num_input_tokens_seen": 9385032, "step": 16175 }, { "epoch": 2.409889782543938, "grad_norm": 1.1944411993026733, "learning_rate": 4.9936100396979614e-05, "loss": 0.6047, "num_input_tokens_seen": 9388040, "step": 16180 }, { "epoch": 2.4106344950848975, "grad_norm": 1.012413501739502, "learning_rate": 4.993586800761738e-05, "loss": 0.6993, "num_input_tokens_seen": 9390856, "step": 16185 }, { "epoch": 2.4113792076258562, "grad_norm": 1.2589908838272095, "learning_rate": 4.9935635196989e-05, "loss": 0.8243, "num_input_tokens_seen": 9394088, "step": 16190 }, { "epoch": 2.4121239201668154, "grad_norm": 1.329175591468811, "learning_rate": 4.9935401965098395e-05, "loss": 0.6079, "num_input_tokens_seen": 9397416, "step": 16195 }, { "epoch": 2.4128686327077746, "grad_norm": 0.547534167766571, "learning_rate": 4.9935168311949524e-05, "loss": 0.6536, "num_input_tokens_seen": 9400360, "step": 16200 }, { "epoch": 2.413613345248734, "grad_norm": 0.7305707931518555, "learning_rate": 4.9934934237546326e-05, "loss": 0.6022, "num_input_tokens_seen": 9403176, "step": 16205 }, { "epoch": 2.414358057789693, "grad_norm": 0.8899267315864563, "learning_rate": 4.993469974189275e-05, "loss": 0.6833, "num_input_tokens_seen": 9405896, "step": 16210 }, { "epoch": 2.4151027703306522, "grad_norm": 1.5797680616378784, "learning_rate": 4.993446482499278e-05, "loss": 0.761, "num_input_tokens_seen": 9408872, "step": 16215 }, { "epoch": 2.4158474828716114, "grad_norm": 1.0987982749938965, "learning_rate": 4.993422948685036e-05, "loss": 0.9208, "num_input_tokens_seen": 9411976, "step": 16220 }, { "epoch": 2.4165921954125706, "grad_norm": 1.2778337001800537, "learning_rate": 4.993399372746948e-05, "loss": 0.6835, "num_input_tokens_seen": 9415112, "step": 16225 }, { "epoch": 2.41733690795353, "grad_norm": 0.8101517558097839, "learning_rate": 4.9933757546854115e-05, "loss": 0.6234, "num_input_tokens_seen": 9417992, "step": 16230 }, { "epoch": 2.418081620494489, "grad_norm": 1.0119470357894897, "learning_rate": 4.993352094500825e-05, "loss": 0.5456, "num_input_tokens_seen": 9420808, "step": 16235 }, { "epoch": 2.4188263330354483, "grad_norm": 0.587068498134613, "learning_rate": 4.99332839219359e-05, "loss": 0.618, "num_input_tokens_seen": 9423656, "step": 16240 }, { "epoch": 2.4195710455764075, "grad_norm": 3.514496326446533, "learning_rate": 4.993304647764106e-05, "loss": 0.7739, "num_input_tokens_seen": 9426600, "step": 16245 }, { "epoch": 2.4203157581173667, "grad_norm": 0.6520041227340698, "learning_rate": 4.993280861212773e-05, "loss": 0.7118, "num_input_tokens_seen": 9429608, "step": 16250 }, { "epoch": 2.421060470658326, "grad_norm": 4.3328938484191895, "learning_rate": 4.993257032539995e-05, "loss": 0.8806, "num_input_tokens_seen": 9432648, "step": 16255 }, { "epoch": 2.421805183199285, "grad_norm": 1.3155326843261719, "learning_rate": 4.993233161746174e-05, "loss": 0.7732, "num_input_tokens_seen": 9435624, "step": 16260 }, { "epoch": 2.4225498957402443, "grad_norm": 0.5881465077400208, "learning_rate": 4.993209248831711e-05, "loss": 0.5958, "num_input_tokens_seen": 9438632, "step": 16265 }, { "epoch": 2.4232946082812035, "grad_norm": 0.873414933681488, "learning_rate": 4.9931852937970124e-05, "loss": 0.7728, "num_input_tokens_seen": 9441416, "step": 16270 }, { "epoch": 2.4240393208221627, "grad_norm": 1.1249113082885742, "learning_rate": 4.9931612966424824e-05, "loss": 0.8072, "num_input_tokens_seen": 9443944, "step": 16275 }, { "epoch": 2.424784033363122, "grad_norm": 0.6054101586341858, "learning_rate": 4.993137257368526e-05, "loss": 0.5883, "num_input_tokens_seen": 9446952, "step": 16280 }, { "epoch": 2.425528745904081, "grad_norm": 0.9530358910560608, "learning_rate": 4.99311317597555e-05, "loss": 0.6278, "num_input_tokens_seen": 9449736, "step": 16285 }, { "epoch": 2.4262734584450403, "grad_norm": 1.075275182723999, "learning_rate": 4.993089052463961e-05, "loss": 0.7042, "num_input_tokens_seen": 9452424, "step": 16290 }, { "epoch": 2.4270181709859995, "grad_norm": 0.7488799095153809, "learning_rate": 4.993064886834166e-05, "loss": 0.6647, "num_input_tokens_seen": 9455208, "step": 16295 }, { "epoch": 2.4277628835269587, "grad_norm": 1.039530634880066, "learning_rate": 4.993040679086573e-05, "loss": 0.77, "num_input_tokens_seen": 9458312, "step": 16300 }, { "epoch": 2.428507596067918, "grad_norm": 1.3704715967178345, "learning_rate": 4.993016429221592e-05, "loss": 0.7415, "num_input_tokens_seen": 9461160, "step": 16305 }, { "epoch": 2.429252308608877, "grad_norm": 1.0524221658706665, "learning_rate": 4.992992137239632e-05, "loss": 0.6363, "num_input_tokens_seen": 9464040, "step": 16310 }, { "epoch": 2.4299970211498363, "grad_norm": 0.8522828817367554, "learning_rate": 4.992967803141104e-05, "loss": 0.8312, "num_input_tokens_seen": 9466984, "step": 16315 }, { "epoch": 2.4307417336907955, "grad_norm": 1.091389536857605, "learning_rate": 4.992943426926419e-05, "loss": 0.6967, "num_input_tokens_seen": 9470056, "step": 16320 }, { "epoch": 2.4314864462317547, "grad_norm": 0.8397249579429626, "learning_rate": 4.9929190085959874e-05, "loss": 0.719, "num_input_tokens_seen": 9472936, "step": 16325 }, { "epoch": 2.432231158772714, "grad_norm": 0.9348500370979309, "learning_rate": 4.9928945481502234e-05, "loss": 0.9047, "num_input_tokens_seen": 9475592, "step": 16330 }, { "epoch": 2.432975871313673, "grad_norm": 1.0051674842834473, "learning_rate": 4.9928700455895394e-05, "loss": 0.7416, "num_input_tokens_seen": 9478312, "step": 16335 }, { "epoch": 2.4337205838546323, "grad_norm": 1.0044218301773071, "learning_rate": 4.99284550091435e-05, "loss": 0.6804, "num_input_tokens_seen": 9481384, "step": 16340 }, { "epoch": 2.434465296395591, "grad_norm": 1.3628525733947754, "learning_rate": 4.992820914125069e-05, "loss": 0.8432, "num_input_tokens_seen": 9484200, "step": 16345 }, { "epoch": 2.4352100089365507, "grad_norm": 0.7546951174736023, "learning_rate": 4.9927962852221136e-05, "loss": 0.6618, "num_input_tokens_seen": 9487208, "step": 16350 }, { "epoch": 2.4359547214775095, "grad_norm": 0.9701008200645447, "learning_rate": 4.9927716142058976e-05, "loss": 0.8443, "num_input_tokens_seen": 9490152, "step": 16355 }, { "epoch": 2.436699434018469, "grad_norm": 0.6778063178062439, "learning_rate": 4.992746901076838e-05, "loss": 0.5626, "num_input_tokens_seen": 9493064, "step": 16360 }, { "epoch": 2.437444146559428, "grad_norm": 1.2386647462844849, "learning_rate": 4.992722145835354e-05, "loss": 0.6915, "num_input_tokens_seen": 9496008, "step": 16365 }, { "epoch": 2.438188859100387, "grad_norm": 0.943727433681488, "learning_rate": 4.992697348481863e-05, "loss": 0.6089, "num_input_tokens_seen": 9498792, "step": 16370 }, { "epoch": 2.4389335716413463, "grad_norm": 0.999785304069519, "learning_rate": 4.992672509016782e-05, "loss": 0.6981, "num_input_tokens_seen": 9502056, "step": 16375 }, { "epoch": 2.4396782841823055, "grad_norm": 1.7588586807250977, "learning_rate": 4.992647627440534e-05, "loss": 0.6585, "num_input_tokens_seen": 9504744, "step": 16380 }, { "epoch": 2.4404229967232647, "grad_norm": 1.7150801420211792, "learning_rate": 4.992622703753538e-05, "loss": 0.6588, "num_input_tokens_seen": 9507816, "step": 16385 }, { "epoch": 2.441167709264224, "grad_norm": 0.8461133241653442, "learning_rate": 4.9925977379562145e-05, "loss": 0.5868, "num_input_tokens_seen": 9510888, "step": 16390 }, { "epoch": 2.441912421805183, "grad_norm": 0.6993644833564758, "learning_rate": 4.9925727300489853e-05, "loss": 0.7084, "num_input_tokens_seen": 9513768, "step": 16395 }, { "epoch": 2.4426571343461423, "grad_norm": 1.0080219507217407, "learning_rate": 4.9925476800322735e-05, "loss": 0.7622, "num_input_tokens_seen": 9516648, "step": 16400 }, { "epoch": 2.4434018468871015, "grad_norm": 0.6802034378051758, "learning_rate": 4.992522587906501e-05, "loss": 0.6803, "num_input_tokens_seen": 9519784, "step": 16405 }, { "epoch": 2.4441465594280607, "grad_norm": 0.605742335319519, "learning_rate": 4.992497453672094e-05, "loss": 0.7145, "num_input_tokens_seen": 9522888, "step": 16410 }, { "epoch": 2.44489127196902, "grad_norm": 1.0987508296966553, "learning_rate": 4.9924722773294745e-05, "loss": 0.6815, "num_input_tokens_seen": 9525608, "step": 16415 }, { "epoch": 2.445635984509979, "grad_norm": 0.8070809245109558, "learning_rate": 4.99244705887907e-05, "loss": 0.5398, "num_input_tokens_seen": 9528840, "step": 16420 }, { "epoch": 2.4463806970509383, "grad_norm": 0.7059081792831421, "learning_rate": 4.992421798321305e-05, "loss": 0.6967, "num_input_tokens_seen": 9531752, "step": 16425 }, { "epoch": 2.4471254095918975, "grad_norm": 1.1086609363555908, "learning_rate": 4.992396495656608e-05, "loss": 0.752, "num_input_tokens_seen": 9534696, "step": 16430 }, { "epoch": 2.4478701221328567, "grad_norm": 0.9821216464042664, "learning_rate": 4.992371150885404e-05, "loss": 0.5899, "num_input_tokens_seen": 9537352, "step": 16435 }, { "epoch": 2.448614834673816, "grad_norm": 0.9020571708679199, "learning_rate": 4.9923457640081236e-05, "loss": 0.6925, "num_input_tokens_seen": 9540392, "step": 16440 }, { "epoch": 2.449359547214775, "grad_norm": 0.8836014866828918, "learning_rate": 4.992320335025194e-05, "loss": 0.6987, "num_input_tokens_seen": 9543144, "step": 16445 }, { "epoch": 2.4501042597557343, "grad_norm": 0.6745278239250183, "learning_rate": 4.992294863937046e-05, "loss": 0.7249, "num_input_tokens_seen": 9546184, "step": 16450 }, { "epoch": 2.4508489722966935, "grad_norm": 1.2976094484329224, "learning_rate": 4.9922693507441084e-05, "loss": 0.7646, "num_input_tokens_seen": 9549032, "step": 16455 }, { "epoch": 2.4515936848376527, "grad_norm": 0.6287378072738647, "learning_rate": 4.9922437954468136e-05, "loss": 0.7751, "num_input_tokens_seen": 9552296, "step": 16460 }, { "epoch": 2.452338397378612, "grad_norm": 0.9473912119865417, "learning_rate": 4.992218198045593e-05, "loss": 0.7623, "num_input_tokens_seen": 9555272, "step": 16465 }, { "epoch": 2.453083109919571, "grad_norm": 0.8854497075080872, "learning_rate": 4.992192558540879e-05, "loss": 0.6144, "num_input_tokens_seen": 9558184, "step": 16470 }, { "epoch": 2.4538278224605303, "grad_norm": 0.8220847845077515, "learning_rate": 4.992166876933105e-05, "loss": 0.6829, "num_input_tokens_seen": 9561064, "step": 16475 }, { "epoch": 2.4545725350014895, "grad_norm": 1.8497800827026367, "learning_rate": 4.9921411532227036e-05, "loss": 0.7593, "num_input_tokens_seen": 9564072, "step": 16480 }, { "epoch": 2.4553172475424487, "grad_norm": 1.1772302389144897, "learning_rate": 4.992115387410111e-05, "loss": 0.6312, "num_input_tokens_seen": 9566728, "step": 16485 }, { "epoch": 2.456061960083408, "grad_norm": 1.0777974128723145, "learning_rate": 4.992089579495762e-05, "loss": 0.6267, "num_input_tokens_seen": 9569608, "step": 16490 }, { "epoch": 2.456806672624367, "grad_norm": 0.7894755601882935, "learning_rate": 4.992063729480092e-05, "loss": 0.5296, "num_input_tokens_seen": 9572392, "step": 16495 }, { "epoch": 2.4575513851653263, "grad_norm": 0.7191529273986816, "learning_rate": 4.992037837363538e-05, "loss": 0.6694, "num_input_tokens_seen": 9575400, "step": 16500 }, { "epoch": 2.4582960977062855, "grad_norm": 1.5733308792114258, "learning_rate": 4.992011903146537e-05, "loss": 0.6851, "num_input_tokens_seen": 9578312, "step": 16505 }, { "epoch": 2.4590408102472447, "grad_norm": 0.8803525567054749, "learning_rate": 4.991985926829529e-05, "loss": 0.6651, "num_input_tokens_seen": 9581000, "step": 16510 }, { "epoch": 2.459785522788204, "grad_norm": 1.0302767753601074, "learning_rate": 4.991959908412951e-05, "loss": 0.5579, "num_input_tokens_seen": 9583688, "step": 16515 }, { "epoch": 2.4605302353291627, "grad_norm": 0.9493618607521057, "learning_rate": 4.9919338478972424e-05, "loss": 0.7016, "num_input_tokens_seen": 9587080, "step": 16520 }, { "epoch": 2.4612749478701224, "grad_norm": 0.9995103478431702, "learning_rate": 4.991907745282845e-05, "loss": 0.6578, "num_input_tokens_seen": 9589960, "step": 16525 }, { "epoch": 2.462019660411081, "grad_norm": 0.7445199489593506, "learning_rate": 4.9918816005701986e-05, "loss": 0.789, "num_input_tokens_seen": 9592936, "step": 16530 }, { "epoch": 2.4627643729520408, "grad_norm": 0.7326104640960693, "learning_rate": 4.9918554137597454e-05, "loss": 0.5739, "num_input_tokens_seen": 9595848, "step": 16535 }, { "epoch": 2.4635090854929995, "grad_norm": 1.1854056119918823, "learning_rate": 4.9918291848519275e-05, "loss": 0.7165, "num_input_tokens_seen": 9598600, "step": 16540 }, { "epoch": 2.4642537980339587, "grad_norm": 0.9100592732429504, "learning_rate": 4.991802913847188e-05, "loss": 0.63, "num_input_tokens_seen": 9601320, "step": 16545 }, { "epoch": 2.464998510574918, "grad_norm": 1.2260494232177734, "learning_rate": 4.9917766007459696e-05, "loss": 0.6582, "num_input_tokens_seen": 9604584, "step": 16550 }, { "epoch": 2.465743223115877, "grad_norm": 0.969062864780426, "learning_rate": 4.99175024554872e-05, "loss": 0.858, "num_input_tokens_seen": 9607560, "step": 16555 }, { "epoch": 2.4664879356568363, "grad_norm": 0.8880068063735962, "learning_rate": 4.991723848255881e-05, "loss": 0.7101, "num_input_tokens_seen": 9610216, "step": 16560 }, { "epoch": 2.4672326481977955, "grad_norm": 1.257699728012085, "learning_rate": 4.9916974088679015e-05, "loss": 0.6978, "num_input_tokens_seen": 9613224, "step": 16565 }, { "epoch": 2.4679773607387547, "grad_norm": 1.1106024980545044, "learning_rate": 4.991670927385226e-05, "loss": 0.5875, "num_input_tokens_seen": 9615848, "step": 16570 }, { "epoch": 2.468722073279714, "grad_norm": 1.3330903053283691, "learning_rate": 4.9916444038083024e-05, "loss": 0.8337, "num_input_tokens_seen": 9618600, "step": 16575 }, { "epoch": 2.469466785820673, "grad_norm": 1.0692243576049805, "learning_rate": 4.991617838137579e-05, "loss": 0.6544, "num_input_tokens_seen": 9621480, "step": 16580 }, { "epoch": 2.4702114983616323, "grad_norm": 1.9955568313598633, "learning_rate": 4.991591230373505e-05, "loss": 0.8163, "num_input_tokens_seen": 9624488, "step": 16585 }, { "epoch": 2.4709562109025915, "grad_norm": 1.0445153713226318, "learning_rate": 4.99156458051653e-05, "loss": 0.593, "num_input_tokens_seen": 9627720, "step": 16590 }, { "epoch": 2.4717009234435507, "grad_norm": 2.460791826248169, "learning_rate": 4.9915378885671026e-05, "loss": 0.7143, "num_input_tokens_seen": 9630856, "step": 16595 }, { "epoch": 2.47244563598451, "grad_norm": 0.66978919506073, "learning_rate": 4.9915111545256754e-05, "loss": 0.685, "num_input_tokens_seen": 9633928, "step": 16600 }, { "epoch": 2.473190348525469, "grad_norm": 0.6641314625740051, "learning_rate": 4.991484378392699e-05, "loss": 0.6598, "num_input_tokens_seen": 9637064, "step": 16605 }, { "epoch": 2.4739350610664284, "grad_norm": 1.055429220199585, "learning_rate": 4.9914575601686266e-05, "loss": 0.7351, "num_input_tokens_seen": 9640168, "step": 16610 }, { "epoch": 2.4746797736073876, "grad_norm": 1.2227002382278442, "learning_rate": 4.9914306998539115e-05, "loss": 0.7599, "num_input_tokens_seen": 9642888, "step": 16615 }, { "epoch": 2.4754244861483468, "grad_norm": 0.739469051361084, "learning_rate": 4.991403797449006e-05, "loss": 0.7245, "num_input_tokens_seen": 9645832, "step": 16620 }, { "epoch": 2.476169198689306, "grad_norm": 0.8659328818321228, "learning_rate": 4.9913768529543656e-05, "loss": 0.8771, "num_input_tokens_seen": 9648680, "step": 16625 }, { "epoch": 2.476913911230265, "grad_norm": 1.3054560422897339, "learning_rate": 4.991349866370446e-05, "loss": 0.5438, "num_input_tokens_seen": 9651304, "step": 16630 }, { "epoch": 2.4776586237712244, "grad_norm": 1.1745213270187378, "learning_rate": 4.9913228376977026e-05, "loss": 0.7905, "num_input_tokens_seen": 9654184, "step": 16635 }, { "epoch": 2.4784033363121836, "grad_norm": 1.1107159852981567, "learning_rate": 4.991295766936591e-05, "loss": 0.6195, "num_input_tokens_seen": 9657000, "step": 16640 }, { "epoch": 2.4791480488531428, "grad_norm": 0.4649486839771271, "learning_rate": 4.9912686540875696e-05, "loss": 0.684, "num_input_tokens_seen": 9660040, "step": 16645 }, { "epoch": 2.479892761394102, "grad_norm": 0.7980941534042358, "learning_rate": 4.991241499151097e-05, "loss": 0.5016, "num_input_tokens_seen": 9662984, "step": 16650 }, { "epoch": 2.480637473935061, "grad_norm": 3.1905078887939453, "learning_rate": 4.9912143021276306e-05, "loss": 0.8648, "num_input_tokens_seen": 9665704, "step": 16655 }, { "epoch": 2.4813821864760204, "grad_norm": 1.1936485767364502, "learning_rate": 4.991187063017631e-05, "loss": 0.7802, "num_input_tokens_seen": 9668296, "step": 16660 }, { "epoch": 2.4821268990169796, "grad_norm": 0.9370390772819519, "learning_rate": 4.9911597818215575e-05, "loss": 0.6345, "num_input_tokens_seen": 9671112, "step": 16665 }, { "epoch": 2.482871611557939, "grad_norm": 0.7547602653503418, "learning_rate": 4.9911324585398724e-05, "loss": 0.6727, "num_input_tokens_seen": 9674248, "step": 16670 }, { "epoch": 2.483616324098898, "grad_norm": 0.8336135745048523, "learning_rate": 4.9911050931730354e-05, "loss": 0.6598, "num_input_tokens_seen": 9677608, "step": 16675 }, { "epoch": 2.484361036639857, "grad_norm": 0.8389023542404175, "learning_rate": 4.9910776857215094e-05, "loss": 0.7901, "num_input_tokens_seen": 9680712, "step": 16680 }, { "epoch": 2.4851057491808164, "grad_norm": 0.9426432847976685, "learning_rate": 4.991050236185758e-05, "loss": 0.5537, "num_input_tokens_seen": 9683496, "step": 16685 }, { "epoch": 2.4858504617217756, "grad_norm": 0.6323997378349304, "learning_rate": 4.991022744566245e-05, "loss": 0.6654, "num_input_tokens_seen": 9686344, "step": 16690 }, { "epoch": 2.4865951742627344, "grad_norm": 0.7872471809387207, "learning_rate": 4.990995210863434e-05, "loss": 0.6816, "num_input_tokens_seen": 9689576, "step": 16695 }, { "epoch": 2.487339886803694, "grad_norm": 1.1404213905334473, "learning_rate": 4.9909676350777914e-05, "loss": 0.6981, "num_input_tokens_seen": 9692136, "step": 16700 }, { "epoch": 2.4880845993446528, "grad_norm": 0.9955978989601135, "learning_rate": 4.990940017209782e-05, "loss": 0.6932, "num_input_tokens_seen": 9694856, "step": 16705 }, { "epoch": 2.488829311885612, "grad_norm": 0.5447946190834045, "learning_rate": 4.990912357259872e-05, "loss": 0.9016, "num_input_tokens_seen": 9697832, "step": 16710 }, { "epoch": 2.489574024426571, "grad_norm": 1.5151244401931763, "learning_rate": 4.99088465522853e-05, "loss": 0.6996, "num_input_tokens_seen": 9700456, "step": 16715 }, { "epoch": 2.4903187369675304, "grad_norm": 1.2506126165390015, "learning_rate": 4.9908569111162226e-05, "loss": 0.7133, "num_input_tokens_seen": 9703272, "step": 16720 }, { "epoch": 2.4910634495084896, "grad_norm": 0.7462703585624695, "learning_rate": 4.9908291249234207e-05, "loss": 0.6636, "num_input_tokens_seen": 9706056, "step": 16725 }, { "epoch": 2.4918081620494488, "grad_norm": 1.3342411518096924, "learning_rate": 4.990801296650591e-05, "loss": 0.7236, "num_input_tokens_seen": 9708712, "step": 16730 }, { "epoch": 2.492552874590408, "grad_norm": 0.6694603562355042, "learning_rate": 4.9907734262982056e-05, "loss": 0.6697, "num_input_tokens_seen": 9711176, "step": 16735 }, { "epoch": 2.493297587131367, "grad_norm": 2.1413369178771973, "learning_rate": 4.990745513866735e-05, "loss": 0.7522, "num_input_tokens_seen": 9713768, "step": 16740 }, { "epoch": 2.4940422996723264, "grad_norm": 1.0505603551864624, "learning_rate": 4.99071755935665e-05, "loss": 0.6154, "num_input_tokens_seen": 9716296, "step": 16745 }, { "epoch": 2.4947870122132856, "grad_norm": 0.7484943866729736, "learning_rate": 4.990689562768423e-05, "loss": 0.585, "num_input_tokens_seen": 9719144, "step": 16750 }, { "epoch": 2.495531724754245, "grad_norm": 1.3749793767929077, "learning_rate": 4.990661524102528e-05, "loss": 0.7133, "num_input_tokens_seen": 9722472, "step": 16755 }, { "epoch": 2.496276437295204, "grad_norm": 1.3346116542816162, "learning_rate": 4.990633443359437e-05, "loss": 0.5153, "num_input_tokens_seen": 9725288, "step": 16760 }, { "epoch": 2.497021149836163, "grad_norm": 0.8276724815368652, "learning_rate": 4.990605320539626e-05, "loss": 0.7161, "num_input_tokens_seen": 9728136, "step": 16765 }, { "epoch": 2.4977658623771224, "grad_norm": 0.8612298369407654, "learning_rate": 4.990577155643569e-05, "loss": 0.7177, "num_input_tokens_seen": 9730984, "step": 16770 }, { "epoch": 2.4985105749180816, "grad_norm": 0.9457170963287354, "learning_rate": 4.9905489486717425e-05, "loss": 0.7359, "num_input_tokens_seen": 9733864, "step": 16775 }, { "epoch": 2.499255287459041, "grad_norm": 0.7336069345474243, "learning_rate": 4.990520699624623e-05, "loss": 0.7592, "num_input_tokens_seen": 9736712, "step": 16780 }, { "epoch": 2.5, "grad_norm": 0.8630470633506775, "learning_rate": 4.990492408502687e-05, "loss": 0.7341, "num_input_tokens_seen": 9739400, "step": 16785 }, { "epoch": 2.500744712540959, "grad_norm": 1.1464804410934448, "learning_rate": 4.9904640753064135e-05, "loss": 0.7842, "num_input_tokens_seen": 9742216, "step": 16790 }, { "epoch": 2.5014894250819184, "grad_norm": 0.7820388078689575, "learning_rate": 4.99043570003628e-05, "loss": 0.6623, "num_input_tokens_seen": 9745224, "step": 16795 }, { "epoch": 2.5022341376228776, "grad_norm": 1.3799864053726196, "learning_rate": 4.990407282692767e-05, "loss": 0.7588, "num_input_tokens_seen": 9748328, "step": 16800 }, { "epoch": 2.502978850163837, "grad_norm": 0.5906170606613159, "learning_rate": 4.9903788232763545e-05, "loss": 0.586, "num_input_tokens_seen": 9751048, "step": 16805 }, { "epoch": 2.503723562704796, "grad_norm": 1.1024185419082642, "learning_rate": 4.9903503217875227e-05, "loss": 0.7459, "num_input_tokens_seen": 9753704, "step": 16810 }, { "epoch": 2.504468275245755, "grad_norm": 1.0126394033432007, "learning_rate": 4.990321778226753e-05, "loss": 0.6625, "num_input_tokens_seen": 9756456, "step": 16815 }, { "epoch": 2.5052129877867144, "grad_norm": 0.7522115707397461, "learning_rate": 4.990293192594527e-05, "loss": 0.6265, "num_input_tokens_seen": 9759400, "step": 16820 }, { "epoch": 2.5059577003276736, "grad_norm": 0.9986379146575928, "learning_rate": 4.9902645648913305e-05, "loss": 0.8202, "num_input_tokens_seen": 9762120, "step": 16825 }, { "epoch": 2.506702412868633, "grad_norm": 0.6545566916465759, "learning_rate": 4.990235895117644e-05, "loss": 0.7933, "num_input_tokens_seen": 9765096, "step": 16830 }, { "epoch": 2.507447125409592, "grad_norm": 0.741805911064148, "learning_rate": 4.990207183273953e-05, "loss": 0.5484, "num_input_tokens_seen": 9767944, "step": 16835 }, { "epoch": 2.5081918379505512, "grad_norm": 1.7585529088974, "learning_rate": 4.9901784293607424e-05, "loss": 0.7145, "num_input_tokens_seen": 9770696, "step": 16840 }, { "epoch": 2.5089365504915104, "grad_norm": 1.1746091842651367, "learning_rate": 4.990149633378499e-05, "loss": 0.6383, "num_input_tokens_seen": 9773384, "step": 16845 }, { "epoch": 2.509681263032469, "grad_norm": 0.8338578939437866, "learning_rate": 4.990120795327707e-05, "loss": 0.6687, "num_input_tokens_seen": 9776232, "step": 16850 }, { "epoch": 2.510425975573429, "grad_norm": 0.9384539127349854, "learning_rate": 4.990091915208857e-05, "loss": 0.5745, "num_input_tokens_seen": 9778952, "step": 16855 }, { "epoch": 2.5111706881143876, "grad_norm": 0.7414059042930603, "learning_rate": 4.990062993022433e-05, "loss": 0.562, "num_input_tokens_seen": 9781864, "step": 16860 }, { "epoch": 2.5119154006553472, "grad_norm": 1.4399054050445557, "learning_rate": 4.990034028768927e-05, "loss": 0.5742, "num_input_tokens_seen": 9784936, "step": 16865 }, { "epoch": 2.512660113196306, "grad_norm": 0.7813630104064941, "learning_rate": 4.990005022448826e-05, "loss": 0.6269, "num_input_tokens_seen": 9787976, "step": 16870 }, { "epoch": 2.5134048257372656, "grad_norm": 0.9292622804641724, "learning_rate": 4.989975974062621e-05, "loss": 0.5811, "num_input_tokens_seen": 9790952, "step": 16875 }, { "epoch": 2.5141495382782244, "grad_norm": 0.5464823842048645, "learning_rate": 4.989946883610803e-05, "loss": 0.7501, "num_input_tokens_seen": 9793832, "step": 16880 }, { "epoch": 2.514894250819184, "grad_norm": 0.6831194758415222, "learning_rate": 4.9899177510938624e-05, "loss": 0.7947, "num_input_tokens_seen": 9796872, "step": 16885 }, { "epoch": 2.515638963360143, "grad_norm": 1.9522708654403687, "learning_rate": 4.9898885765122927e-05, "loss": 0.6648, "num_input_tokens_seen": 9799912, "step": 16890 }, { "epoch": 2.516383675901102, "grad_norm": 1.8662642240524292, "learning_rate": 4.9898593598665856e-05, "loss": 0.7325, "num_input_tokens_seen": 9802696, "step": 16895 }, { "epoch": 2.517128388442061, "grad_norm": 0.8394646644592285, "learning_rate": 4.989830101157235e-05, "loss": 0.6612, "num_input_tokens_seen": 9805384, "step": 16900 }, { "epoch": 2.5178731009830204, "grad_norm": 1.5711917877197266, "learning_rate": 4.9898008003847363e-05, "loss": 0.8096, "num_input_tokens_seen": 9808616, "step": 16905 }, { "epoch": 2.5186178135239796, "grad_norm": 0.9566442966461182, "learning_rate": 4.989771457549582e-05, "loss": 0.6999, "num_input_tokens_seen": 9811720, "step": 16910 }, { "epoch": 2.519362526064939, "grad_norm": 1.3490180969238281, "learning_rate": 4.989742072652271e-05, "loss": 0.6127, "num_input_tokens_seen": 9814536, "step": 16915 }, { "epoch": 2.520107238605898, "grad_norm": 1.1031666994094849, "learning_rate": 4.989712645693297e-05, "loss": 0.5476, "num_input_tokens_seen": 9817256, "step": 16920 }, { "epoch": 2.5208519511468572, "grad_norm": 1.0450512170791626, "learning_rate": 4.98968317667316e-05, "loss": 0.6389, "num_input_tokens_seen": 9820328, "step": 16925 }, { "epoch": 2.5215966636878164, "grad_norm": 0.6418074369430542, "learning_rate": 4.989653665592355e-05, "loss": 0.64, "num_input_tokens_seen": 9823304, "step": 16930 }, { "epoch": 2.5223413762287756, "grad_norm": 0.9214152693748474, "learning_rate": 4.989624112451381e-05, "loss": 0.7269, "num_input_tokens_seen": 9825992, "step": 16935 }, { "epoch": 2.523086088769735, "grad_norm": 0.8405261635780334, "learning_rate": 4.989594517250739e-05, "loss": 0.5972, "num_input_tokens_seen": 9828936, "step": 16940 }, { "epoch": 2.523830801310694, "grad_norm": 1.5879813432693481, "learning_rate": 4.989564879990928e-05, "loss": 0.7898, "num_input_tokens_seen": 9831656, "step": 16945 }, { "epoch": 2.5245755138516532, "grad_norm": 0.9310503602027893, "learning_rate": 4.9895352006724485e-05, "loss": 0.6187, "num_input_tokens_seen": 9834632, "step": 16950 }, { "epoch": 2.5253202263926124, "grad_norm": 0.8959015607833862, "learning_rate": 4.9895054792958015e-05, "loss": 0.776, "num_input_tokens_seen": 9837576, "step": 16955 }, { "epoch": 2.5260649389335716, "grad_norm": 0.9074065089225769, "learning_rate": 4.98947571586149e-05, "loss": 0.8095, "num_input_tokens_seen": 9840616, "step": 16960 }, { "epoch": 2.526809651474531, "grad_norm": 1.0663930177688599, "learning_rate": 4.9894459103700166e-05, "loss": 0.7051, "num_input_tokens_seen": 9843720, "step": 16965 }, { "epoch": 2.52755436401549, "grad_norm": 0.7103818655014038, "learning_rate": 4.989416062821884e-05, "loss": 0.6535, "num_input_tokens_seen": 9846728, "step": 16970 }, { "epoch": 2.5282990765564493, "grad_norm": 1.316330909729004, "learning_rate": 4.989386173217598e-05, "loss": 0.7078, "num_input_tokens_seen": 9849608, "step": 16975 }, { "epoch": 2.5290437890974085, "grad_norm": 0.7260589599609375, "learning_rate": 4.989356241557662e-05, "loss": 0.7154, "num_input_tokens_seen": 9852392, "step": 16980 }, { "epoch": 2.5297885016383677, "grad_norm": 0.7314897179603577, "learning_rate": 4.989326267842583e-05, "loss": 0.7263, "num_input_tokens_seen": 9855048, "step": 16985 }, { "epoch": 2.530533214179327, "grad_norm": 0.6418606638908386, "learning_rate": 4.9892962520728664e-05, "loss": 0.7626, "num_input_tokens_seen": 9858056, "step": 16990 }, { "epoch": 2.531277926720286, "grad_norm": 0.9059669375419617, "learning_rate": 4.989266194249019e-05, "loss": 0.628, "num_input_tokens_seen": 9860776, "step": 16995 }, { "epoch": 2.5320226392612453, "grad_norm": 0.7392818331718445, "learning_rate": 4.989236094371551e-05, "loss": 0.5947, "num_input_tokens_seen": 9864008, "step": 17000 }, { "epoch": 2.5327673518022045, "grad_norm": 0.9942111968994141, "learning_rate": 4.9892059524409676e-05, "loss": 0.687, "num_input_tokens_seen": 9867144, "step": 17005 }, { "epoch": 2.5335120643431637, "grad_norm": 0.8365013003349304, "learning_rate": 4.98917576845778e-05, "loss": 0.6236, "num_input_tokens_seen": 9869832, "step": 17010 }, { "epoch": 2.534256776884123, "grad_norm": 0.8381688594818115, "learning_rate": 4.989145542422498e-05, "loss": 0.5147, "num_input_tokens_seen": 9872936, "step": 17015 }, { "epoch": 2.535001489425082, "grad_norm": 0.56973797082901, "learning_rate": 4.989115274335632e-05, "loss": 0.5929, "num_input_tokens_seen": 9875624, "step": 17020 }, { "epoch": 2.535746201966041, "grad_norm": 0.781199038028717, "learning_rate": 4.9890849641976924e-05, "loss": 0.6734, "num_input_tokens_seen": 9878472, "step": 17025 }, { "epoch": 2.5364909145070005, "grad_norm": 1.0893542766571045, "learning_rate": 4.989054612009192e-05, "loss": 0.6273, "num_input_tokens_seen": 9881320, "step": 17030 }, { "epoch": 2.5372356270479592, "grad_norm": 0.7519622445106506, "learning_rate": 4.989024217770645e-05, "loss": 0.607, "num_input_tokens_seen": 9884072, "step": 17035 }, { "epoch": 2.537980339588919, "grad_norm": 0.7606238722801208, "learning_rate": 4.988993781482563e-05, "loss": 0.7306, "num_input_tokens_seen": 9886984, "step": 17040 }, { "epoch": 2.5387250521298776, "grad_norm": 1.0276128053665161, "learning_rate": 4.9889633031454604e-05, "loss": 0.7706, "num_input_tokens_seen": 9889960, "step": 17045 }, { "epoch": 2.5394697646708373, "grad_norm": 1.1102007627487183, "learning_rate": 4.9889327827598526e-05, "loss": 0.6242, "num_input_tokens_seen": 9892744, "step": 17050 }, { "epoch": 2.540214477211796, "grad_norm": 0.9111922383308411, "learning_rate": 4.988902220326255e-05, "loss": 0.6998, "num_input_tokens_seen": 9895656, "step": 17055 }, { "epoch": 2.5409591897527557, "grad_norm": 0.5168132781982422, "learning_rate": 4.9888716158451844e-05, "loss": 0.5748, "num_input_tokens_seen": 9898664, "step": 17060 }, { "epoch": 2.5417039022937145, "grad_norm": 0.6940692663192749, "learning_rate": 4.988840969317157e-05, "loss": 0.6601, "num_input_tokens_seen": 9901640, "step": 17065 }, { "epoch": 2.5424486148346737, "grad_norm": 1.6139298677444458, "learning_rate": 4.988810280742691e-05, "loss": 0.7118, "num_input_tokens_seen": 9904616, "step": 17070 }, { "epoch": 2.543193327375633, "grad_norm": 0.9269105195999146, "learning_rate": 4.988779550122305e-05, "loss": 0.7011, "num_input_tokens_seen": 9907240, "step": 17075 }, { "epoch": 2.543938039916592, "grad_norm": 1.2615370750427246, "learning_rate": 4.9887487774565176e-05, "loss": 0.7071, "num_input_tokens_seen": 9910440, "step": 17080 }, { "epoch": 2.5446827524575513, "grad_norm": 0.598221480846405, "learning_rate": 4.98871796274585e-05, "loss": 0.7325, "num_input_tokens_seen": 9913672, "step": 17085 }, { "epoch": 2.5454274649985105, "grad_norm": 0.7471570372581482, "learning_rate": 4.9886871059908213e-05, "loss": 0.6989, "num_input_tokens_seen": 9916584, "step": 17090 }, { "epoch": 2.5461721775394697, "grad_norm": 0.8832478523254395, "learning_rate": 4.988656207191953e-05, "loss": 0.8003, "num_input_tokens_seen": 9919432, "step": 17095 }, { "epoch": 2.546916890080429, "grad_norm": 0.7697012424468994, "learning_rate": 4.988625266349768e-05, "loss": 0.7318, "num_input_tokens_seen": 9922184, "step": 17100 }, { "epoch": 2.547661602621388, "grad_norm": 1.213279366493225, "learning_rate": 4.988594283464788e-05, "loss": 0.787, "num_input_tokens_seen": 9925064, "step": 17105 }, { "epoch": 2.5484063151623473, "grad_norm": 0.8944819569587708, "learning_rate": 4.988563258537537e-05, "loss": 0.6794, "num_input_tokens_seen": 9928040, "step": 17110 }, { "epoch": 2.5491510277033065, "grad_norm": 0.837315559387207, "learning_rate": 4.988532191568539e-05, "loss": 0.7499, "num_input_tokens_seen": 9930824, "step": 17115 }, { "epoch": 2.5498957402442657, "grad_norm": 1.0222779512405396, "learning_rate": 4.988501082558319e-05, "loss": 0.639, "num_input_tokens_seen": 9933480, "step": 17120 }, { "epoch": 2.550640452785225, "grad_norm": 0.6492897272109985, "learning_rate": 4.988469931507402e-05, "loss": 0.7539, "num_input_tokens_seen": 9936456, "step": 17125 }, { "epoch": 2.551385165326184, "grad_norm": 1.0778107643127441, "learning_rate": 4.988438738416316e-05, "loss": 0.7217, "num_input_tokens_seen": 9939368, "step": 17130 }, { "epoch": 2.5521298778671433, "grad_norm": 0.9721240401268005, "learning_rate": 4.988407503285585e-05, "loss": 0.6197, "num_input_tokens_seen": 9942216, "step": 17135 }, { "epoch": 2.5528745904081025, "grad_norm": 0.8057897686958313, "learning_rate": 4.988376226115739e-05, "loss": 0.6907, "num_input_tokens_seen": 9944904, "step": 17140 }, { "epoch": 2.5536193029490617, "grad_norm": 1.0386629104614258, "learning_rate": 4.9883449069073055e-05, "loss": 0.6588, "num_input_tokens_seen": 9947752, "step": 17145 }, { "epoch": 2.554364015490021, "grad_norm": 1.1356983184814453, "learning_rate": 4.9883135456608146e-05, "loss": 0.6365, "num_input_tokens_seen": 9950536, "step": 17150 }, { "epoch": 2.55510872803098, "grad_norm": 0.7557546496391296, "learning_rate": 4.988282142376795e-05, "loss": 0.6612, "num_input_tokens_seen": 9953448, "step": 17155 }, { "epoch": 2.5558534405719393, "grad_norm": 1.1629332304000854, "learning_rate": 4.9882506970557766e-05, "loss": 0.6873, "num_input_tokens_seen": 9956424, "step": 17160 }, { "epoch": 2.5565981531128985, "grad_norm": 1.9987447261810303, "learning_rate": 4.988219209698293e-05, "loss": 0.6901, "num_input_tokens_seen": 9959176, "step": 17165 }, { "epoch": 2.5573428656538577, "grad_norm": 1.4175899028778076, "learning_rate": 4.988187680304874e-05, "loss": 0.704, "num_input_tokens_seen": 9962568, "step": 17170 }, { "epoch": 2.558087578194817, "grad_norm": 1.1107864379882812, "learning_rate": 4.988156108876053e-05, "loss": 0.7979, "num_input_tokens_seen": 9965672, "step": 17175 }, { "epoch": 2.558832290735776, "grad_norm": 1.740028977394104, "learning_rate": 4.988124495412364e-05, "loss": 0.6043, "num_input_tokens_seen": 9968520, "step": 17180 }, { "epoch": 2.5595770032767353, "grad_norm": 0.9228881597518921, "learning_rate": 4.98809283991434e-05, "loss": 0.6178, "num_input_tokens_seen": 9971240, "step": 17185 }, { "epoch": 2.5603217158176945, "grad_norm": 0.9403863549232483, "learning_rate": 4.988061142382516e-05, "loss": 0.6971, "num_input_tokens_seen": 9974120, "step": 17190 }, { "epoch": 2.5610664283586537, "grad_norm": 1.0718449354171753, "learning_rate": 4.988029402817428e-05, "loss": 0.7035, "num_input_tokens_seen": 9977192, "step": 17195 }, { "epoch": 2.5618111408996125, "grad_norm": 0.6535166501998901, "learning_rate": 4.9879976212196124e-05, "loss": 0.6012, "num_input_tokens_seen": 9979848, "step": 17200 }, { "epoch": 2.562555853440572, "grad_norm": 0.7110390067100525, "learning_rate": 4.987965797589605e-05, "loss": 0.6436, "num_input_tokens_seen": 9982984, "step": 17205 }, { "epoch": 2.563300565981531, "grad_norm": 0.9340125322341919, "learning_rate": 4.987933931927944e-05, "loss": 0.6068, "num_input_tokens_seen": 9985960, "step": 17210 }, { "epoch": 2.5640452785224905, "grad_norm": 2.4261279106140137, "learning_rate": 4.987902024235169e-05, "loss": 0.7475, "num_input_tokens_seen": 9988872, "step": 17215 }, { "epoch": 2.5647899910634493, "grad_norm": 0.6158744692802429, "learning_rate": 4.987870074511817e-05, "loss": 0.6218, "num_input_tokens_seen": 9991720, "step": 17220 }, { "epoch": 2.565534703604409, "grad_norm": 0.7125300168991089, "learning_rate": 4.987838082758428e-05, "loss": 0.6824, "num_input_tokens_seen": 9994472, "step": 17225 }, { "epoch": 2.5662794161453677, "grad_norm": 0.6887785196304321, "learning_rate": 4.987806048975544e-05, "loss": 0.7117, "num_input_tokens_seen": 9997128, "step": 17230 }, { "epoch": 2.5670241286863273, "grad_norm": 0.9061623215675354, "learning_rate": 4.987773973163706e-05, "loss": 0.6054, "num_input_tokens_seen": 9999848, "step": 17235 }, { "epoch": 2.567768841227286, "grad_norm": 0.9968768358230591, "learning_rate": 4.987741855323454e-05, "loss": 0.6936, "num_input_tokens_seen": 10002664, "step": 17240 }, { "epoch": 2.5685135537682453, "grad_norm": 0.6041329503059387, "learning_rate": 4.9877096954553324e-05, "loss": 0.5634, "num_input_tokens_seen": 10005320, "step": 17245 }, { "epoch": 2.5692582663092045, "grad_norm": 0.7467694282531738, "learning_rate": 4.987677493559883e-05, "loss": 0.6261, "num_input_tokens_seen": 10008232, "step": 17250 }, { "epoch": 2.5700029788501637, "grad_norm": 1.0276678800582886, "learning_rate": 4.987645249637652e-05, "loss": 0.633, "num_input_tokens_seen": 10011112, "step": 17255 }, { "epoch": 2.570747691391123, "grad_norm": 1.5793712139129639, "learning_rate": 4.987612963689182e-05, "loss": 0.7824, "num_input_tokens_seen": 10014312, "step": 17260 }, { "epoch": 2.571492403932082, "grad_norm": 0.8714131116867065, "learning_rate": 4.987580635715019e-05, "loss": 0.683, "num_input_tokens_seen": 10017320, "step": 17265 }, { "epoch": 2.5722371164730413, "grad_norm": 1.173873782157898, "learning_rate": 4.98754826571571e-05, "loss": 0.6899, "num_input_tokens_seen": 10020520, "step": 17270 }, { "epoch": 2.5729818290140005, "grad_norm": 0.795539140701294, "learning_rate": 4.9875158536918015e-05, "loss": 0.8408, "num_input_tokens_seen": 10023720, "step": 17275 }, { "epoch": 2.5737265415549597, "grad_norm": 1.2289727926254272, "learning_rate": 4.987483399643841e-05, "loss": 0.6325, "num_input_tokens_seen": 10026760, "step": 17280 }, { "epoch": 2.574471254095919, "grad_norm": 0.7933492064476013, "learning_rate": 4.987450903572376e-05, "loss": 0.756, "num_input_tokens_seen": 10029640, "step": 17285 }, { "epoch": 2.575215966636878, "grad_norm": 1.0728718042373657, "learning_rate": 4.987418365477956e-05, "loss": 0.7039, "num_input_tokens_seen": 10032744, "step": 17290 }, { "epoch": 2.5759606791778373, "grad_norm": 0.7012736797332764, "learning_rate": 4.987385785361131e-05, "loss": 0.7372, "num_input_tokens_seen": 10035816, "step": 17295 }, { "epoch": 2.5767053917187965, "grad_norm": 1.020298719406128, "learning_rate": 4.987353163222451e-05, "loss": 0.5878, "num_input_tokens_seen": 10038792, "step": 17300 }, { "epoch": 2.5774501042597557, "grad_norm": 1.5412901639938354, "learning_rate": 4.9873204990624676e-05, "loss": 0.8254, "num_input_tokens_seen": 10042056, "step": 17305 }, { "epoch": 2.578194816800715, "grad_norm": 0.9346829652786255, "learning_rate": 4.987287792881733e-05, "loss": 0.673, "num_input_tokens_seen": 10044968, "step": 17310 }, { "epoch": 2.578939529341674, "grad_norm": 1.1037003993988037, "learning_rate": 4.9872550446807986e-05, "loss": 0.5839, "num_input_tokens_seen": 10047816, "step": 17315 }, { "epoch": 2.5796842418826333, "grad_norm": 0.7135063409805298, "learning_rate": 4.987222254460218e-05, "loss": 0.6755, "num_input_tokens_seen": 10050728, "step": 17320 }, { "epoch": 2.5804289544235925, "grad_norm": 1.6289492845535278, "learning_rate": 4.987189422220545e-05, "loss": 0.7156, "num_input_tokens_seen": 10053736, "step": 17325 }, { "epoch": 2.5811736669645517, "grad_norm": 0.9995360374450684, "learning_rate": 4.987156547962335e-05, "loss": 0.5987, "num_input_tokens_seen": 10056424, "step": 17330 }, { "epoch": 2.581918379505511, "grad_norm": 0.9602022767066956, "learning_rate": 4.987123631686143e-05, "loss": 0.6536, "num_input_tokens_seen": 10059496, "step": 17335 }, { "epoch": 2.58266309204647, "grad_norm": 0.738562285900116, "learning_rate": 4.987090673392525e-05, "loss": 0.6073, "num_input_tokens_seen": 10062248, "step": 17340 }, { "epoch": 2.5834078045874294, "grad_norm": 1.0069321393966675, "learning_rate": 4.987057673082038e-05, "loss": 0.585, "num_input_tokens_seen": 10065128, "step": 17345 }, { "epoch": 2.5841525171283886, "grad_norm": 0.8954365253448486, "learning_rate": 4.987024630755239e-05, "loss": 0.5086, "num_input_tokens_seen": 10068072, "step": 17350 }, { "epoch": 2.5848972296693478, "grad_norm": 0.6948533654212952, "learning_rate": 4.986991546412687e-05, "loss": 0.6344, "num_input_tokens_seen": 10071080, "step": 17355 }, { "epoch": 2.585641942210307, "grad_norm": 1.0513644218444824, "learning_rate": 4.98695842005494e-05, "loss": 0.7078, "num_input_tokens_seen": 10073832, "step": 17360 }, { "epoch": 2.586386654751266, "grad_norm": 1.0053385496139526, "learning_rate": 4.9869252516825585e-05, "loss": 0.8359, "num_input_tokens_seen": 10076840, "step": 17365 }, { "epoch": 2.5871313672922254, "grad_norm": 2.16042423248291, "learning_rate": 4.986892041296102e-05, "loss": 0.8845, "num_input_tokens_seen": 10079560, "step": 17370 }, { "epoch": 2.587876079833184, "grad_norm": 0.6620516180992126, "learning_rate": 4.9868587888961325e-05, "loss": 0.7559, "num_input_tokens_seen": 10082568, "step": 17375 }, { "epoch": 2.5886207923741438, "grad_norm": 0.8292832970619202, "learning_rate": 4.986825494483211e-05, "loss": 0.6775, "num_input_tokens_seen": 10085352, "step": 17380 }, { "epoch": 2.5893655049151025, "grad_norm": 1.328607439994812, "learning_rate": 4.9867921580579e-05, "loss": 0.8517, "num_input_tokens_seen": 10088232, "step": 17385 }, { "epoch": 2.590110217456062, "grad_norm": 0.8745736479759216, "learning_rate": 4.986758779620764e-05, "loss": 0.5865, "num_input_tokens_seen": 10091272, "step": 17390 }, { "epoch": 2.590854929997021, "grad_norm": 0.6813538670539856, "learning_rate": 4.986725359172365e-05, "loss": 0.7102, "num_input_tokens_seen": 10094088, "step": 17395 }, { "epoch": 2.5915996425379806, "grad_norm": 0.791126549243927, "learning_rate": 4.986691896713269e-05, "loss": 0.8354, "num_input_tokens_seen": 10096808, "step": 17400 }, { "epoch": 2.5923443550789393, "grad_norm": 0.7971041202545166, "learning_rate": 4.98665839224404e-05, "loss": 0.6827, "num_input_tokens_seen": 10099752, "step": 17405 }, { "epoch": 2.593089067619899, "grad_norm": 0.8504270911216736, "learning_rate": 4.9866248457652455e-05, "loss": 0.808, "num_input_tokens_seen": 10102632, "step": 17410 }, { "epoch": 2.5938337801608577, "grad_norm": 1.192004680633545, "learning_rate": 4.986591257277451e-05, "loss": 0.7253, "num_input_tokens_seen": 10105480, "step": 17415 }, { "epoch": 2.594578492701817, "grad_norm": 0.7283084392547607, "learning_rate": 4.986557626781224e-05, "loss": 0.6494, "num_input_tokens_seen": 10108616, "step": 17420 }, { "epoch": 2.595323205242776, "grad_norm": 1.6193591356277466, "learning_rate": 4.9865239542771345e-05, "loss": 0.7732, "num_input_tokens_seen": 10111560, "step": 17425 }, { "epoch": 2.5960679177837354, "grad_norm": 0.7340168952941895, "learning_rate": 4.986490239765749e-05, "loss": 0.6957, "num_input_tokens_seen": 10114824, "step": 17430 }, { "epoch": 2.5968126303246946, "grad_norm": 1.2781174182891846, "learning_rate": 4.9864564832476386e-05, "loss": 0.7201, "num_input_tokens_seen": 10117768, "step": 17435 }, { "epoch": 2.5975573428656538, "grad_norm": 0.6950890421867371, "learning_rate": 4.986422684723373e-05, "loss": 0.7819, "num_input_tokens_seen": 10121064, "step": 17440 }, { "epoch": 2.598302055406613, "grad_norm": 0.6807783246040344, "learning_rate": 4.986388844193523e-05, "loss": 0.7965, "num_input_tokens_seen": 10123912, "step": 17445 }, { "epoch": 2.599046767947572, "grad_norm": 0.7411789894104004, "learning_rate": 4.9863549616586604e-05, "loss": 0.6532, "num_input_tokens_seen": 10126760, "step": 17450 }, { "epoch": 2.5997914804885314, "grad_norm": 0.6080256700515747, "learning_rate": 4.986321037119358e-05, "loss": 0.5388, "num_input_tokens_seen": 10129512, "step": 17455 }, { "epoch": 2.6005361930294906, "grad_norm": 1.0125422477722168, "learning_rate": 4.986287070576188e-05, "loss": 0.8398, "num_input_tokens_seen": 10132424, "step": 17460 }, { "epoch": 2.6012809055704498, "grad_norm": 1.0245654582977295, "learning_rate": 4.986253062029725e-05, "loss": 0.6545, "num_input_tokens_seen": 10135304, "step": 17465 }, { "epoch": 2.602025618111409, "grad_norm": 0.6069490909576416, "learning_rate": 4.986219011480544e-05, "loss": 0.6401, "num_input_tokens_seen": 10137864, "step": 17470 }, { "epoch": 2.602770330652368, "grad_norm": 0.6190932393074036, "learning_rate": 4.98618491892922e-05, "loss": 0.7152, "num_input_tokens_seen": 10140648, "step": 17475 }, { "epoch": 2.6035150431933274, "grad_norm": 0.8385153412818909, "learning_rate": 4.986150784376328e-05, "loss": 0.7163, "num_input_tokens_seen": 10143464, "step": 17480 }, { "epoch": 2.6042597557342866, "grad_norm": 0.5048978328704834, "learning_rate": 4.986116607822445e-05, "loss": 0.7083, "num_input_tokens_seen": 10146536, "step": 17485 }, { "epoch": 2.605004468275246, "grad_norm": 0.6069284081459045, "learning_rate": 4.9860823892681496e-05, "loss": 0.6708, "num_input_tokens_seen": 10149480, "step": 17490 }, { "epoch": 2.605749180816205, "grad_norm": 0.6383386254310608, "learning_rate": 4.986048128714019e-05, "loss": 0.6157, "num_input_tokens_seen": 10152456, "step": 17495 }, { "epoch": 2.606493893357164, "grad_norm": 1.2476646900177002, "learning_rate": 4.986013826160631e-05, "loss": 0.6576, "num_input_tokens_seen": 10155304, "step": 17500 }, { "epoch": 2.6072386058981234, "grad_norm": 0.6217283010482788, "learning_rate": 4.985979481608567e-05, "loss": 0.7393, "num_input_tokens_seen": 10158312, "step": 17505 }, { "epoch": 2.6079833184390826, "grad_norm": 0.8193421363830566, "learning_rate": 4.9859450950584056e-05, "loss": 0.6794, "num_input_tokens_seen": 10161128, "step": 17510 }, { "epoch": 2.608728030980042, "grad_norm": 1.200469970703125, "learning_rate": 4.9859106665107294e-05, "loss": 0.6806, "num_input_tokens_seen": 10163880, "step": 17515 }, { "epoch": 2.609472743521001, "grad_norm": 0.6245138645172119, "learning_rate": 4.985876195966118e-05, "loss": 0.6335, "num_input_tokens_seen": 10166568, "step": 17520 }, { "epoch": 2.61021745606196, "grad_norm": 0.6880339980125427, "learning_rate": 4.985841683425155e-05, "loss": 0.7165, "num_input_tokens_seen": 10169512, "step": 17525 }, { "epoch": 2.6109621686029194, "grad_norm": 0.7730264067649841, "learning_rate": 4.9858071288884236e-05, "loss": 0.6868, "num_input_tokens_seen": 10172264, "step": 17530 }, { "epoch": 2.6117068811438786, "grad_norm": 1.2969330549240112, "learning_rate": 4.985772532356507e-05, "loss": 0.522, "num_input_tokens_seen": 10175112, "step": 17535 }, { "epoch": 2.612451593684838, "grad_norm": 0.873664915561676, "learning_rate": 4.9857378938299895e-05, "loss": 0.606, "num_input_tokens_seen": 10177928, "step": 17540 }, { "epoch": 2.613196306225797, "grad_norm": 0.8469653725624084, "learning_rate": 4.985703213309457e-05, "loss": 0.7152, "num_input_tokens_seen": 10180840, "step": 17545 }, { "epoch": 2.6139410187667558, "grad_norm": 0.6223414540290833, "learning_rate": 4.9856684907954955e-05, "loss": 0.5233, "num_input_tokens_seen": 10183624, "step": 17550 }, { "epoch": 2.6146857313077154, "grad_norm": 0.652326762676239, "learning_rate": 4.985633726288691e-05, "loss": 0.7012, "num_input_tokens_seen": 10186728, "step": 17555 }, { "epoch": 2.615430443848674, "grad_norm": 0.852985143661499, "learning_rate": 4.985598919789631e-05, "loss": 0.6804, "num_input_tokens_seen": 10189512, "step": 17560 }, { "epoch": 2.616175156389634, "grad_norm": 1.0206129550933838, "learning_rate": 4.9855640712989035e-05, "loss": 0.6537, "num_input_tokens_seen": 10192104, "step": 17565 }, { "epoch": 2.6169198689305926, "grad_norm": 0.8187028765678406, "learning_rate": 4.9855291808170966e-05, "loss": 0.6548, "num_input_tokens_seen": 10194568, "step": 17570 }, { "epoch": 2.6176645814715522, "grad_norm": 0.7093005776405334, "learning_rate": 4.985494248344801e-05, "loss": 0.5987, "num_input_tokens_seen": 10197224, "step": 17575 }, { "epoch": 2.618409294012511, "grad_norm": 0.7293959856033325, "learning_rate": 4.9854592738826054e-05, "loss": 0.7201, "num_input_tokens_seen": 10200168, "step": 17580 }, { "epoch": 2.6191540065534706, "grad_norm": 1.0282557010650635, "learning_rate": 4.985424257431103e-05, "loss": 0.6335, "num_input_tokens_seen": 10202792, "step": 17585 }, { "epoch": 2.6198987190944294, "grad_norm": 1.0774873495101929, "learning_rate": 4.985389198990883e-05, "loss": 0.7485, "num_input_tokens_seen": 10205640, "step": 17590 }, { "epoch": 2.6206434316353886, "grad_norm": 1.4837961196899414, "learning_rate": 4.985354098562538e-05, "loss": 0.6293, "num_input_tokens_seen": 10208200, "step": 17595 }, { "epoch": 2.621388144176348, "grad_norm": 1.0015795230865479, "learning_rate": 4.985318956146662e-05, "loss": 0.7014, "num_input_tokens_seen": 10210920, "step": 17600 }, { "epoch": 2.622132856717307, "grad_norm": 0.9356793165206909, "learning_rate": 4.9852837717438485e-05, "loss": 0.6945, "num_input_tokens_seen": 10213960, "step": 17605 }, { "epoch": 2.622877569258266, "grad_norm": 1.3603934049606323, "learning_rate": 4.985248545354692e-05, "loss": 0.7646, "num_input_tokens_seen": 10216616, "step": 17610 }, { "epoch": 2.6236222817992254, "grad_norm": 0.6376991271972656, "learning_rate": 4.985213276979785e-05, "loss": 0.6969, "num_input_tokens_seen": 10219336, "step": 17615 }, { "epoch": 2.6243669943401846, "grad_norm": 0.8766759634017944, "learning_rate": 4.985177966619727e-05, "loss": 0.4589, "num_input_tokens_seen": 10222120, "step": 17620 }, { "epoch": 2.625111706881144, "grad_norm": 1.1951478719711304, "learning_rate": 4.985142614275114e-05, "loss": 0.7773, "num_input_tokens_seen": 10224904, "step": 17625 }, { "epoch": 2.625856419422103, "grad_norm": 0.7245769500732422, "learning_rate": 4.985107219946541e-05, "loss": 0.6293, "num_input_tokens_seen": 10227816, "step": 17630 }, { "epoch": 2.626601131963062, "grad_norm": 0.8113626837730408, "learning_rate": 4.985071783634608e-05, "loss": 0.5525, "num_input_tokens_seen": 10230568, "step": 17635 }, { "epoch": 2.6273458445040214, "grad_norm": 1.113762617111206, "learning_rate": 4.985036305339913e-05, "loss": 0.6437, "num_input_tokens_seen": 10233960, "step": 17640 }, { "epoch": 2.6280905570449806, "grad_norm": 1.238325834274292, "learning_rate": 4.9850007850630545e-05, "loss": 0.6504, "num_input_tokens_seen": 10236776, "step": 17645 }, { "epoch": 2.62883526958594, "grad_norm": 1.2103971242904663, "learning_rate": 4.984965222804634e-05, "loss": 0.7872, "num_input_tokens_seen": 10240072, "step": 17650 }, { "epoch": 2.629579982126899, "grad_norm": 1.2062675952911377, "learning_rate": 4.984929618565252e-05, "loss": 0.6914, "num_input_tokens_seen": 10242920, "step": 17655 }, { "epoch": 2.6303246946678582, "grad_norm": 1.6798619031906128, "learning_rate": 4.9848939723455085e-05, "loss": 0.8051, "num_input_tokens_seen": 10246408, "step": 17660 }, { "epoch": 2.6310694072088174, "grad_norm": 0.8647260069847107, "learning_rate": 4.984858284146008e-05, "loss": 0.5737, "num_input_tokens_seen": 10249256, "step": 17665 }, { "epoch": 2.6318141197497766, "grad_norm": 1.1135236024856567, "learning_rate": 4.9848225539673513e-05, "loss": 0.6931, "num_input_tokens_seen": 10252264, "step": 17670 }, { "epoch": 2.632558832290736, "grad_norm": 0.6685647964477539, "learning_rate": 4.9847867818101436e-05, "loss": 0.7266, "num_input_tokens_seen": 10255304, "step": 17675 }, { "epoch": 2.633303544831695, "grad_norm": 1.159952998161316, "learning_rate": 4.984750967674989e-05, "loss": 0.7802, "num_input_tokens_seen": 10258120, "step": 17680 }, { "epoch": 2.6340482573726542, "grad_norm": 1.1044056415557861, "learning_rate": 4.9847151115624916e-05, "loss": 0.668, "num_input_tokens_seen": 10260872, "step": 17685 }, { "epoch": 2.6347929699136134, "grad_norm": 0.6552382111549377, "learning_rate": 4.984679213473258e-05, "loss": 0.7158, "num_input_tokens_seen": 10263752, "step": 17690 }, { "epoch": 2.6355376824545726, "grad_norm": 1.5935723781585693, "learning_rate": 4.984643273407894e-05, "loss": 0.6594, "num_input_tokens_seen": 10266824, "step": 17695 }, { "epoch": 2.636282394995532, "grad_norm": 0.8011772632598877, "learning_rate": 4.984607291367007e-05, "loss": 0.7895, "num_input_tokens_seen": 10269960, "step": 17700 }, { "epoch": 2.637027107536491, "grad_norm": 0.9105437994003296, "learning_rate": 4.984571267351206e-05, "loss": 0.7837, "num_input_tokens_seen": 10272968, "step": 17705 }, { "epoch": 2.6377718200774503, "grad_norm": 0.7724934816360474, "learning_rate": 4.984535201361098e-05, "loss": 0.7322, "num_input_tokens_seen": 10276104, "step": 17710 }, { "epoch": 2.6385165326184095, "grad_norm": 1.0983635187149048, "learning_rate": 4.984499093397294e-05, "loss": 0.6339, "num_input_tokens_seen": 10278888, "step": 17715 }, { "epoch": 2.6392612451593687, "grad_norm": 1.6497701406478882, "learning_rate": 4.984462943460402e-05, "loss": 0.7559, "num_input_tokens_seen": 10281832, "step": 17720 }, { "epoch": 2.6400059577003274, "grad_norm": 0.8638412952423096, "learning_rate": 4.984426751551033e-05, "loss": 0.6273, "num_input_tokens_seen": 10284680, "step": 17725 }, { "epoch": 2.640750670241287, "grad_norm": 1.312283992767334, "learning_rate": 4.9843905176698004e-05, "loss": 0.6848, "num_input_tokens_seen": 10287912, "step": 17730 }, { "epoch": 2.641495382782246, "grad_norm": 0.8423065543174744, "learning_rate": 4.984354241817314e-05, "loss": 0.6284, "num_input_tokens_seen": 10290856, "step": 17735 }, { "epoch": 2.6422400953232055, "grad_norm": 0.9425873160362244, "learning_rate": 4.984317923994188e-05, "loss": 0.7243, "num_input_tokens_seen": 10294024, "step": 17740 }, { "epoch": 2.6429848078641642, "grad_norm": 0.6359459757804871, "learning_rate": 4.984281564201036e-05, "loss": 0.6211, "num_input_tokens_seen": 10296904, "step": 17745 }, { "epoch": 2.643729520405124, "grad_norm": 0.8869771957397461, "learning_rate": 4.9842451624384715e-05, "loss": 0.7222, "num_input_tokens_seen": 10299656, "step": 17750 }, { "epoch": 2.6444742329460826, "grad_norm": 1.1998130083084106, "learning_rate": 4.98420871870711e-05, "loss": 0.6313, "num_input_tokens_seen": 10302568, "step": 17755 }, { "epoch": 2.645218945487042, "grad_norm": 1.0968433618545532, "learning_rate": 4.984172233007567e-05, "loss": 0.6671, "num_input_tokens_seen": 10305416, "step": 17760 }, { "epoch": 2.645963658028001, "grad_norm": 1.1488664150238037, "learning_rate": 4.984135705340459e-05, "loss": 0.723, "num_input_tokens_seen": 10308360, "step": 17765 }, { "epoch": 2.6467083705689602, "grad_norm": 0.7855119705200195, "learning_rate": 4.984099135706402e-05, "loss": 0.6192, "num_input_tokens_seen": 10311208, "step": 17770 }, { "epoch": 2.6474530831099194, "grad_norm": 0.6908847689628601, "learning_rate": 4.984062524106017e-05, "loss": 0.698, "num_input_tokens_seen": 10314280, "step": 17775 }, { "epoch": 2.6481977956508786, "grad_norm": 1.1370961666107178, "learning_rate": 4.984025870539919e-05, "loss": 0.5426, "num_input_tokens_seen": 10317384, "step": 17780 }, { "epoch": 2.648942508191838, "grad_norm": 0.7683045268058777, "learning_rate": 4.983989175008729e-05, "loss": 0.5164, "num_input_tokens_seen": 10320200, "step": 17785 }, { "epoch": 2.649687220732797, "grad_norm": 0.825821042060852, "learning_rate": 4.983952437513066e-05, "loss": 0.7827, "num_input_tokens_seen": 10323464, "step": 17790 }, { "epoch": 2.6504319332737563, "grad_norm": 1.4987200498580933, "learning_rate": 4.983915658053551e-05, "loss": 0.7425, "num_input_tokens_seen": 10326408, "step": 17795 }, { "epoch": 2.6511766458147155, "grad_norm": 1.3875229358673096, "learning_rate": 4.983878836630806e-05, "loss": 0.8374, "num_input_tokens_seen": 10329384, "step": 17800 }, { "epoch": 2.6519213583556747, "grad_norm": 1.106106162071228, "learning_rate": 4.983841973245452e-05, "loss": 0.6859, "num_input_tokens_seen": 10332520, "step": 17805 }, { "epoch": 2.652666070896634, "grad_norm": 1.1066327095031738, "learning_rate": 4.983805067898113e-05, "loss": 0.644, "num_input_tokens_seen": 10335496, "step": 17810 }, { "epoch": 2.653410783437593, "grad_norm": 1.190226674079895, "learning_rate": 4.983768120589411e-05, "loss": 0.5795, "num_input_tokens_seen": 10338376, "step": 17815 }, { "epoch": 2.6541554959785523, "grad_norm": 0.6277433633804321, "learning_rate": 4.983731131319972e-05, "loss": 0.7092, "num_input_tokens_seen": 10341288, "step": 17820 }, { "epoch": 2.6549002085195115, "grad_norm": 1.0667411088943481, "learning_rate": 4.98369410009042e-05, "loss": 0.6483, "num_input_tokens_seen": 10344552, "step": 17825 }, { "epoch": 2.6556449210604707, "grad_norm": 1.37337327003479, "learning_rate": 4.9836570269013796e-05, "loss": 0.8853, "num_input_tokens_seen": 10347528, "step": 17830 }, { "epoch": 2.65638963360143, "grad_norm": 0.7022108435630798, "learning_rate": 4.983619911753478e-05, "loss": 0.5979, "num_input_tokens_seen": 10350344, "step": 17835 }, { "epoch": 2.657134346142389, "grad_norm": 1.0488404035568237, "learning_rate": 4.983582754647343e-05, "loss": 0.7187, "num_input_tokens_seen": 10353256, "step": 17840 }, { "epoch": 2.6578790586833483, "grad_norm": 0.7620150446891785, "learning_rate": 4.983545555583601e-05, "loss": 0.785, "num_input_tokens_seen": 10356040, "step": 17845 }, { "epoch": 2.6586237712243075, "grad_norm": 0.716910719871521, "learning_rate": 4.9835083145628816e-05, "loss": 0.6786, "num_input_tokens_seen": 10359144, "step": 17850 }, { "epoch": 2.6593684837652667, "grad_norm": 0.7381061315536499, "learning_rate": 4.9834710315858125e-05, "loss": 0.7721, "num_input_tokens_seen": 10362440, "step": 17855 }, { "epoch": 2.660113196306226, "grad_norm": 2.1295359134674072, "learning_rate": 4.983433706653024e-05, "loss": 0.5832, "num_input_tokens_seen": 10365000, "step": 17860 }, { "epoch": 2.660857908847185, "grad_norm": 0.7766037583351135, "learning_rate": 4.9833963397651485e-05, "loss": 0.6839, "num_input_tokens_seen": 10368040, "step": 17865 }, { "epoch": 2.6616026213881443, "grad_norm": 0.6086010336875916, "learning_rate": 4.9833589309228154e-05, "loss": 0.7223, "num_input_tokens_seen": 10370792, "step": 17870 }, { "epoch": 2.6623473339291035, "grad_norm": 0.8536949157714844, "learning_rate": 4.9833214801266565e-05, "loss": 0.7241, "num_input_tokens_seen": 10373352, "step": 17875 }, { "epoch": 2.6630920464700627, "grad_norm": 1.1302309036254883, "learning_rate": 4.9832839873773054e-05, "loss": 0.6556, "num_input_tokens_seen": 10376296, "step": 17880 }, { "epoch": 2.663836759011022, "grad_norm": 1.551331639289856, "learning_rate": 4.983246452675395e-05, "loss": 0.7875, "num_input_tokens_seen": 10378984, "step": 17885 }, { "epoch": 2.6645814715519807, "grad_norm": 0.633415162563324, "learning_rate": 4.983208876021561e-05, "loss": 0.8668, "num_input_tokens_seen": 10382120, "step": 17890 }, { "epoch": 2.6653261840929403, "grad_norm": 0.9664000272750854, "learning_rate": 4.983171257416436e-05, "loss": 0.606, "num_input_tokens_seen": 10384872, "step": 17895 }, { "epoch": 2.666070896633899, "grad_norm": 0.7289049029350281, "learning_rate": 4.983133596860656e-05, "loss": 0.6671, "num_input_tokens_seen": 10387560, "step": 17900 }, { "epoch": 2.6668156091748587, "grad_norm": 1.0509437322616577, "learning_rate": 4.983095894354858e-05, "loss": 0.6557, "num_input_tokens_seen": 10390440, "step": 17905 }, { "epoch": 2.6675603217158175, "grad_norm": 1.7901251316070557, "learning_rate": 4.9830581498996784e-05, "loss": 0.6808, "num_input_tokens_seen": 10393672, "step": 17910 }, { "epoch": 2.668305034256777, "grad_norm": 0.9645302891731262, "learning_rate": 4.983020363495755e-05, "loss": 0.6695, "num_input_tokens_seen": 10396424, "step": 17915 }, { "epoch": 2.669049746797736, "grad_norm": 0.925798237323761, "learning_rate": 4.982982535143727e-05, "loss": 0.7096, "num_input_tokens_seen": 10399368, "step": 17920 }, { "epoch": 2.6697944593386955, "grad_norm": 0.8457698225975037, "learning_rate": 4.982944664844231e-05, "loss": 0.6443, "num_input_tokens_seen": 10402120, "step": 17925 }, { "epoch": 2.6705391718796543, "grad_norm": 1.0644288063049316, "learning_rate": 4.98290675259791e-05, "loss": 0.6614, "num_input_tokens_seen": 10404712, "step": 17930 }, { "epoch": 2.6712838844206135, "grad_norm": 0.831512987613678, "learning_rate": 4.9828687984054015e-05, "loss": 0.6276, "num_input_tokens_seen": 10407656, "step": 17935 }, { "epoch": 2.6720285969615727, "grad_norm": 1.05061936378479, "learning_rate": 4.9828308022673494e-05, "loss": 0.6932, "num_input_tokens_seen": 10410472, "step": 17940 }, { "epoch": 2.672773309502532, "grad_norm": 1.0909987688064575, "learning_rate": 4.9827927641843944e-05, "loss": 0.6617, "num_input_tokens_seen": 10413352, "step": 17945 }, { "epoch": 2.673518022043491, "grad_norm": 0.7140825986862183, "learning_rate": 4.982754684157178e-05, "loss": 0.533, "num_input_tokens_seen": 10416456, "step": 17950 }, { "epoch": 2.6742627345844503, "grad_norm": 1.5863453149795532, "learning_rate": 4.982716562186345e-05, "loss": 0.6875, "num_input_tokens_seen": 10419240, "step": 17955 }, { "epoch": 2.6750074471254095, "grad_norm": 1.9745436906814575, "learning_rate": 4.982678398272539e-05, "loss": 0.8244, "num_input_tokens_seen": 10422696, "step": 17960 }, { "epoch": 2.6757521596663687, "grad_norm": 1.031636118888855, "learning_rate": 4.982640192416404e-05, "loss": 0.6877, "num_input_tokens_seen": 10425672, "step": 17965 }, { "epoch": 2.676496872207328, "grad_norm": 0.7904567718505859, "learning_rate": 4.982601944618588e-05, "loss": 0.6031, "num_input_tokens_seen": 10428744, "step": 17970 }, { "epoch": 2.677241584748287, "grad_norm": 0.7930914759635925, "learning_rate": 4.982563654879734e-05, "loss": 0.6372, "num_input_tokens_seen": 10431816, "step": 17975 }, { "epoch": 2.6779862972892463, "grad_norm": 1.0455275774002075, "learning_rate": 4.982525323200491e-05, "loss": 0.648, "num_input_tokens_seen": 10434696, "step": 17980 }, { "epoch": 2.6787310098302055, "grad_norm": 0.6465650200843811, "learning_rate": 4.982486949581505e-05, "loss": 0.8121, "num_input_tokens_seen": 10437640, "step": 17985 }, { "epoch": 2.6794757223711647, "grad_norm": 1.2380497455596924, "learning_rate": 4.982448534023426e-05, "loss": 0.6672, "num_input_tokens_seen": 10440392, "step": 17990 }, { "epoch": 2.680220434912124, "grad_norm": 0.8044615983963013, "learning_rate": 4.982410076526901e-05, "loss": 0.6687, "num_input_tokens_seen": 10443368, "step": 17995 }, { "epoch": 2.680965147453083, "grad_norm": 1.0916239023208618, "learning_rate": 4.9823715770925814e-05, "loss": 0.6456, "num_input_tokens_seen": 10446216, "step": 18000 }, { "epoch": 2.6817098599940423, "grad_norm": 1.291125774383545, "learning_rate": 4.982333035721117e-05, "loss": 0.7046, "num_input_tokens_seen": 10448936, "step": 18005 }, { "epoch": 2.6824545725350015, "grad_norm": 0.94810551404953, "learning_rate": 4.982294452413159e-05, "loss": 0.6711, "num_input_tokens_seen": 10451880, "step": 18010 }, { "epoch": 2.6831992850759607, "grad_norm": 0.9972156286239624, "learning_rate": 4.982255827169359e-05, "loss": 0.6224, "num_input_tokens_seen": 10454792, "step": 18015 }, { "epoch": 2.68394399761692, "grad_norm": 0.841614305973053, "learning_rate": 4.982217159990369e-05, "loss": 0.5539, "num_input_tokens_seen": 10457480, "step": 18020 }, { "epoch": 2.684688710157879, "grad_norm": 0.9745431542396545, "learning_rate": 4.982178450876843e-05, "loss": 0.8064, "num_input_tokens_seen": 10460360, "step": 18025 }, { "epoch": 2.6854334226988383, "grad_norm": 0.8965339064598083, "learning_rate": 4.9821396998294356e-05, "loss": 0.6991, "num_input_tokens_seen": 10462920, "step": 18030 }, { "epoch": 2.6861781352397975, "grad_norm": 1.019454836845398, "learning_rate": 4.982100906848801e-05, "loss": 0.7518, "num_input_tokens_seen": 10465416, "step": 18035 }, { "epoch": 2.6869228477807567, "grad_norm": 0.748770534992218, "learning_rate": 4.9820620719355934e-05, "loss": 0.5687, "num_input_tokens_seen": 10468136, "step": 18040 }, { "epoch": 2.687667560321716, "grad_norm": 1.011470079421997, "learning_rate": 4.982023195090469e-05, "loss": 0.7079, "num_input_tokens_seen": 10470952, "step": 18045 }, { "epoch": 2.688412272862675, "grad_norm": 2.0586156845092773, "learning_rate": 4.981984276314087e-05, "loss": 0.7636, "num_input_tokens_seen": 10473832, "step": 18050 }, { "epoch": 2.6891569854036343, "grad_norm": 0.677452027797699, "learning_rate": 4.981945315607103e-05, "loss": 0.7383, "num_input_tokens_seen": 10476744, "step": 18055 }, { "epoch": 2.6899016979445936, "grad_norm": 1.006107211112976, "learning_rate": 4.981906312970175e-05, "loss": 0.4714, "num_input_tokens_seen": 10479432, "step": 18060 }, { "epoch": 2.6906464104855523, "grad_norm": 0.9007038474082947, "learning_rate": 4.981867268403962e-05, "loss": 0.6984, "num_input_tokens_seen": 10482152, "step": 18065 }, { "epoch": 2.691391123026512, "grad_norm": 1.041356086730957, "learning_rate": 4.981828181909124e-05, "loss": 0.675, "num_input_tokens_seen": 10485128, "step": 18070 }, { "epoch": 2.6921358355674707, "grad_norm": 0.600344181060791, "learning_rate": 4.981789053486322e-05, "loss": 0.6709, "num_input_tokens_seen": 10488296, "step": 18075 }, { "epoch": 2.6928805481084304, "grad_norm": 1.05772864818573, "learning_rate": 4.981749883136215e-05, "loss": 0.6185, "num_input_tokens_seen": 10490984, "step": 18080 }, { "epoch": 2.693625260649389, "grad_norm": 1.4542444944381714, "learning_rate": 4.981710670859467e-05, "loss": 0.6997, "num_input_tokens_seen": 10493832, "step": 18085 }, { "epoch": 2.6943699731903488, "grad_norm": 1.573060154914856, "learning_rate": 4.98167141665674e-05, "loss": 0.7252, "num_input_tokens_seen": 10496616, "step": 18090 }, { "epoch": 2.6951146857313075, "grad_norm": 1.0672788619995117, "learning_rate": 4.981632120528696e-05, "loss": 0.6322, "num_input_tokens_seen": 10499400, "step": 18095 }, { "epoch": 2.695859398272267, "grad_norm": 0.7605207562446594, "learning_rate": 4.981592782476e-05, "loss": 0.7546, "num_input_tokens_seen": 10502248, "step": 18100 }, { "epoch": 2.696604110813226, "grad_norm": 1.386034607887268, "learning_rate": 4.981553402499316e-05, "loss": 0.7644, "num_input_tokens_seen": 10504936, "step": 18105 }, { "epoch": 2.697348823354185, "grad_norm": 1.2898023128509521, "learning_rate": 4.9815139805993086e-05, "loss": 0.7605, "num_input_tokens_seen": 10507944, "step": 18110 }, { "epoch": 2.6980935358951443, "grad_norm": 1.0889594554901123, "learning_rate": 4.9814745167766455e-05, "loss": 0.7736, "num_input_tokens_seen": 10510696, "step": 18115 }, { "epoch": 2.6988382484361035, "grad_norm": 1.0049028396606445, "learning_rate": 4.981435011031992e-05, "loss": 0.6474, "num_input_tokens_seen": 10513576, "step": 18120 }, { "epoch": 2.6995829609770627, "grad_norm": 1.1832679510116577, "learning_rate": 4.9813954633660166e-05, "loss": 0.4929, "num_input_tokens_seen": 10516520, "step": 18125 }, { "epoch": 2.700327673518022, "grad_norm": 0.7144086956977844, "learning_rate": 4.9813558737793865e-05, "loss": 0.6026, "num_input_tokens_seen": 10519400, "step": 18130 }, { "epoch": 2.701072386058981, "grad_norm": 0.7129385471343994, "learning_rate": 4.9813162422727705e-05, "loss": 0.6713, "num_input_tokens_seen": 10522696, "step": 18135 }, { "epoch": 2.7018170985999403, "grad_norm": 0.7382401823997498, "learning_rate": 4.981276568846839e-05, "loss": 0.8041, "num_input_tokens_seen": 10525736, "step": 18140 }, { "epoch": 2.7025618111408996, "grad_norm": 0.9819509387016296, "learning_rate": 4.981236853502261e-05, "loss": 0.5554, "num_input_tokens_seen": 10528808, "step": 18145 }, { "epoch": 2.7033065236818588, "grad_norm": 1.218688726425171, "learning_rate": 4.9811970962397095e-05, "loss": 0.6823, "num_input_tokens_seen": 10531880, "step": 18150 }, { "epoch": 2.704051236222818, "grad_norm": 1.427365779876709, "learning_rate": 4.981157297059853e-05, "loss": 0.7516, "num_input_tokens_seen": 10534696, "step": 18155 }, { "epoch": 2.704795948763777, "grad_norm": 1.058807134628296, "learning_rate": 4.981117455963367e-05, "loss": 0.5781, "num_input_tokens_seen": 10537608, "step": 18160 }, { "epoch": 2.7055406613047364, "grad_norm": 0.6371505260467529, "learning_rate": 4.981077572950923e-05, "loss": 0.5888, "num_input_tokens_seen": 10540040, "step": 18165 }, { "epoch": 2.7062853738456956, "grad_norm": 0.8443830609321594, "learning_rate": 4.9810376480231944e-05, "loss": 0.6455, "num_input_tokens_seen": 10542632, "step": 18170 }, { "epoch": 2.7070300863866548, "grad_norm": 1.0668997764587402, "learning_rate": 4.980997681180858e-05, "loss": 0.4887, "num_input_tokens_seen": 10545512, "step": 18175 }, { "epoch": 2.707774798927614, "grad_norm": 0.8064216375350952, "learning_rate": 4.980957672424586e-05, "loss": 0.6342, "num_input_tokens_seen": 10548200, "step": 18180 }, { "epoch": 2.708519511468573, "grad_norm": 0.9882312417030334, "learning_rate": 4.980917621755056e-05, "loss": 0.667, "num_input_tokens_seen": 10551016, "step": 18185 }, { "epoch": 2.7092642240095324, "grad_norm": 1.769150733947754, "learning_rate": 4.9808775291729445e-05, "loss": 0.5636, "num_input_tokens_seen": 10553416, "step": 18190 }, { "epoch": 2.7100089365504916, "grad_norm": 1.5725972652435303, "learning_rate": 4.980837394678928e-05, "loss": 0.7324, "num_input_tokens_seen": 10556456, "step": 18195 }, { "epoch": 2.710753649091451, "grad_norm": 0.8286890387535095, "learning_rate": 4.980797218273685e-05, "loss": 0.5968, "num_input_tokens_seen": 10559528, "step": 18200 }, { "epoch": 2.71149836163241, "grad_norm": 1.0167828798294067, "learning_rate": 4.980756999957895e-05, "loss": 0.6487, "num_input_tokens_seen": 10562664, "step": 18205 }, { "epoch": 2.712243074173369, "grad_norm": 0.7873126268386841, "learning_rate": 4.9807167397322376e-05, "loss": 0.5515, "num_input_tokens_seen": 10565576, "step": 18210 }, { "epoch": 2.7129877867143284, "grad_norm": 1.135756492614746, "learning_rate": 4.980676437597391e-05, "loss": 0.6559, "num_input_tokens_seen": 10568360, "step": 18215 }, { "epoch": 2.7137324992552876, "grad_norm": 0.9083162546157837, "learning_rate": 4.980636093554038e-05, "loss": 0.7014, "num_input_tokens_seen": 10571432, "step": 18220 }, { "epoch": 2.714477211796247, "grad_norm": 0.7361985445022583, "learning_rate": 4.980595707602858e-05, "loss": 0.6127, "num_input_tokens_seen": 10574600, "step": 18225 }, { "epoch": 2.715221924337206, "grad_norm": 1.5282341241836548, "learning_rate": 4.980555279744535e-05, "loss": 0.8138, "num_input_tokens_seen": 10577672, "step": 18230 }, { "epoch": 2.715966636878165, "grad_norm": 0.9817619919776917, "learning_rate": 4.980514809979753e-05, "loss": 0.5534, "num_input_tokens_seen": 10580392, "step": 18235 }, { "epoch": 2.716711349419124, "grad_norm": 0.4827839434146881, "learning_rate": 4.9804742983091934e-05, "loss": 0.6134, "num_input_tokens_seen": 10583304, "step": 18240 }, { "epoch": 2.7174560619600836, "grad_norm": 1.074564814567566, "learning_rate": 4.9804337447335414e-05, "loss": 0.696, "num_input_tokens_seen": 10585960, "step": 18245 }, { "epoch": 2.7182007745010424, "grad_norm": 1.4232828617095947, "learning_rate": 4.980393149253483e-05, "loss": 0.5113, "num_input_tokens_seen": 10588648, "step": 18250 }, { "epoch": 2.718945487042002, "grad_norm": 0.8664072155952454, "learning_rate": 4.980352511869703e-05, "loss": 0.6348, "num_input_tokens_seen": 10591560, "step": 18255 }, { "epoch": 2.7196901995829608, "grad_norm": 0.9188960194587708, "learning_rate": 4.980311832582888e-05, "loss": 0.5956, "num_input_tokens_seen": 10594248, "step": 18260 }, { "epoch": 2.7204349121239204, "grad_norm": 1.5415489673614502, "learning_rate": 4.980271111393726e-05, "loss": 0.7174, "num_input_tokens_seen": 10597064, "step": 18265 }, { "epoch": 2.721179624664879, "grad_norm": 0.797696590423584, "learning_rate": 4.980230348302904e-05, "loss": 0.7508, "num_input_tokens_seen": 10599944, "step": 18270 }, { "epoch": 2.721924337205839, "grad_norm": 1.0790168046951294, "learning_rate": 4.9801895433111115e-05, "loss": 0.6261, "num_input_tokens_seen": 10602760, "step": 18275 }, { "epoch": 2.7226690497467976, "grad_norm": 1.15316903591156, "learning_rate": 4.9801486964190366e-05, "loss": 0.6463, "num_input_tokens_seen": 10606184, "step": 18280 }, { "epoch": 2.723413762287757, "grad_norm": 1.3265258073806763, "learning_rate": 4.9801078076273704e-05, "loss": 0.648, "num_input_tokens_seen": 10609064, "step": 18285 }, { "epoch": 2.724158474828716, "grad_norm": 1.4492822885513306, "learning_rate": 4.980066876936804e-05, "loss": 0.772, "num_input_tokens_seen": 10611912, "step": 18290 }, { "epoch": 2.724903187369675, "grad_norm": 1.436476707458496, "learning_rate": 4.980025904348028e-05, "loss": 0.6778, "num_input_tokens_seen": 10614984, "step": 18295 }, { "epoch": 2.7256478999106344, "grad_norm": 1.0732805728912354, "learning_rate": 4.979984889861735e-05, "loss": 0.7687, "num_input_tokens_seen": 10617640, "step": 18300 }, { "epoch": 2.7263926124515936, "grad_norm": 1.1439489126205444, "learning_rate": 4.9799438334786174e-05, "loss": 0.8449, "num_input_tokens_seen": 10620680, "step": 18305 }, { "epoch": 2.727137324992553, "grad_norm": 0.8266502618789673, "learning_rate": 4.97990273519937e-05, "loss": 0.7149, "num_input_tokens_seen": 10623464, "step": 18310 }, { "epoch": 2.727882037533512, "grad_norm": 1.1437749862670898, "learning_rate": 4.9798615950246855e-05, "loss": 0.7773, "num_input_tokens_seen": 10626216, "step": 18315 }, { "epoch": 2.728626750074471, "grad_norm": 1.4561805725097656, "learning_rate": 4.97982041295526e-05, "loss": 0.5907, "num_input_tokens_seen": 10629512, "step": 18320 }, { "epoch": 2.7293714626154304, "grad_norm": 1.495393991470337, "learning_rate": 4.97977918899179e-05, "loss": 0.7813, "num_input_tokens_seen": 10632264, "step": 18325 }, { "epoch": 2.7301161751563896, "grad_norm": 1.2464033365249634, "learning_rate": 4.97973792313497e-05, "loss": 0.754, "num_input_tokens_seen": 10635304, "step": 18330 }, { "epoch": 2.730860887697349, "grad_norm": 0.7829005718231201, "learning_rate": 4.979696615385499e-05, "loss": 0.8428, "num_input_tokens_seen": 10637960, "step": 18335 }, { "epoch": 2.731605600238308, "grad_norm": 1.5673465728759766, "learning_rate": 4.979655265744072e-05, "loss": 0.7277, "num_input_tokens_seen": 10640680, "step": 18340 }, { "epoch": 2.732350312779267, "grad_norm": 1.801218032836914, "learning_rate": 4.979613874211391e-05, "loss": 0.8411, "num_input_tokens_seen": 10643496, "step": 18345 }, { "epoch": 2.7330950253202264, "grad_norm": 0.9266399145126343, "learning_rate": 4.979572440788154e-05, "loss": 0.6659, "num_input_tokens_seen": 10646408, "step": 18350 }, { "epoch": 2.7338397378611856, "grad_norm": 1.0488383769989014, "learning_rate": 4.97953096547506e-05, "loss": 0.6941, "num_input_tokens_seen": 10649064, "step": 18355 }, { "epoch": 2.734584450402145, "grad_norm": 0.9555533528327942, "learning_rate": 4.9794894482728105e-05, "loss": 0.7116, "num_input_tokens_seen": 10651752, "step": 18360 }, { "epoch": 2.735329162943104, "grad_norm": 0.8825251460075378, "learning_rate": 4.979447889182107e-05, "loss": 0.7471, "num_input_tokens_seen": 10654376, "step": 18365 }, { "epoch": 2.7360738754840632, "grad_norm": 1.6132447719573975, "learning_rate": 4.979406288203651e-05, "loss": 0.6609, "num_input_tokens_seen": 10657448, "step": 18370 }, { "epoch": 2.7368185880250224, "grad_norm": 1.0798990726470947, "learning_rate": 4.979364645338146e-05, "loss": 0.7573, "num_input_tokens_seen": 10660264, "step": 18375 }, { "epoch": 2.7375633005659816, "grad_norm": 1.137964129447937, "learning_rate": 4.979322960586296e-05, "loss": 0.7127, "num_input_tokens_seen": 10663336, "step": 18380 }, { "epoch": 2.738308013106941, "grad_norm": 1.2580598592758179, "learning_rate": 4.979281233948803e-05, "loss": 0.7833, "num_input_tokens_seen": 10666216, "step": 18385 }, { "epoch": 2.7390527256479, "grad_norm": 0.5745387673377991, "learning_rate": 4.9792394654263744e-05, "loss": 0.54, "num_input_tokens_seen": 10669384, "step": 18390 }, { "epoch": 2.7397974381888592, "grad_norm": 0.7385257482528687, "learning_rate": 4.9791976550197144e-05, "loss": 0.5574, "num_input_tokens_seen": 10672232, "step": 18395 }, { "epoch": 2.7405421507298184, "grad_norm": 1.696683406829834, "learning_rate": 4.9791558027295296e-05, "loss": 0.6807, "num_input_tokens_seen": 10675112, "step": 18400 }, { "epoch": 2.7412868632707776, "grad_norm": 1.6522670984268188, "learning_rate": 4.9791139085565274e-05, "loss": 0.746, "num_input_tokens_seen": 10678088, "step": 18405 }, { "epoch": 2.742031575811737, "grad_norm": 0.891719400882721, "learning_rate": 4.9790719725014154e-05, "loss": 0.7424, "num_input_tokens_seen": 10681256, "step": 18410 }, { "epoch": 2.7427762883526956, "grad_norm": 0.6415848135948181, "learning_rate": 4.979029994564902e-05, "loss": 0.6209, "num_input_tokens_seen": 10684008, "step": 18415 }, { "epoch": 2.7435210008936552, "grad_norm": 0.8609561920166016, "learning_rate": 4.978987974747697e-05, "loss": 0.6916, "num_input_tokens_seen": 10686728, "step": 18420 }, { "epoch": 2.744265713434614, "grad_norm": 1.5918892621994019, "learning_rate": 4.9789459130505086e-05, "loss": 0.5711, "num_input_tokens_seen": 10689384, "step": 18425 }, { "epoch": 2.7450104259755737, "grad_norm": 1.7337048053741455, "learning_rate": 4.97890380947405e-05, "loss": 0.7653, "num_input_tokens_seen": 10692744, "step": 18430 }, { "epoch": 2.7457551385165324, "grad_norm": 1.4781519174575806, "learning_rate": 4.97886166401903e-05, "loss": 0.6145, "num_input_tokens_seen": 10695656, "step": 18435 }, { "epoch": 2.746499851057492, "grad_norm": 1.0229108333587646, "learning_rate": 4.978819476686162e-05, "loss": 0.6546, "num_input_tokens_seen": 10698472, "step": 18440 }, { "epoch": 2.747244563598451, "grad_norm": 0.9117957949638367, "learning_rate": 4.9787772474761575e-05, "loss": 0.6544, "num_input_tokens_seen": 10701640, "step": 18445 }, { "epoch": 2.7479892761394105, "grad_norm": 0.9419746994972229, "learning_rate": 4.978734976389732e-05, "loss": 0.8436, "num_input_tokens_seen": 10704392, "step": 18450 }, { "epoch": 2.7487339886803692, "grad_norm": 1.054561734199524, "learning_rate": 4.9786926634275964e-05, "loss": 0.6325, "num_input_tokens_seen": 10707080, "step": 18455 }, { "epoch": 2.7494787012213284, "grad_norm": 1.404160499572754, "learning_rate": 4.978650308590469e-05, "loss": 0.6827, "num_input_tokens_seen": 10709928, "step": 18460 }, { "epoch": 2.7502234137622876, "grad_norm": 1.1078072786331177, "learning_rate": 4.9786079118790635e-05, "loss": 0.746, "num_input_tokens_seen": 10713128, "step": 18465 }, { "epoch": 2.750968126303247, "grad_norm": 0.8049167394638062, "learning_rate": 4.9785654732940964e-05, "loss": 0.5421, "num_input_tokens_seen": 10716168, "step": 18470 }, { "epoch": 2.751712838844206, "grad_norm": 1.4747120141983032, "learning_rate": 4.9785229928362854e-05, "loss": 0.8468, "num_input_tokens_seen": 10718888, "step": 18475 }, { "epoch": 2.7524575513851652, "grad_norm": 1.0554862022399902, "learning_rate": 4.9784804705063465e-05, "loss": 0.7017, "num_input_tokens_seen": 10722056, "step": 18480 }, { "epoch": 2.7532022639261244, "grad_norm": 3.9648146629333496, "learning_rate": 4.978437906304999e-05, "loss": 0.6648, "num_input_tokens_seen": 10725064, "step": 18485 }, { "epoch": 2.7539469764670836, "grad_norm": 0.756855845451355, "learning_rate": 4.978395300232963e-05, "loss": 0.6583, "num_input_tokens_seen": 10728008, "step": 18490 }, { "epoch": 2.754691689008043, "grad_norm": 0.7320258021354675, "learning_rate": 4.978352652290956e-05, "loss": 0.7072, "num_input_tokens_seen": 10730824, "step": 18495 }, { "epoch": 2.755436401549002, "grad_norm": 0.9170675277709961, "learning_rate": 4.978309962479701e-05, "loss": 0.7423, "num_input_tokens_seen": 10733640, "step": 18500 }, { "epoch": 2.7561811140899612, "grad_norm": 0.6694431900978088, "learning_rate": 4.978267230799918e-05, "loss": 0.5668, "num_input_tokens_seen": 10736360, "step": 18505 }, { "epoch": 2.7569258266309205, "grad_norm": 1.3040555715560913, "learning_rate": 4.9782244572523284e-05, "loss": 0.6208, "num_input_tokens_seen": 10739272, "step": 18510 }, { "epoch": 2.7576705391718797, "grad_norm": 0.8774930238723755, "learning_rate": 4.978181641837656e-05, "loss": 0.6787, "num_input_tokens_seen": 10742184, "step": 18515 }, { "epoch": 2.758415251712839, "grad_norm": 0.6926966309547424, "learning_rate": 4.978138784556623e-05, "loss": 0.7316, "num_input_tokens_seen": 10745000, "step": 18520 }, { "epoch": 2.759159964253798, "grad_norm": 0.8450611233711243, "learning_rate": 4.9780958854099535e-05, "loss": 0.7414, "num_input_tokens_seen": 10747752, "step": 18525 }, { "epoch": 2.7599046767947573, "grad_norm": 0.6073600053787231, "learning_rate": 4.978052944398373e-05, "loss": 0.5488, "num_input_tokens_seen": 10750792, "step": 18530 }, { "epoch": 2.7606493893357165, "grad_norm": 1.0421587228775024, "learning_rate": 4.978009961522607e-05, "loss": 0.6973, "num_input_tokens_seen": 10753832, "step": 18535 }, { "epoch": 2.7613941018766757, "grad_norm": 1.003301739692688, "learning_rate": 4.9779669367833804e-05, "loss": 0.5907, "num_input_tokens_seen": 10756584, "step": 18540 }, { "epoch": 2.762138814417635, "grad_norm": 1.1410200595855713, "learning_rate": 4.9779238701814214e-05, "loss": 0.7544, "num_input_tokens_seen": 10759432, "step": 18545 }, { "epoch": 2.762883526958594, "grad_norm": 1.2199722528457642, "learning_rate": 4.977880761717457e-05, "loss": 0.8165, "num_input_tokens_seen": 10762472, "step": 18550 }, { "epoch": 2.7636282394995533, "grad_norm": 1.3083220720291138, "learning_rate": 4.977837611392216e-05, "loss": 0.9202, "num_input_tokens_seen": 10765256, "step": 18555 }, { "epoch": 2.7643729520405125, "grad_norm": 0.6713888049125671, "learning_rate": 4.9777944192064264e-05, "loss": 0.606, "num_input_tokens_seen": 10767880, "step": 18560 }, { "epoch": 2.7651176645814717, "grad_norm": 0.9572186470031738, "learning_rate": 4.9777511851608185e-05, "loss": 0.8354, "num_input_tokens_seen": 10770664, "step": 18565 }, { "epoch": 2.765862377122431, "grad_norm": 1.7819169759750366, "learning_rate": 4.9777079092561224e-05, "loss": 0.8332, "num_input_tokens_seen": 10773352, "step": 18570 }, { "epoch": 2.76660708966339, "grad_norm": 0.7831771969795227, "learning_rate": 4.97766459149307e-05, "loss": 0.7422, "num_input_tokens_seen": 10776360, "step": 18575 }, { "epoch": 2.7673518022043493, "grad_norm": 0.7630377411842346, "learning_rate": 4.977621231872392e-05, "loss": 0.6348, "num_input_tokens_seen": 10779336, "step": 18580 }, { "epoch": 2.7680965147453085, "grad_norm": 0.756471574306488, "learning_rate": 4.977577830394822e-05, "loss": 0.7318, "num_input_tokens_seen": 10782152, "step": 18585 }, { "epoch": 2.7688412272862672, "grad_norm": 1.8970810174942017, "learning_rate": 4.977534387061091e-05, "loss": 0.7982, "num_input_tokens_seen": 10785096, "step": 18590 }, { "epoch": 2.769585939827227, "grad_norm": 0.6108636260032654, "learning_rate": 4.977490901871936e-05, "loss": 0.7086, "num_input_tokens_seen": 10787944, "step": 18595 }, { "epoch": 2.7703306523681857, "grad_norm": 1.327327013015747, "learning_rate": 4.97744737482809e-05, "loss": 0.6678, "num_input_tokens_seen": 10791176, "step": 18600 }, { "epoch": 2.7710753649091453, "grad_norm": 0.9491395950317383, "learning_rate": 4.977403805930288e-05, "loss": 0.8051, "num_input_tokens_seen": 10793864, "step": 18605 }, { "epoch": 2.771820077450104, "grad_norm": 1.2954087257385254, "learning_rate": 4.977360195179268e-05, "loss": 0.6555, "num_input_tokens_seen": 10796328, "step": 18610 }, { "epoch": 2.7725647899910637, "grad_norm": 1.385331153869629, "learning_rate": 4.9773165425757646e-05, "loss": 0.7591, "num_input_tokens_seen": 10798984, "step": 18615 }, { "epoch": 2.7733095025320225, "grad_norm": 0.721187949180603, "learning_rate": 4.977272848120516e-05, "loss": 0.8332, "num_input_tokens_seen": 10801768, "step": 18620 }, { "epoch": 2.7740542150729817, "grad_norm": 0.7395844459533691, "learning_rate": 4.9772291118142604e-05, "loss": 0.709, "num_input_tokens_seen": 10804744, "step": 18625 }, { "epoch": 2.774798927613941, "grad_norm": 0.8710947036743164, "learning_rate": 4.9771853336577366e-05, "loss": 0.6856, "num_input_tokens_seen": 10807688, "step": 18630 }, { "epoch": 2.7755436401549, "grad_norm": 1.2304726839065552, "learning_rate": 4.9771415136516846e-05, "loss": 0.7145, "num_input_tokens_seen": 10810472, "step": 18635 }, { "epoch": 2.7762883526958593, "grad_norm": 1.1215736865997314, "learning_rate": 4.977097651796844e-05, "loss": 0.8114, "num_input_tokens_seen": 10813224, "step": 18640 }, { "epoch": 2.7770330652368185, "grad_norm": 0.6634880304336548, "learning_rate": 4.977053748093956e-05, "loss": 0.6596, "num_input_tokens_seen": 10815784, "step": 18645 }, { "epoch": 2.7777777777777777, "grad_norm": 0.7498555779457092, "learning_rate": 4.9770098025437634e-05, "loss": 0.6764, "num_input_tokens_seen": 10818792, "step": 18650 }, { "epoch": 2.778522490318737, "grad_norm": 0.8079434037208557, "learning_rate": 4.9769658151470075e-05, "loss": 0.5817, "num_input_tokens_seen": 10821800, "step": 18655 }, { "epoch": 2.779267202859696, "grad_norm": 0.6282769441604614, "learning_rate": 4.976921785904431e-05, "loss": 0.6744, "num_input_tokens_seen": 10824968, "step": 18660 }, { "epoch": 2.7800119154006553, "grad_norm": 0.6011875867843628, "learning_rate": 4.976877714816779e-05, "loss": 0.636, "num_input_tokens_seen": 10827912, "step": 18665 }, { "epoch": 2.7807566279416145, "grad_norm": 0.8969115018844604, "learning_rate": 4.976833601884795e-05, "loss": 0.5256, "num_input_tokens_seen": 10830728, "step": 18670 }, { "epoch": 2.7815013404825737, "grad_norm": 1.436140537261963, "learning_rate": 4.9767894471092246e-05, "loss": 0.7126, "num_input_tokens_seen": 10833544, "step": 18675 }, { "epoch": 2.782246053023533, "grad_norm": 0.8856062293052673, "learning_rate": 4.9767452504908143e-05, "loss": 0.6063, "num_input_tokens_seen": 10836296, "step": 18680 }, { "epoch": 2.782990765564492, "grad_norm": 0.8474796414375305, "learning_rate": 4.9767010120303094e-05, "loss": 0.7994, "num_input_tokens_seen": 10839336, "step": 18685 }, { "epoch": 2.7837354781054513, "grad_norm": 1.7695395946502686, "learning_rate": 4.9766567317284585e-05, "loss": 0.652, "num_input_tokens_seen": 10841864, "step": 18690 }, { "epoch": 2.7844801906464105, "grad_norm": 0.8728076219558716, "learning_rate": 4.976612409586009e-05, "loss": 0.694, "num_input_tokens_seen": 10844680, "step": 18695 }, { "epoch": 2.7852249031873697, "grad_norm": 0.8582123517990112, "learning_rate": 4.9765680456037106e-05, "loss": 0.6589, "num_input_tokens_seen": 10847368, "step": 18700 }, { "epoch": 2.785969615728329, "grad_norm": 0.8157289028167725, "learning_rate": 4.976523639782312e-05, "loss": 0.7274, "num_input_tokens_seen": 10850216, "step": 18705 }, { "epoch": 2.786714328269288, "grad_norm": 0.7078317403793335, "learning_rate": 4.976479192122563e-05, "loss": 0.7525, "num_input_tokens_seen": 10853032, "step": 18710 }, { "epoch": 2.7874590408102473, "grad_norm": 0.7347899675369263, "learning_rate": 4.9764347026252156e-05, "loss": 0.6952, "num_input_tokens_seen": 10856072, "step": 18715 }, { "epoch": 2.7882037533512065, "grad_norm": 0.7145355343818665, "learning_rate": 4.97639017129102e-05, "loss": 0.5947, "num_input_tokens_seen": 10858632, "step": 18720 }, { "epoch": 2.7889484658921657, "grad_norm": 0.9153357744216919, "learning_rate": 4.9763455981207305e-05, "loss": 0.7329, "num_input_tokens_seen": 10861704, "step": 18725 }, { "epoch": 2.789693178433125, "grad_norm": 0.7962161898612976, "learning_rate": 4.976300983115099e-05, "loss": 0.6932, "num_input_tokens_seen": 10864680, "step": 18730 }, { "epoch": 2.790437890974084, "grad_norm": 1.1252329349517822, "learning_rate": 4.976256326274878e-05, "loss": 0.7847, "num_input_tokens_seen": 10867720, "step": 18735 }, { "epoch": 2.7911826035150433, "grad_norm": 0.7612826824188232, "learning_rate": 4.976211627600823e-05, "loss": 0.6715, "num_input_tokens_seen": 10870408, "step": 18740 }, { "epoch": 2.7919273160560025, "grad_norm": 0.8330305218696594, "learning_rate": 4.976166887093691e-05, "loss": 0.5759, "num_input_tokens_seen": 10873320, "step": 18745 }, { "epoch": 2.7926720285969617, "grad_norm": 1.3385419845581055, "learning_rate": 4.976122104754235e-05, "loss": 0.9064, "num_input_tokens_seen": 10876552, "step": 18750 }, { "epoch": 2.7934167411379205, "grad_norm": 1.6025997400283813, "learning_rate": 4.976077280583212e-05, "loss": 0.771, "num_input_tokens_seen": 10879592, "step": 18755 }, { "epoch": 2.79416145367888, "grad_norm": 0.6354577541351318, "learning_rate": 4.9760324145813806e-05, "loss": 0.7136, "num_input_tokens_seen": 10882472, "step": 18760 }, { "epoch": 2.794906166219839, "grad_norm": 3.6644575595855713, "learning_rate": 4.975987506749499e-05, "loss": 0.8895, "num_input_tokens_seen": 10885352, "step": 18765 }, { "epoch": 2.7956508787607985, "grad_norm": 1.2683194875717163, "learning_rate": 4.975942557088324e-05, "loss": 0.6884, "num_input_tokens_seen": 10888488, "step": 18770 }, { "epoch": 2.7963955913017573, "grad_norm": 0.7667339444160461, "learning_rate": 4.9758975655986164e-05, "loss": 0.7677, "num_input_tokens_seen": 10891304, "step": 18775 }, { "epoch": 2.797140303842717, "grad_norm": 1.0477408170700073, "learning_rate": 4.975852532281135e-05, "loss": 0.7055, "num_input_tokens_seen": 10893928, "step": 18780 }, { "epoch": 2.7978850163836757, "grad_norm": 0.879615843296051, "learning_rate": 4.975807457136642e-05, "loss": 0.6533, "num_input_tokens_seen": 10896936, "step": 18785 }, { "epoch": 2.7986297289246354, "grad_norm": 0.5131047368049622, "learning_rate": 4.975762340165898e-05, "loss": 0.7979, "num_input_tokens_seen": 10899880, "step": 18790 }, { "epoch": 2.799374441465594, "grad_norm": 1.8244351148605347, "learning_rate": 4.975717181369666e-05, "loss": 0.7388, "num_input_tokens_seen": 10903016, "step": 18795 }, { "epoch": 2.8001191540065533, "grad_norm": 1.594252109527588, "learning_rate": 4.9756719807487076e-05, "loss": 0.6802, "num_input_tokens_seen": 10905960, "step": 18800 }, { "epoch": 2.8008638665475125, "grad_norm": 1.3282593488693237, "learning_rate": 4.975626738303788e-05, "loss": 0.5771, "num_input_tokens_seen": 10908808, "step": 18805 }, { "epoch": 2.8016085790884717, "grad_norm": 0.8858806490898132, "learning_rate": 4.975581454035671e-05, "loss": 0.711, "num_input_tokens_seen": 10911720, "step": 18810 }, { "epoch": 2.802353291629431, "grad_norm": 0.7770121097564697, "learning_rate": 4.975536127945121e-05, "loss": 0.621, "num_input_tokens_seen": 10914920, "step": 18815 }, { "epoch": 2.80309800417039, "grad_norm": 0.5498865246772766, "learning_rate": 4.975490760032904e-05, "loss": 0.6164, "num_input_tokens_seen": 10917832, "step": 18820 }, { "epoch": 2.8038427167113493, "grad_norm": 0.7684054374694824, "learning_rate": 4.975445350299787e-05, "loss": 0.6445, "num_input_tokens_seen": 10920616, "step": 18825 }, { "epoch": 2.8045874292523085, "grad_norm": 0.8707605004310608, "learning_rate": 4.975399898746536e-05, "loss": 0.6516, "num_input_tokens_seen": 10923656, "step": 18830 }, { "epoch": 2.8053321417932677, "grad_norm": 0.6306900978088379, "learning_rate": 4.9753544053739197e-05, "loss": 0.7085, "num_input_tokens_seen": 10926664, "step": 18835 }, { "epoch": 2.806076854334227, "grad_norm": 0.9480266571044922, "learning_rate": 4.975308870182707e-05, "loss": 0.648, "num_input_tokens_seen": 10929576, "step": 18840 }, { "epoch": 2.806821566875186, "grad_norm": 0.7433620095252991, "learning_rate": 4.9752632931736665e-05, "loss": 0.7206, "num_input_tokens_seen": 10932424, "step": 18845 }, { "epoch": 2.8075662794161453, "grad_norm": 0.902256965637207, "learning_rate": 4.9752176743475684e-05, "loss": 0.7117, "num_input_tokens_seen": 10935208, "step": 18850 }, { "epoch": 2.8083109919571045, "grad_norm": 0.9538919925689697, "learning_rate": 4.9751720137051836e-05, "loss": 0.6754, "num_input_tokens_seen": 10938024, "step": 18855 }, { "epoch": 2.8090557044980637, "grad_norm": 0.7242757678031921, "learning_rate": 4.9751263112472834e-05, "loss": 0.6939, "num_input_tokens_seen": 10941160, "step": 18860 }, { "epoch": 2.809800417039023, "grad_norm": 0.9733222126960754, "learning_rate": 4.9750805669746395e-05, "loss": 0.6012, "num_input_tokens_seen": 10943784, "step": 18865 }, { "epoch": 2.810545129579982, "grad_norm": 0.7618296146392822, "learning_rate": 4.975034780888025e-05, "loss": 0.8305, "num_input_tokens_seen": 10946792, "step": 18870 }, { "epoch": 2.8112898421209414, "grad_norm": 1.0829960107803345, "learning_rate": 4.9749889529882134e-05, "loss": 0.6883, "num_input_tokens_seen": 10949576, "step": 18875 }, { "epoch": 2.8120345546619006, "grad_norm": 0.7023564577102661, "learning_rate": 4.974943083275979e-05, "loss": 0.7994, "num_input_tokens_seen": 10952104, "step": 18880 }, { "epoch": 2.8127792672028598, "grad_norm": 1.303881049156189, "learning_rate": 4.974897171752097e-05, "loss": 0.6871, "num_input_tokens_seen": 10954888, "step": 18885 }, { "epoch": 2.813523979743819, "grad_norm": 0.6374669671058655, "learning_rate": 4.9748512184173416e-05, "loss": 0.582, "num_input_tokens_seen": 10957768, "step": 18890 }, { "epoch": 2.814268692284778, "grad_norm": 1.0252939462661743, "learning_rate": 4.9748052232724905e-05, "loss": 0.6428, "num_input_tokens_seen": 10960488, "step": 18895 }, { "epoch": 2.8150134048257374, "grad_norm": 0.6019003987312317, "learning_rate": 4.974759186318321e-05, "loss": 0.6368, "num_input_tokens_seen": 10963464, "step": 18900 }, { "epoch": 2.8157581173666966, "grad_norm": 0.937027633190155, "learning_rate": 4.97471310755561e-05, "loss": 0.6102, "num_input_tokens_seen": 10966376, "step": 18905 }, { "epoch": 2.8165028299076558, "grad_norm": 1.0015573501586914, "learning_rate": 4.974666986985136e-05, "loss": 0.9792, "num_input_tokens_seen": 10969224, "step": 18910 }, { "epoch": 2.817247542448615, "grad_norm": 0.7039412260055542, "learning_rate": 4.974620824607679e-05, "loss": 0.5756, "num_input_tokens_seen": 10971944, "step": 18915 }, { "epoch": 2.817992254989574, "grad_norm": 0.5206055641174316, "learning_rate": 4.9745746204240175e-05, "loss": 0.7332, "num_input_tokens_seen": 10975016, "step": 18920 }, { "epoch": 2.8187369675305334, "grad_norm": 0.8511110544204712, "learning_rate": 4.974528374434934e-05, "loss": 0.6838, "num_input_tokens_seen": 10977672, "step": 18925 }, { "epoch": 2.819481680071492, "grad_norm": 1.2671270370483398, "learning_rate": 4.974482086641207e-05, "loss": 0.7086, "num_input_tokens_seen": 10980360, "step": 18930 }, { "epoch": 2.820226392612452, "grad_norm": 0.822178065776825, "learning_rate": 4.974435757043621e-05, "loss": 0.6356, "num_input_tokens_seen": 10983272, "step": 18935 }, { "epoch": 2.8209711051534105, "grad_norm": 0.9603022336959839, "learning_rate": 4.974389385642958e-05, "loss": 0.7317, "num_input_tokens_seen": 10985992, "step": 18940 }, { "epoch": 2.82171581769437, "grad_norm": 1.973262906074524, "learning_rate": 4.9743429724400007e-05, "loss": 0.7216, "num_input_tokens_seen": 10988872, "step": 18945 }, { "epoch": 2.822460530235329, "grad_norm": 0.7107555866241455, "learning_rate": 4.974296517435534e-05, "loss": 0.6923, "num_input_tokens_seen": 10991784, "step": 18950 }, { "epoch": 2.8232052427762886, "grad_norm": 1.9977527856826782, "learning_rate": 4.974250020630342e-05, "loss": 0.7528, "num_input_tokens_seen": 10994728, "step": 18955 }, { "epoch": 2.8239499553172474, "grad_norm": 0.6610994935035706, "learning_rate": 4.9742034820252116e-05, "loss": 0.7358, "num_input_tokens_seen": 10997576, "step": 18960 }, { "epoch": 2.824694667858207, "grad_norm": 1.2423739433288574, "learning_rate": 4.974156901620927e-05, "loss": 0.6161, "num_input_tokens_seen": 11000168, "step": 18965 }, { "epoch": 2.8254393803991658, "grad_norm": 1.1801179647445679, "learning_rate": 4.974110279418277e-05, "loss": 0.684, "num_input_tokens_seen": 11002984, "step": 18970 }, { "epoch": 2.826184092940125, "grad_norm": 0.7957686185836792, "learning_rate": 4.9740636154180476e-05, "loss": 0.7378, "num_input_tokens_seen": 11005768, "step": 18975 }, { "epoch": 2.826928805481084, "grad_norm": 0.9645434617996216, "learning_rate": 4.974016909621029e-05, "loss": 0.7248, "num_input_tokens_seen": 11008744, "step": 18980 }, { "epoch": 2.8276735180220434, "grad_norm": 0.7507906556129456, "learning_rate": 4.9739701620280076e-05, "loss": 0.7991, "num_input_tokens_seen": 11011720, "step": 18985 }, { "epoch": 2.8284182305630026, "grad_norm": 1.0008575916290283, "learning_rate": 4.973923372639776e-05, "loss": 0.6443, "num_input_tokens_seen": 11014312, "step": 18990 }, { "epoch": 2.8291629431039618, "grad_norm": 0.7120854258537292, "learning_rate": 4.973876541457123e-05, "loss": 0.6628, "num_input_tokens_seen": 11017128, "step": 18995 }, { "epoch": 2.829907655644921, "grad_norm": 0.8821698427200317, "learning_rate": 4.97382966848084e-05, "loss": 0.7137, "num_input_tokens_seen": 11020072, "step": 19000 }, { "epoch": 2.83065236818588, "grad_norm": 0.7082727551460266, "learning_rate": 4.9737827537117196e-05, "loss": 0.6206, "num_input_tokens_seen": 11023080, "step": 19005 }, { "epoch": 2.8313970807268394, "grad_norm": 0.852128803730011, "learning_rate": 4.973735797150553e-05, "loss": 0.6636, "num_input_tokens_seen": 11025960, "step": 19010 }, { "epoch": 2.8321417932677986, "grad_norm": 0.8068310022354126, "learning_rate": 4.973688798798135e-05, "loss": 0.7303, "num_input_tokens_seen": 11028488, "step": 19015 }, { "epoch": 2.832886505808758, "grad_norm": 0.8683531284332275, "learning_rate": 4.973641758655259e-05, "loss": 0.8198, "num_input_tokens_seen": 11031464, "step": 19020 }, { "epoch": 2.833631218349717, "grad_norm": 1.76139235496521, "learning_rate": 4.973594676722719e-05, "loss": 0.7055, "num_input_tokens_seen": 11034440, "step": 19025 }, { "epoch": 2.834375930890676, "grad_norm": 1.0152090787887573, "learning_rate": 4.973547553001311e-05, "loss": 0.8046, "num_input_tokens_seen": 11037384, "step": 19030 }, { "epoch": 2.8351206434316354, "grad_norm": 0.7600873112678528, "learning_rate": 4.9735003874918314e-05, "loss": 0.5763, "num_input_tokens_seen": 11040392, "step": 19035 }, { "epoch": 2.8358653559725946, "grad_norm": 0.7318819761276245, "learning_rate": 4.9734531801950765e-05, "loss": 0.8304, "num_input_tokens_seen": 11043528, "step": 19040 }, { "epoch": 2.836610068513554, "grad_norm": 0.7203708291053772, "learning_rate": 4.9734059311118444e-05, "loss": 0.6041, "num_input_tokens_seen": 11046216, "step": 19045 }, { "epoch": 2.837354781054513, "grad_norm": 0.9816778302192688, "learning_rate": 4.973358640242932e-05, "loss": 0.7137, "num_input_tokens_seen": 11049128, "step": 19050 }, { "epoch": 2.838099493595472, "grad_norm": 0.760685384273529, "learning_rate": 4.97331130758914e-05, "loss": 0.6464, "num_input_tokens_seen": 11051976, "step": 19055 }, { "epoch": 2.8388442061364314, "grad_norm": 2.2329728603363037, "learning_rate": 4.9732639331512675e-05, "loss": 0.7748, "num_input_tokens_seen": 11054888, "step": 19060 }, { "epoch": 2.8395889186773906, "grad_norm": 0.7404086589813232, "learning_rate": 4.973216516930114e-05, "loss": 0.6086, "num_input_tokens_seen": 11057896, "step": 19065 }, { "epoch": 2.84033363121835, "grad_norm": 1.0429681539535522, "learning_rate": 4.973169058926481e-05, "loss": 0.7411, "num_input_tokens_seen": 11060808, "step": 19070 }, { "epoch": 2.841078343759309, "grad_norm": 0.7436699271202087, "learning_rate": 4.973121559141171e-05, "loss": 0.751, "num_input_tokens_seen": 11063592, "step": 19075 }, { "epoch": 2.841823056300268, "grad_norm": 0.8703271746635437, "learning_rate": 4.9730740175749854e-05, "loss": 0.7721, "num_input_tokens_seen": 11066376, "step": 19080 }, { "epoch": 2.8425677688412274, "grad_norm": 0.5041348338127136, "learning_rate": 4.973026434228728e-05, "loss": 0.6214, "num_input_tokens_seen": 11069160, "step": 19085 }, { "epoch": 2.8433124813821866, "grad_norm": 1.110693335533142, "learning_rate": 4.972978809103202e-05, "loss": 0.6809, "num_input_tokens_seen": 11072136, "step": 19090 }, { "epoch": 2.844057193923146, "grad_norm": 0.8048547506332397, "learning_rate": 4.972931142199213e-05, "loss": 0.8032, "num_input_tokens_seen": 11075368, "step": 19095 }, { "epoch": 2.844801906464105, "grad_norm": 0.7875704765319824, "learning_rate": 4.972883433517566e-05, "loss": 0.6639, "num_input_tokens_seen": 11078312, "step": 19100 }, { "epoch": 2.845546619005064, "grad_norm": 0.708801805973053, "learning_rate": 4.972835683059065e-05, "loss": 0.6693, "num_input_tokens_seen": 11081288, "step": 19105 }, { "epoch": 2.8462913315460234, "grad_norm": 0.7380319237709045, "learning_rate": 4.97278789082452e-05, "loss": 0.6011, "num_input_tokens_seen": 11084232, "step": 19110 }, { "epoch": 2.847036044086982, "grad_norm": 0.6187740564346313, "learning_rate": 4.9727400568147364e-05, "loss": 0.6769, "num_input_tokens_seen": 11087240, "step": 19115 }, { "epoch": 2.847780756627942, "grad_norm": 1.0470519065856934, "learning_rate": 4.972692181030523e-05, "loss": 0.8533, "num_input_tokens_seen": 11090184, "step": 19120 }, { "epoch": 2.8485254691689006, "grad_norm": 1.297800898551941, "learning_rate": 4.972644263472688e-05, "loss": 0.7699, "num_input_tokens_seen": 11093032, "step": 19125 }, { "epoch": 2.8492701817098602, "grad_norm": 0.7180755734443665, "learning_rate": 4.972596304142041e-05, "loss": 0.6391, "num_input_tokens_seen": 11095848, "step": 19130 }, { "epoch": 2.850014894250819, "grad_norm": 0.7868127822875977, "learning_rate": 4.9725483030393924e-05, "loss": 0.6566, "num_input_tokens_seen": 11099016, "step": 19135 }, { "epoch": 2.8507596067917786, "grad_norm": 1.8708406686782837, "learning_rate": 4.972500260165555e-05, "loss": 0.8374, "num_input_tokens_seen": 11102472, "step": 19140 }, { "epoch": 2.8515043193327374, "grad_norm": 0.49218201637268066, "learning_rate": 4.972452175521337e-05, "loss": 0.7064, "num_input_tokens_seen": 11105608, "step": 19145 }, { "epoch": 2.8522490318736966, "grad_norm": 0.706404447555542, "learning_rate": 4.972404049107552e-05, "loss": 0.7059, "num_input_tokens_seen": 11108392, "step": 19150 }, { "epoch": 2.852993744414656, "grad_norm": 1.7265605926513672, "learning_rate": 4.972355880925014e-05, "loss": 0.7167, "num_input_tokens_seen": 11111496, "step": 19155 }, { "epoch": 2.853738456955615, "grad_norm": 0.6727153658866882, "learning_rate": 4.9723076709745365e-05, "loss": 0.7012, "num_input_tokens_seen": 11114216, "step": 19160 }, { "epoch": 2.854483169496574, "grad_norm": 0.7700799703598022, "learning_rate": 4.972259419256933e-05, "loss": 0.6522, "num_input_tokens_seen": 11117448, "step": 19165 }, { "epoch": 2.8552278820375334, "grad_norm": 0.6383696794509888, "learning_rate": 4.97221112577302e-05, "loss": 0.7061, "num_input_tokens_seen": 11120296, "step": 19170 }, { "epoch": 2.8559725945784926, "grad_norm": 0.923239529132843, "learning_rate": 4.972162790523612e-05, "loss": 0.7604, "num_input_tokens_seen": 11123208, "step": 19175 }, { "epoch": 2.856717307119452, "grad_norm": 0.7354596853256226, "learning_rate": 4.9721144135095265e-05, "loss": 0.5359, "num_input_tokens_seen": 11126056, "step": 19180 }, { "epoch": 2.857462019660411, "grad_norm": 1.000690221786499, "learning_rate": 4.9720659947315815e-05, "loss": 0.7115, "num_input_tokens_seen": 11128744, "step": 19185 }, { "epoch": 2.8582067322013702, "grad_norm": 1.4347131252288818, "learning_rate": 4.972017534190593e-05, "loss": 0.7121, "num_input_tokens_seen": 11131464, "step": 19190 }, { "epoch": 2.8589514447423294, "grad_norm": 0.8197643160820007, "learning_rate": 4.971969031887381e-05, "loss": 0.6311, "num_input_tokens_seen": 11134344, "step": 19195 }, { "epoch": 2.8596961572832886, "grad_norm": 0.915516197681427, "learning_rate": 4.971920487822764e-05, "loss": 0.7721, "num_input_tokens_seen": 11137032, "step": 19200 }, { "epoch": 2.860440869824248, "grad_norm": 0.6351513266563416, "learning_rate": 4.971871901997563e-05, "loss": 0.8435, "num_input_tokens_seen": 11140040, "step": 19205 }, { "epoch": 2.861185582365207, "grad_norm": 0.727258026599884, "learning_rate": 4.9718232744125995e-05, "loss": 0.7072, "num_input_tokens_seen": 11143048, "step": 19210 }, { "epoch": 2.8619302949061662, "grad_norm": 0.669451892375946, "learning_rate": 4.9717746050686925e-05, "loss": 0.7705, "num_input_tokens_seen": 11145768, "step": 19215 }, { "epoch": 2.8626750074471254, "grad_norm": 0.9086239337921143, "learning_rate": 4.9717258939666663e-05, "loss": 0.7599, "num_input_tokens_seen": 11148424, "step": 19220 }, { "epoch": 2.8634197199880846, "grad_norm": 0.9942346811294556, "learning_rate": 4.9716771411073436e-05, "loss": 0.6726, "num_input_tokens_seen": 11151272, "step": 19225 }, { "epoch": 2.864164432529044, "grad_norm": 0.7329925298690796, "learning_rate": 4.9716283464915484e-05, "loss": 0.5996, "num_input_tokens_seen": 11154440, "step": 19230 }, { "epoch": 2.864909145070003, "grad_norm": 1.5759124755859375, "learning_rate": 4.9715795101201025e-05, "loss": 0.8514, "num_input_tokens_seen": 11157384, "step": 19235 }, { "epoch": 2.8656538576109623, "grad_norm": 1.1540251970291138, "learning_rate": 4.9715306319938335e-05, "loss": 0.7399, "num_input_tokens_seen": 11160200, "step": 19240 }, { "epoch": 2.8663985701519215, "grad_norm": 0.8305284976959229, "learning_rate": 4.971481712113567e-05, "loss": 0.6163, "num_input_tokens_seen": 11163208, "step": 19245 }, { "epoch": 2.8671432826928807, "grad_norm": 0.8421934247016907, "learning_rate": 4.9714327504801286e-05, "loss": 0.612, "num_input_tokens_seen": 11165736, "step": 19250 }, { "epoch": 2.86788799523384, "grad_norm": 0.7428117394447327, "learning_rate": 4.971383747094346e-05, "loss": 0.648, "num_input_tokens_seen": 11168488, "step": 19255 }, { "epoch": 2.868632707774799, "grad_norm": 0.9344428181648254, "learning_rate": 4.9713347019570465e-05, "loss": 0.6947, "num_input_tokens_seen": 11171496, "step": 19260 }, { "epoch": 2.8693774203157583, "grad_norm": 1.0203959941864014, "learning_rate": 4.971285615069059e-05, "loss": 0.6969, "num_input_tokens_seen": 11174568, "step": 19265 }, { "epoch": 2.8701221328567175, "grad_norm": 0.811364471912384, "learning_rate": 4.9712364864312125e-05, "loss": 0.6049, "num_input_tokens_seen": 11177544, "step": 19270 }, { "epoch": 2.8708668453976767, "grad_norm": 0.8661707639694214, "learning_rate": 4.9711873160443375e-05, "loss": 0.6701, "num_input_tokens_seen": 11180200, "step": 19275 }, { "epoch": 2.8716115579386354, "grad_norm": 0.7548487782478333, "learning_rate": 4.971138103909264e-05, "loss": 0.8331, "num_input_tokens_seen": 11183048, "step": 19280 }, { "epoch": 2.872356270479595, "grad_norm": 0.5975167155265808, "learning_rate": 4.9710888500268236e-05, "loss": 0.6481, "num_input_tokens_seen": 11185768, "step": 19285 }, { "epoch": 2.873100983020554, "grad_norm": 0.7588370442390442, "learning_rate": 4.9710395543978495e-05, "loss": 0.6568, "num_input_tokens_seen": 11188776, "step": 19290 }, { "epoch": 2.8738456955615135, "grad_norm": 0.8120064735412598, "learning_rate": 4.970990217023173e-05, "loss": 0.723, "num_input_tokens_seen": 11191528, "step": 19295 }, { "epoch": 2.8745904081024722, "grad_norm": 0.9505169987678528, "learning_rate": 4.9709408379036284e-05, "loss": 0.6213, "num_input_tokens_seen": 11194312, "step": 19300 }, { "epoch": 2.875335120643432, "grad_norm": 0.7776005864143372, "learning_rate": 4.97089141704005e-05, "loss": 0.7533, "num_input_tokens_seen": 11197320, "step": 19305 }, { "epoch": 2.8760798331843906, "grad_norm": 0.8398424983024597, "learning_rate": 4.970841954433272e-05, "loss": 0.8138, "num_input_tokens_seen": 11200360, "step": 19310 }, { "epoch": 2.8768245457253503, "grad_norm": 0.8436309099197388, "learning_rate": 4.97079245008413e-05, "loss": 0.6352, "num_input_tokens_seen": 11202952, "step": 19315 }, { "epoch": 2.877569258266309, "grad_norm": 1.1913648843765259, "learning_rate": 4.970742903993462e-05, "loss": 0.6554, "num_input_tokens_seen": 11206056, "step": 19320 }, { "epoch": 2.8783139708072683, "grad_norm": 0.8206289410591125, "learning_rate": 4.970693316162103e-05, "loss": 0.6832, "num_input_tokens_seen": 11208904, "step": 19325 }, { "epoch": 2.8790586833482275, "grad_norm": 1.0242316722869873, "learning_rate": 4.9706436865908915e-05, "loss": 0.6991, "num_input_tokens_seen": 11211944, "step": 19330 }, { "epoch": 2.8798033958891867, "grad_norm": 0.750972330570221, "learning_rate": 4.970594015280665e-05, "loss": 0.5794, "num_input_tokens_seen": 11214856, "step": 19335 }, { "epoch": 2.880548108430146, "grad_norm": 1.1167809963226318, "learning_rate": 4.970544302232265e-05, "loss": 0.6905, "num_input_tokens_seen": 11217832, "step": 19340 }, { "epoch": 2.881292820971105, "grad_norm": 0.8678686022758484, "learning_rate": 4.97049454744653e-05, "loss": 0.6683, "num_input_tokens_seen": 11220520, "step": 19345 }, { "epoch": 2.8820375335120643, "grad_norm": 0.7680957317352295, "learning_rate": 4.9704447509243e-05, "loss": 0.5997, "num_input_tokens_seen": 11223432, "step": 19350 }, { "epoch": 2.8827822460530235, "grad_norm": 1.0555596351623535, "learning_rate": 4.970394912666416e-05, "loss": 0.694, "num_input_tokens_seen": 11226024, "step": 19355 }, { "epoch": 2.8835269585939827, "grad_norm": 0.6763259172439575, "learning_rate": 4.970345032673722e-05, "loss": 0.6302, "num_input_tokens_seen": 11229160, "step": 19360 }, { "epoch": 2.884271671134942, "grad_norm": 0.8139277100563049, "learning_rate": 4.97029511094706e-05, "loss": 0.7313, "num_input_tokens_seen": 11232168, "step": 19365 }, { "epoch": 2.885016383675901, "grad_norm": 1.3810876607894897, "learning_rate": 4.970245147487271e-05, "loss": 0.6195, "num_input_tokens_seen": 11235080, "step": 19370 }, { "epoch": 2.8857610962168603, "grad_norm": 0.8186101913452148, "learning_rate": 4.970195142295202e-05, "loss": 0.6566, "num_input_tokens_seen": 11237992, "step": 19375 }, { "epoch": 2.8865058087578195, "grad_norm": 0.5888049006462097, "learning_rate": 4.9701450953716965e-05, "loss": 0.7264, "num_input_tokens_seen": 11241416, "step": 19380 }, { "epoch": 2.8872505212987787, "grad_norm": 0.8884139657020569, "learning_rate": 4.9700950067176e-05, "loss": 0.6627, "num_input_tokens_seen": 11244584, "step": 19385 }, { "epoch": 2.887995233839738, "grad_norm": 1.6907634735107422, "learning_rate": 4.970044876333759e-05, "loss": 0.7341, "num_input_tokens_seen": 11247976, "step": 19390 }, { "epoch": 2.888739946380697, "grad_norm": 1.3734022378921509, "learning_rate": 4.9699947042210196e-05, "loss": 0.6301, "num_input_tokens_seen": 11251080, "step": 19395 }, { "epoch": 2.8894846589216563, "grad_norm": 0.6732125878334045, "learning_rate": 4.96994449038023e-05, "loss": 0.7744, "num_input_tokens_seen": 11253928, "step": 19400 }, { "epoch": 2.8902293714626155, "grad_norm": 0.9042002558708191, "learning_rate": 4.9698942348122404e-05, "loss": 0.7222, "num_input_tokens_seen": 11256840, "step": 19405 }, { "epoch": 2.8909740840035747, "grad_norm": 0.7833748459815979, "learning_rate": 4.9698439375178965e-05, "loss": 0.6842, "num_input_tokens_seen": 11259496, "step": 19410 }, { "epoch": 2.891718796544534, "grad_norm": 0.8796544671058655, "learning_rate": 4.9697935984980496e-05, "loss": 0.7637, "num_input_tokens_seen": 11262344, "step": 19415 }, { "epoch": 2.892463509085493, "grad_norm": 1.1273504495620728, "learning_rate": 4.96974321775355e-05, "loss": 0.6114, "num_input_tokens_seen": 11265608, "step": 19420 }, { "epoch": 2.8932082216264523, "grad_norm": 0.7046864032745361, "learning_rate": 4.969692795285249e-05, "loss": 0.7206, "num_input_tokens_seen": 11268584, "step": 19425 }, { "epoch": 2.8939529341674115, "grad_norm": 1.5189299583435059, "learning_rate": 4.9696423310939985e-05, "loss": 0.7876, "num_input_tokens_seen": 11271208, "step": 19430 }, { "epoch": 2.8946976467083707, "grad_norm": 1.503646731376648, "learning_rate": 4.9695918251806506e-05, "loss": 0.6986, "num_input_tokens_seen": 11274056, "step": 19435 }, { "epoch": 2.89544235924933, "grad_norm": 1.2256923913955688, "learning_rate": 4.969541277546059e-05, "loss": 0.7269, "num_input_tokens_seen": 11277096, "step": 19440 }, { "epoch": 2.896187071790289, "grad_norm": 0.6860061287879944, "learning_rate": 4.9694906881910776e-05, "loss": 0.6362, "num_input_tokens_seen": 11279944, "step": 19445 }, { "epoch": 2.8969317843312483, "grad_norm": 0.8027641773223877, "learning_rate": 4.969440057116561e-05, "loss": 0.8228, "num_input_tokens_seen": 11282728, "step": 19450 }, { "epoch": 2.897676496872207, "grad_norm": 1.286210536956787, "learning_rate": 4.969389384323364e-05, "loss": 0.7653, "num_input_tokens_seen": 11285448, "step": 19455 }, { "epoch": 2.8984212094131667, "grad_norm": 0.6336888074874878, "learning_rate": 4.969338669812343e-05, "loss": 0.7783, "num_input_tokens_seen": 11288168, "step": 19460 }, { "epoch": 2.8991659219541255, "grad_norm": 1.272995114326477, "learning_rate": 4.969287913584355e-05, "loss": 0.597, "num_input_tokens_seen": 11291048, "step": 19465 }, { "epoch": 2.899910634495085, "grad_norm": 1.5278011560440063, "learning_rate": 4.969237115640258e-05, "loss": 0.6306, "num_input_tokens_seen": 11293768, "step": 19470 }, { "epoch": 2.900655347036044, "grad_norm": 0.677172839641571, "learning_rate": 4.969186275980909e-05, "loss": 0.6928, "num_input_tokens_seen": 11296808, "step": 19475 }, { "epoch": 2.9014000595770035, "grad_norm": 0.6487923860549927, "learning_rate": 4.969135394607167e-05, "loss": 0.6857, "num_input_tokens_seen": 11299432, "step": 19480 }, { "epoch": 2.9021447721179623, "grad_norm": 0.81817626953125, "learning_rate": 4.969084471519893e-05, "loss": 0.6746, "num_input_tokens_seen": 11302632, "step": 19485 }, { "epoch": 2.9028894846589215, "grad_norm": 0.9464057683944702, "learning_rate": 4.9690335067199464e-05, "loss": 0.7153, "num_input_tokens_seen": 11305480, "step": 19490 }, { "epoch": 2.9036341971998807, "grad_norm": 0.8471247553825378, "learning_rate": 4.9689825002081866e-05, "loss": 0.6715, "num_input_tokens_seen": 11308200, "step": 19495 }, { "epoch": 2.90437890974084, "grad_norm": 1.2299734354019165, "learning_rate": 4.9689314519854786e-05, "loss": 0.6689, "num_input_tokens_seen": 11311112, "step": 19500 }, { "epoch": 2.905123622281799, "grad_norm": 0.9275200963020325, "learning_rate": 4.968880362052682e-05, "loss": 0.4984, "num_input_tokens_seen": 11314152, "step": 19505 }, { "epoch": 2.9058683348227583, "grad_norm": 0.992693305015564, "learning_rate": 4.968829230410661e-05, "loss": 0.5458, "num_input_tokens_seen": 11316904, "step": 19510 }, { "epoch": 2.9066130473637175, "grad_norm": 0.8444511294364929, "learning_rate": 4.96877805706028e-05, "loss": 0.7117, "num_input_tokens_seen": 11320040, "step": 19515 }, { "epoch": 2.9073577599046767, "grad_norm": 1.0777032375335693, "learning_rate": 4.968726842002402e-05, "loss": 0.6605, "num_input_tokens_seen": 11322920, "step": 19520 }, { "epoch": 2.908102472445636, "grad_norm": 0.8469080328941345, "learning_rate": 4.968675585237894e-05, "loss": 0.662, "num_input_tokens_seen": 11325768, "step": 19525 }, { "epoch": 2.908847184986595, "grad_norm": 1.0110746622085571, "learning_rate": 4.9686242867676204e-05, "loss": 0.6059, "num_input_tokens_seen": 11328488, "step": 19530 }, { "epoch": 2.9095918975275543, "grad_norm": 1.0684486627578735, "learning_rate": 4.968572946592448e-05, "loss": 0.7254, "num_input_tokens_seen": 11331304, "step": 19535 }, { "epoch": 2.9103366100685135, "grad_norm": 1.1361223459243774, "learning_rate": 4.968521564713246e-05, "loss": 0.7614, "num_input_tokens_seen": 11334120, "step": 19540 }, { "epoch": 2.9110813226094727, "grad_norm": 0.608575701713562, "learning_rate": 4.9684701411308796e-05, "loss": 0.5802, "num_input_tokens_seen": 11336712, "step": 19545 }, { "epoch": 2.911826035150432, "grad_norm": 0.8499637842178345, "learning_rate": 4.9684186758462205e-05, "loss": 0.5817, "num_input_tokens_seen": 11339656, "step": 19550 }, { "epoch": 2.912570747691391, "grad_norm": 0.8962589502334595, "learning_rate": 4.968367168860136e-05, "loss": 0.8524, "num_input_tokens_seen": 11342632, "step": 19555 }, { "epoch": 2.9133154602323503, "grad_norm": 1.606188416481018, "learning_rate": 4.968315620173496e-05, "loss": 0.7398, "num_input_tokens_seen": 11345768, "step": 19560 }, { "epoch": 2.9140601727733095, "grad_norm": 0.9849572777748108, "learning_rate": 4.968264029787173e-05, "loss": 0.5536, "num_input_tokens_seen": 11348552, "step": 19565 }, { "epoch": 2.9148048853142687, "grad_norm": 1.043923020362854, "learning_rate": 4.9682123977020385e-05, "loss": 0.7333, "num_input_tokens_seen": 11351336, "step": 19570 }, { "epoch": 2.915549597855228, "grad_norm": 1.8293238878250122, "learning_rate": 4.968160723918963e-05, "loss": 0.8644, "num_input_tokens_seen": 11354024, "step": 19575 }, { "epoch": 2.916294310396187, "grad_norm": 1.418134331703186, "learning_rate": 4.968109008438821e-05, "loss": 0.7698, "num_input_tokens_seen": 11356840, "step": 19580 }, { "epoch": 2.9170390229371463, "grad_norm": 1.008885383605957, "learning_rate": 4.9680572512624865e-05, "loss": 0.6279, "num_input_tokens_seen": 11359656, "step": 19585 }, { "epoch": 2.9177837354781055, "grad_norm": 1.2812386751174927, "learning_rate": 4.968005452390832e-05, "loss": 0.7391, "num_input_tokens_seen": 11362472, "step": 19590 }, { "epoch": 2.9185284480190647, "grad_norm": 0.7956868410110474, "learning_rate": 4.967953611824735e-05, "loss": 0.6329, "num_input_tokens_seen": 11365672, "step": 19595 }, { "epoch": 2.919273160560024, "grad_norm": 1.1255930662155151, "learning_rate": 4.9679017295650694e-05, "loss": 0.7141, "num_input_tokens_seen": 11368552, "step": 19600 }, { "epoch": 2.920017873100983, "grad_norm": 0.7184685468673706, "learning_rate": 4.9678498056127124e-05, "loss": 0.6925, "num_input_tokens_seen": 11371528, "step": 19605 }, { "epoch": 2.9207625856419424, "grad_norm": 1.3675756454467773, "learning_rate": 4.967797839968541e-05, "loss": 0.566, "num_input_tokens_seen": 11374440, "step": 19610 }, { "epoch": 2.9215072981829016, "grad_norm": 1.1363773345947266, "learning_rate": 4.9677458326334336e-05, "loss": 0.5773, "num_input_tokens_seen": 11377064, "step": 19615 }, { "epoch": 2.9222520107238603, "grad_norm": 0.909507155418396, "learning_rate": 4.967693783608268e-05, "loss": 0.6765, "num_input_tokens_seen": 11379944, "step": 19620 }, { "epoch": 2.92299672326482, "grad_norm": 1.1636849641799927, "learning_rate": 4.967641692893924e-05, "loss": 0.7395, "num_input_tokens_seen": 11382760, "step": 19625 }, { "epoch": 2.9237414358057787, "grad_norm": 1.1737955808639526, "learning_rate": 4.967589560491282e-05, "loss": 0.5655, "num_input_tokens_seen": 11385704, "step": 19630 }, { "epoch": 2.9244861483467384, "grad_norm": 3.8546600341796875, "learning_rate": 4.967537386401222e-05, "loss": 0.8441, "num_input_tokens_seen": 11388392, "step": 19635 }, { "epoch": 2.925230860887697, "grad_norm": 1.0968683958053589, "learning_rate": 4.967485170624625e-05, "loss": 0.6653, "num_input_tokens_seen": 11391208, "step": 19640 }, { "epoch": 2.9259755734286568, "grad_norm": 0.9285271167755127, "learning_rate": 4.9674329131623756e-05, "loss": 0.5915, "num_input_tokens_seen": 11394280, "step": 19645 }, { "epoch": 2.9267202859696155, "grad_norm": 0.9087551236152649, "learning_rate": 4.967380614015354e-05, "loss": 0.7535, "num_input_tokens_seen": 11397256, "step": 19650 }, { "epoch": 2.927464998510575, "grad_norm": 1.2969295978546143, "learning_rate": 4.9673282731844444e-05, "loss": 0.6684, "num_input_tokens_seen": 11400328, "step": 19655 }, { "epoch": 2.928209711051534, "grad_norm": 1.185911774635315, "learning_rate": 4.967275890670532e-05, "loss": 0.7228, "num_input_tokens_seen": 11403176, "step": 19660 }, { "epoch": 2.928954423592493, "grad_norm": 1.11610746383667, "learning_rate": 4.967223466474501e-05, "loss": 0.7535, "num_input_tokens_seen": 11405992, "step": 19665 }, { "epoch": 2.9296991361334523, "grad_norm": 1.2259113788604736, "learning_rate": 4.967171000597236e-05, "loss": 0.6022, "num_input_tokens_seen": 11408808, "step": 19670 }, { "epoch": 2.9304438486744115, "grad_norm": 1.4498902559280396, "learning_rate": 4.967118493039625e-05, "loss": 0.7726, "num_input_tokens_seen": 11411624, "step": 19675 }, { "epoch": 2.9311885612153707, "grad_norm": 0.7590456008911133, "learning_rate": 4.9670659438025545e-05, "loss": 0.7086, "num_input_tokens_seen": 11414536, "step": 19680 }, { "epoch": 2.93193327375633, "grad_norm": 0.8092650175094604, "learning_rate": 4.967013352886913e-05, "loss": 0.7501, "num_input_tokens_seen": 11417384, "step": 19685 }, { "epoch": 2.932677986297289, "grad_norm": 1.45906400680542, "learning_rate": 4.9669607202935876e-05, "loss": 0.7297, "num_input_tokens_seen": 11420296, "step": 19690 }, { "epoch": 2.9334226988382484, "grad_norm": 1.5233768224716187, "learning_rate": 4.966908046023468e-05, "loss": 0.5754, "num_input_tokens_seen": 11423144, "step": 19695 }, { "epoch": 2.9341674113792076, "grad_norm": 0.7296062707901001, "learning_rate": 4.966855330077445e-05, "loss": 0.5408, "num_input_tokens_seen": 11426120, "step": 19700 }, { "epoch": 2.9349121239201668, "grad_norm": 1.5854848623275757, "learning_rate": 4.966802572456408e-05, "loss": 0.6851, "num_input_tokens_seen": 11429000, "step": 19705 }, { "epoch": 2.935656836461126, "grad_norm": 0.48412689566612244, "learning_rate": 4.966749773161249e-05, "loss": 0.7366, "num_input_tokens_seen": 11431688, "step": 19710 }, { "epoch": 2.936401549002085, "grad_norm": 0.9085178971290588, "learning_rate": 4.966696932192859e-05, "loss": 0.5814, "num_input_tokens_seen": 11434376, "step": 19715 }, { "epoch": 2.9371462615430444, "grad_norm": 0.6314709186553955, "learning_rate": 4.9666440495521313e-05, "loss": 0.802, "num_input_tokens_seen": 11437320, "step": 19720 }, { "epoch": 2.9378909740840036, "grad_norm": 0.9798633456230164, "learning_rate": 4.96659112523996e-05, "loss": 0.7855, "num_input_tokens_seen": 11440168, "step": 19725 }, { "epoch": 2.9386356866249628, "grad_norm": 0.9967576265335083, "learning_rate": 4.9665381592572387e-05, "loss": 0.6942, "num_input_tokens_seen": 11442984, "step": 19730 }, { "epoch": 2.939380399165922, "grad_norm": 1.0574390888214111, "learning_rate": 4.9664851516048615e-05, "loss": 0.6314, "num_input_tokens_seen": 11446056, "step": 19735 }, { "epoch": 2.940125111706881, "grad_norm": 1.1784144639968872, "learning_rate": 4.9664321022837244e-05, "loss": 0.6988, "num_input_tokens_seen": 11449000, "step": 19740 }, { "epoch": 2.9408698242478404, "grad_norm": 1.4472079277038574, "learning_rate": 4.966379011294724e-05, "loss": 0.5941, "num_input_tokens_seen": 11451560, "step": 19745 }, { "epoch": 2.9416145367887996, "grad_norm": 0.9381048679351807, "learning_rate": 4.966325878638757e-05, "loss": 0.7206, "num_input_tokens_seen": 11454856, "step": 19750 }, { "epoch": 2.942359249329759, "grad_norm": 0.962219774723053, "learning_rate": 4.966272704316721e-05, "loss": 0.77, "num_input_tokens_seen": 11457832, "step": 19755 }, { "epoch": 2.943103961870718, "grad_norm": 0.9693194031715393, "learning_rate": 4.966219488329514e-05, "loss": 0.5976, "num_input_tokens_seen": 11460904, "step": 19760 }, { "epoch": 2.943848674411677, "grad_norm": 2.4069197177886963, "learning_rate": 4.966166230678035e-05, "loss": 0.7304, "num_input_tokens_seen": 11465064, "step": 19765 }, { "epoch": 2.9445933869526364, "grad_norm": 0.7382439374923706, "learning_rate": 4.966112931363185e-05, "loss": 0.6663, "num_input_tokens_seen": 11468168, "step": 19770 }, { "epoch": 2.9453380994935956, "grad_norm": 0.8105363845825195, "learning_rate": 4.966059590385863e-05, "loss": 0.7815, "num_input_tokens_seen": 11471208, "step": 19775 }, { "epoch": 2.946082812034555, "grad_norm": 0.9078127145767212, "learning_rate": 4.9660062077469706e-05, "loss": 0.7334, "num_input_tokens_seen": 11474376, "step": 19780 }, { "epoch": 2.946827524575514, "grad_norm": 1.0041247606277466, "learning_rate": 4.965952783447409e-05, "loss": 0.7983, "num_input_tokens_seen": 11477320, "step": 19785 }, { "epoch": 2.947572237116473, "grad_norm": 0.8760926127433777, "learning_rate": 4.965899317488082e-05, "loss": 0.6484, "num_input_tokens_seen": 11480136, "step": 19790 }, { "epoch": 2.948316949657432, "grad_norm": 1.0088165998458862, "learning_rate": 4.9658458098698926e-05, "loss": 0.5775, "num_input_tokens_seen": 11483176, "step": 19795 }, { "epoch": 2.9490616621983916, "grad_norm": 0.741693377494812, "learning_rate": 4.965792260593744e-05, "loss": 0.6939, "num_input_tokens_seen": 11485704, "step": 19800 }, { "epoch": 2.9498063747393504, "grad_norm": 1.069855809211731, "learning_rate": 4.965738669660541e-05, "loss": 0.5497, "num_input_tokens_seen": 11488552, "step": 19805 }, { "epoch": 2.95055108728031, "grad_norm": 1.9011757373809814, "learning_rate": 4.96568503707119e-05, "loss": 0.8301, "num_input_tokens_seen": 11491336, "step": 19810 }, { "epoch": 2.9512957998212688, "grad_norm": 0.7458236217498779, "learning_rate": 4.965631362826596e-05, "loss": 0.6696, "num_input_tokens_seen": 11494376, "step": 19815 }, { "epoch": 2.9520405123622284, "grad_norm": 0.9477102160453796, "learning_rate": 4.965577646927666e-05, "loss": 0.5861, "num_input_tokens_seen": 11497352, "step": 19820 }, { "epoch": 2.952785224903187, "grad_norm": 0.5744551420211792, "learning_rate": 4.965523889375308e-05, "loss": 0.6572, "num_input_tokens_seen": 11500008, "step": 19825 }, { "epoch": 2.953529937444147, "grad_norm": 0.7157144546508789, "learning_rate": 4.9654700901704286e-05, "loss": 0.7303, "num_input_tokens_seen": 11502824, "step": 19830 }, { "epoch": 2.9542746499851056, "grad_norm": 0.9896833300590515, "learning_rate": 4.965416249313939e-05, "loss": 0.7439, "num_input_tokens_seen": 11505832, "step": 19835 }, { "epoch": 2.955019362526065, "grad_norm": 0.7434048652648926, "learning_rate": 4.965362366806747e-05, "loss": 0.6304, "num_input_tokens_seen": 11508872, "step": 19840 }, { "epoch": 2.955764075067024, "grad_norm": 1.187970757484436, "learning_rate": 4.9653084426497633e-05, "loss": 0.6757, "num_input_tokens_seen": 11511464, "step": 19845 }, { "epoch": 2.956508787607983, "grad_norm": 1.2612900733947754, "learning_rate": 4.965254476843899e-05, "loss": 0.6346, "num_input_tokens_seen": 11514216, "step": 19850 }, { "epoch": 2.9572535001489424, "grad_norm": 0.9114896655082703, "learning_rate": 4.965200469390067e-05, "loss": 0.5809, "num_input_tokens_seen": 11517480, "step": 19855 }, { "epoch": 2.9579982126899016, "grad_norm": 1.157301902770996, "learning_rate": 4.965146420289177e-05, "loss": 0.7504, "num_input_tokens_seen": 11520136, "step": 19860 }, { "epoch": 2.958742925230861, "grad_norm": 0.8840148448944092, "learning_rate": 4.965092329542145e-05, "loss": 0.5838, "num_input_tokens_seen": 11522952, "step": 19865 }, { "epoch": 2.95948763777182, "grad_norm": 1.212733268737793, "learning_rate": 4.9650381971498824e-05, "loss": 0.8529, "num_input_tokens_seen": 11525960, "step": 19870 }, { "epoch": 2.960232350312779, "grad_norm": 1.0931423902511597, "learning_rate": 4.964984023113306e-05, "loss": 0.6735, "num_input_tokens_seen": 11528488, "step": 19875 }, { "epoch": 2.9609770628537384, "grad_norm": 0.5419098138809204, "learning_rate": 4.9649298074333294e-05, "loss": 0.5747, "num_input_tokens_seen": 11531432, "step": 19880 }, { "epoch": 2.9617217753946976, "grad_norm": 1.1992794275283813, "learning_rate": 4.964875550110869e-05, "loss": 0.6392, "num_input_tokens_seen": 11534248, "step": 19885 }, { "epoch": 2.962466487935657, "grad_norm": 0.6335030794143677, "learning_rate": 4.964821251146841e-05, "loss": 0.6533, "num_input_tokens_seen": 11537128, "step": 19890 }, { "epoch": 2.963211200476616, "grad_norm": 1.0678737163543701, "learning_rate": 4.964766910542164e-05, "loss": 0.7819, "num_input_tokens_seen": 11540264, "step": 19895 }, { "epoch": 2.963955913017575, "grad_norm": 0.8378621935844421, "learning_rate": 4.9647125282977536e-05, "loss": 0.8528, "num_input_tokens_seen": 11543144, "step": 19900 }, { "epoch": 2.9647006255585344, "grad_norm": 0.8622934222221375, "learning_rate": 4.964658104414531e-05, "loss": 0.5844, "num_input_tokens_seen": 11546152, "step": 19905 }, { "epoch": 2.9654453380994936, "grad_norm": 0.6172409653663635, "learning_rate": 4.964603638893415e-05, "loss": 0.6497, "num_input_tokens_seen": 11548744, "step": 19910 }, { "epoch": 2.966190050640453, "grad_norm": 0.9121353030204773, "learning_rate": 4.9645491317353246e-05, "loss": 0.6482, "num_input_tokens_seen": 11551560, "step": 19915 }, { "epoch": 2.966934763181412, "grad_norm": 1.1014068126678467, "learning_rate": 4.9644945829411815e-05, "loss": 0.656, "num_input_tokens_seen": 11554600, "step": 19920 }, { "epoch": 2.9676794757223712, "grad_norm": 0.818824291229248, "learning_rate": 4.964439992511908e-05, "loss": 0.7268, "num_input_tokens_seen": 11557704, "step": 19925 }, { "epoch": 2.9684241882633304, "grad_norm": 1.5398486852645874, "learning_rate": 4.964385360448425e-05, "loss": 0.7345, "num_input_tokens_seen": 11560424, "step": 19930 }, { "epoch": 2.9691689008042896, "grad_norm": 0.6560568809509277, "learning_rate": 4.964330686751656e-05, "loss": 0.7672, "num_input_tokens_seen": 11563496, "step": 19935 }, { "epoch": 2.969913613345249, "grad_norm": 0.627864420413971, "learning_rate": 4.964275971422525e-05, "loss": 0.7205, "num_input_tokens_seen": 11566376, "step": 19940 }, { "epoch": 2.970658325886208, "grad_norm": 0.7133232355117798, "learning_rate": 4.964221214461956e-05, "loss": 0.5743, "num_input_tokens_seen": 11569128, "step": 19945 }, { "epoch": 2.9714030384271672, "grad_norm": 0.8773500323295593, "learning_rate": 4.964166415870874e-05, "loss": 0.5492, "num_input_tokens_seen": 11571912, "step": 19950 }, { "epoch": 2.9721477509681264, "grad_norm": 1.109412670135498, "learning_rate": 4.964111575650205e-05, "loss": 0.6836, "num_input_tokens_seen": 11574696, "step": 19955 }, { "epoch": 2.9728924635090856, "grad_norm": 1.02341890335083, "learning_rate": 4.9640566938008745e-05, "loss": 0.7683, "num_input_tokens_seen": 11577512, "step": 19960 }, { "epoch": 2.973637176050045, "grad_norm": 0.6499743461608887, "learning_rate": 4.964001770323812e-05, "loss": 0.5854, "num_input_tokens_seen": 11580616, "step": 19965 }, { "epoch": 2.9743818885910036, "grad_norm": 2.310283660888672, "learning_rate": 4.9639468052199426e-05, "loss": 0.8252, "num_input_tokens_seen": 11583368, "step": 19970 }, { "epoch": 2.9751266011319633, "grad_norm": 1.0720245838165283, "learning_rate": 4.963891798490197e-05, "loss": 0.6271, "num_input_tokens_seen": 11586504, "step": 19975 }, { "epoch": 2.975871313672922, "grad_norm": 0.9140868186950684, "learning_rate": 4.963836750135503e-05, "loss": 0.6172, "num_input_tokens_seen": 11589352, "step": 19980 }, { "epoch": 2.9766160262138817, "grad_norm": 0.97134929895401, "learning_rate": 4.963781660156792e-05, "loss": 0.7272, "num_input_tokens_seen": 11592360, "step": 19985 }, { "epoch": 2.9773607387548404, "grad_norm": 1.090682864189148, "learning_rate": 4.9637265285549935e-05, "loss": 0.7046, "num_input_tokens_seen": 11595272, "step": 19990 }, { "epoch": 2.9781054512958, "grad_norm": 0.9683828353881836, "learning_rate": 4.9636713553310396e-05, "loss": 0.6831, "num_input_tokens_seen": 11598408, "step": 19995 }, { "epoch": 2.978850163836759, "grad_norm": 1.1691066026687622, "learning_rate": 4.963616140485862e-05, "loss": 0.6249, "num_input_tokens_seen": 11601000, "step": 20000 }, { "epoch": 2.9795948763777185, "grad_norm": 1.042540192604065, "learning_rate": 4.963560884020393e-05, "loss": 0.7064, "num_input_tokens_seen": 11603784, "step": 20005 }, { "epoch": 2.9803395889186772, "grad_norm": 1.7165045738220215, "learning_rate": 4.963505585935567e-05, "loss": 0.7522, "num_input_tokens_seen": 11606728, "step": 20010 }, { "epoch": 2.9810843014596364, "grad_norm": 0.82503342628479, "learning_rate": 4.9634502462323186e-05, "loss": 0.6883, "num_input_tokens_seen": 11609672, "step": 20015 }, { "epoch": 2.9818290140005956, "grad_norm": 0.7549121975898743, "learning_rate": 4.9633948649115816e-05, "loss": 0.5927, "num_input_tokens_seen": 11612584, "step": 20020 }, { "epoch": 2.982573726541555, "grad_norm": 1.4921098947525024, "learning_rate": 4.9633394419742917e-05, "loss": 0.6528, "num_input_tokens_seen": 11615560, "step": 20025 }, { "epoch": 2.983318439082514, "grad_norm": 0.6626343131065369, "learning_rate": 4.963283977421386e-05, "loss": 0.5246, "num_input_tokens_seen": 11618664, "step": 20030 }, { "epoch": 2.9840631516234732, "grad_norm": 0.7316843867301941, "learning_rate": 4.9632284712538005e-05, "loss": 0.6169, "num_input_tokens_seen": 11621512, "step": 20035 }, { "epoch": 2.9848078641644324, "grad_norm": 1.2898197174072266, "learning_rate": 4.9631729234724736e-05, "loss": 0.5554, "num_input_tokens_seen": 11624296, "step": 20040 }, { "epoch": 2.9855525767053916, "grad_norm": 0.5557402968406677, "learning_rate": 4.9631173340783445e-05, "loss": 0.6615, "num_input_tokens_seen": 11627112, "step": 20045 }, { "epoch": 2.986297289246351, "grad_norm": 1.3570972681045532, "learning_rate": 4.96306170307235e-05, "loss": 0.8667, "num_input_tokens_seen": 11630088, "step": 20050 }, { "epoch": 2.98704200178731, "grad_norm": 1.7667851448059082, "learning_rate": 4.963006030455433e-05, "loss": 0.6334, "num_input_tokens_seen": 11632744, "step": 20055 }, { "epoch": 2.9877867143282693, "grad_norm": 1.292932391166687, "learning_rate": 4.962950316228532e-05, "loss": 0.7415, "num_input_tokens_seen": 11636104, "step": 20060 }, { "epoch": 2.9885314268692285, "grad_norm": 0.9508554935455322, "learning_rate": 4.9628945603925884e-05, "loss": 0.6047, "num_input_tokens_seen": 11639112, "step": 20065 }, { "epoch": 2.9892761394101877, "grad_norm": 0.9388192892074585, "learning_rate": 4.9628387629485435e-05, "loss": 0.7013, "num_input_tokens_seen": 11641864, "step": 20070 }, { "epoch": 2.990020851951147, "grad_norm": 0.5933234095573425, "learning_rate": 4.962782923897342e-05, "loss": 0.6319, "num_input_tokens_seen": 11644776, "step": 20075 }, { "epoch": 2.990765564492106, "grad_norm": 1.4070227146148682, "learning_rate": 4.962727043239925e-05, "loss": 0.6377, "num_input_tokens_seen": 11647752, "step": 20080 }, { "epoch": 2.9915102770330653, "grad_norm": 1.3760193586349487, "learning_rate": 4.962671120977238e-05, "loss": 0.7235, "num_input_tokens_seen": 11650536, "step": 20085 }, { "epoch": 2.9922549895740245, "grad_norm": 0.848959743976593, "learning_rate": 4.962615157110226e-05, "loss": 0.7847, "num_input_tokens_seen": 11653256, "step": 20090 }, { "epoch": 2.9929997021149837, "grad_norm": 0.5936391353607178, "learning_rate": 4.9625591516398336e-05, "loss": 0.48, "num_input_tokens_seen": 11656072, "step": 20095 }, { "epoch": 2.993744414655943, "grad_norm": 0.948052704334259, "learning_rate": 4.962503104567007e-05, "loss": 0.6188, "num_input_tokens_seen": 11658984, "step": 20100 }, { "epoch": 2.994489127196902, "grad_norm": 0.6868252754211426, "learning_rate": 4.9624470158926925e-05, "loss": 0.5904, "num_input_tokens_seen": 11662120, "step": 20105 }, { "epoch": 2.9952338397378613, "grad_norm": 1.6901557445526123, "learning_rate": 4.962390885617839e-05, "loss": 0.7169, "num_input_tokens_seen": 11665512, "step": 20110 }, { "epoch": 2.9959785522788205, "grad_norm": 0.8626149296760559, "learning_rate": 4.9623347137433954e-05, "loss": 0.8896, "num_input_tokens_seen": 11668392, "step": 20115 }, { "epoch": 2.9967232648197797, "grad_norm": 0.9118280410766602, "learning_rate": 4.962278500270307e-05, "loss": 0.7828, "num_input_tokens_seen": 11671624, "step": 20120 }, { "epoch": 2.997467977360739, "grad_norm": 1.104519009590149, "learning_rate": 4.9622222451995274e-05, "loss": 0.5775, "num_input_tokens_seen": 11674408, "step": 20125 }, { "epoch": 2.998212689901698, "grad_norm": 0.7397947311401367, "learning_rate": 4.962165948532006e-05, "loss": 0.7492, "num_input_tokens_seen": 11677416, "step": 20130 }, { "epoch": 2.9989574024426573, "grad_norm": 0.795835554599762, "learning_rate": 4.962109610268692e-05, "loss": 0.6829, "num_input_tokens_seen": 11680424, "step": 20135 }, { "epoch": 2.9997021149836165, "grad_norm": 1.1146793365478516, "learning_rate": 4.9620532304105385e-05, "loss": 0.6538, "num_input_tokens_seen": 11683176, "step": 20140 }, { "epoch": 3.0, "eval_loss": 0.6760708093643188, "eval_runtime": 46.9898, "eval_samples_per_second": 63.503, "eval_steps_per_second": 15.876, "num_input_tokens_seen": 11683856, "step": 20142 }, { "epoch": 3.0004468275245757, "grad_norm": 1.1254291534423828, "learning_rate": 4.961996808958499e-05, "loss": 0.4893, "num_input_tokens_seen": 11685584, "step": 20145 }, { "epoch": 3.001191540065535, "grad_norm": 1.170067548751831, "learning_rate": 4.961940345913525e-05, "loss": 0.649, "num_input_tokens_seen": 11688464, "step": 20150 }, { "epoch": 3.001936252606494, "grad_norm": 1.2586140632629395, "learning_rate": 4.961883841276571e-05, "loss": 0.7589, "num_input_tokens_seen": 11691280, "step": 20155 }, { "epoch": 3.002680965147453, "grad_norm": 0.9519256949424744, "learning_rate": 4.961827295048592e-05, "loss": 0.7429, "num_input_tokens_seen": 11694000, "step": 20160 }, { "epoch": 3.003425677688412, "grad_norm": 0.7800664901733398, "learning_rate": 4.961770707230543e-05, "loss": 0.7929, "num_input_tokens_seen": 11697072, "step": 20165 }, { "epoch": 3.0041703902293713, "grad_norm": 0.8139472603797913, "learning_rate": 4.961714077823379e-05, "loss": 0.6023, "num_input_tokens_seen": 11700144, "step": 20170 }, { "epoch": 3.0049151027703305, "grad_norm": 0.7307946085929871, "learning_rate": 4.961657406828059e-05, "loss": 0.6643, "num_input_tokens_seen": 11703120, "step": 20175 }, { "epoch": 3.0056598153112897, "grad_norm": 0.6362242102622986, "learning_rate": 4.961600694245539e-05, "loss": 0.7393, "num_input_tokens_seen": 11705776, "step": 20180 }, { "epoch": 3.006404527852249, "grad_norm": 0.9063222408294678, "learning_rate": 4.961543940076776e-05, "loss": 0.6696, "num_input_tokens_seen": 11708592, "step": 20185 }, { "epoch": 3.007149240393208, "grad_norm": 1.342423439025879, "learning_rate": 4.961487144322731e-05, "loss": 0.7673, "num_input_tokens_seen": 11711472, "step": 20190 }, { "epoch": 3.0078939529341673, "grad_norm": 0.8736290335655212, "learning_rate": 4.961430306984362e-05, "loss": 0.6454, "num_input_tokens_seen": 11714800, "step": 20195 }, { "epoch": 3.0086386654751265, "grad_norm": 1.137830138206482, "learning_rate": 4.9613734280626287e-05, "loss": 0.703, "num_input_tokens_seen": 11717424, "step": 20200 }, { "epoch": 3.0093833780160857, "grad_norm": 1.2981523275375366, "learning_rate": 4.961316507558494e-05, "loss": 0.6484, "num_input_tokens_seen": 11720240, "step": 20205 }, { "epoch": 3.010128090557045, "grad_norm": 0.7786245942115784, "learning_rate": 4.961259545472918e-05, "loss": 0.6119, "num_input_tokens_seen": 11723248, "step": 20210 }, { "epoch": 3.010872803098004, "grad_norm": 1.8218485116958618, "learning_rate": 4.961202541806864e-05, "loss": 0.6493, "num_input_tokens_seen": 11726128, "step": 20215 }, { "epoch": 3.0116175156389633, "grad_norm": 1.3477882146835327, "learning_rate": 4.9611454965612944e-05, "loss": 0.6593, "num_input_tokens_seen": 11729008, "step": 20220 }, { "epoch": 3.0123622281799225, "grad_norm": 0.7397295832633972, "learning_rate": 4.9610884097371736e-05, "loss": 0.4787, "num_input_tokens_seen": 11732016, "step": 20225 }, { "epoch": 3.0131069407208817, "grad_norm": 1.108065128326416, "learning_rate": 4.961031281335464e-05, "loss": 0.7278, "num_input_tokens_seen": 11734928, "step": 20230 }, { "epoch": 3.013851653261841, "grad_norm": 0.8786592483520508, "learning_rate": 4.9609741113571336e-05, "loss": 0.6066, "num_input_tokens_seen": 11737968, "step": 20235 }, { "epoch": 3.0145963658028, "grad_norm": 0.6921672821044922, "learning_rate": 4.960916899803146e-05, "loss": 0.7052, "num_input_tokens_seen": 11740752, "step": 20240 }, { "epoch": 3.0153410783437593, "grad_norm": 1.6311986446380615, "learning_rate": 4.960859646674469e-05, "loss": 0.6322, "num_input_tokens_seen": 11743664, "step": 20245 }, { "epoch": 3.0160857908847185, "grad_norm": 1.3596261739730835, "learning_rate": 4.960802351972069e-05, "loss": 0.7704, "num_input_tokens_seen": 11746608, "step": 20250 }, { "epoch": 3.0168305034256777, "grad_norm": 1.0422000885009766, "learning_rate": 4.960745015696914e-05, "loss": 0.5897, "num_input_tokens_seen": 11749712, "step": 20255 }, { "epoch": 3.017575215966637, "grad_norm": 0.5449340343475342, "learning_rate": 4.960687637849974e-05, "loss": 0.6227, "num_input_tokens_seen": 11752528, "step": 20260 }, { "epoch": 3.018319928507596, "grad_norm": 1.0465209484100342, "learning_rate": 4.960630218432216e-05, "loss": 0.6445, "num_input_tokens_seen": 11755344, "step": 20265 }, { "epoch": 3.0190646410485553, "grad_norm": 0.8416024446487427, "learning_rate": 4.960572757444612e-05, "loss": 0.5556, "num_input_tokens_seen": 11758064, "step": 20270 }, { "epoch": 3.0198093535895145, "grad_norm": 0.8311433792114258, "learning_rate": 4.960515254888133e-05, "loss": 0.5946, "num_input_tokens_seen": 11760848, "step": 20275 }, { "epoch": 3.0205540661304737, "grad_norm": 0.8963088393211365, "learning_rate": 4.9604577107637484e-05, "loss": 0.7703, "num_input_tokens_seen": 11764048, "step": 20280 }, { "epoch": 3.021298778671433, "grad_norm": 1.737532377243042, "learning_rate": 4.960400125072431e-05, "loss": 0.7054, "num_input_tokens_seen": 11767248, "step": 20285 }, { "epoch": 3.022043491212392, "grad_norm": 0.8720256686210632, "learning_rate": 4.960342497815155e-05, "loss": 0.4524, "num_input_tokens_seen": 11770256, "step": 20290 }, { "epoch": 3.0227882037533513, "grad_norm": 0.8675774931907654, "learning_rate": 4.9602848289928926e-05, "loss": 0.651, "num_input_tokens_seen": 11773264, "step": 20295 }, { "epoch": 3.0235329162943105, "grad_norm": 0.6209941506385803, "learning_rate": 4.9602271186066194e-05, "loss": 0.6254, "num_input_tokens_seen": 11776336, "step": 20300 }, { "epoch": 3.0242776288352697, "grad_norm": 0.7571580410003662, "learning_rate": 4.960169366657309e-05, "loss": 0.5308, "num_input_tokens_seen": 11779408, "step": 20305 }, { "epoch": 3.025022341376229, "grad_norm": 0.9835595488548279, "learning_rate": 4.960111573145937e-05, "loss": 0.4927, "num_input_tokens_seen": 11782320, "step": 20310 }, { "epoch": 3.025767053917188, "grad_norm": 1.2526257038116455, "learning_rate": 4.960053738073481e-05, "loss": 0.642, "num_input_tokens_seen": 11786032, "step": 20315 }, { "epoch": 3.0265117664581473, "grad_norm": 1.4976956844329834, "learning_rate": 4.959995861440917e-05, "loss": 0.648, "num_input_tokens_seen": 11789360, "step": 20320 }, { "epoch": 3.0272564789991065, "grad_norm": 0.720436692237854, "learning_rate": 4.959937943249223e-05, "loss": 0.6567, "num_input_tokens_seen": 11792336, "step": 20325 }, { "epoch": 3.0280011915400658, "grad_norm": 1.1989021301269531, "learning_rate": 4.9598799834993784e-05, "loss": 0.7609, "num_input_tokens_seen": 11794992, "step": 20330 }, { "epoch": 3.0287459040810245, "grad_norm": 1.4962687492370605, "learning_rate": 4.9598219821923605e-05, "loss": 0.6938, "num_input_tokens_seen": 11797712, "step": 20335 }, { "epoch": 3.0294906166219837, "grad_norm": 1.3668960332870483, "learning_rate": 4.959763939329152e-05, "loss": 0.7483, "num_input_tokens_seen": 11800496, "step": 20340 }, { "epoch": 3.030235329162943, "grad_norm": 0.6756412982940674, "learning_rate": 4.95970585491073e-05, "loss": 0.6056, "num_input_tokens_seen": 11803248, "step": 20345 }, { "epoch": 3.030980041703902, "grad_norm": 0.7506167888641357, "learning_rate": 4.9596477289380786e-05, "loss": 0.6705, "num_input_tokens_seen": 11806000, "step": 20350 }, { "epoch": 3.0317247542448613, "grad_norm": 1.0072083473205566, "learning_rate": 4.959589561412178e-05, "loss": 0.6703, "num_input_tokens_seen": 11808976, "step": 20355 }, { "epoch": 3.0324694667858205, "grad_norm": 0.6685596704483032, "learning_rate": 4.959531352334012e-05, "loss": 0.6951, "num_input_tokens_seen": 11811888, "step": 20360 }, { "epoch": 3.0332141793267797, "grad_norm": 0.837642252445221, "learning_rate": 4.959473101704563e-05, "loss": 0.8125, "num_input_tokens_seen": 11814928, "step": 20365 }, { "epoch": 3.033958891867739, "grad_norm": 1.6781452894210815, "learning_rate": 4.959414809524816e-05, "loss": 0.8065, "num_input_tokens_seen": 11818320, "step": 20370 }, { "epoch": 3.034703604408698, "grad_norm": 1.0625104904174805, "learning_rate": 4.9593564757957554e-05, "loss": 0.6442, "num_input_tokens_seen": 11821072, "step": 20375 }, { "epoch": 3.0354483169496573, "grad_norm": 0.8314680457115173, "learning_rate": 4.959298100518367e-05, "loss": 0.7545, "num_input_tokens_seen": 11823792, "step": 20380 }, { "epoch": 3.0361930294906165, "grad_norm": 0.7830098271369934, "learning_rate": 4.959239683693636e-05, "loss": 0.6943, "num_input_tokens_seen": 11826832, "step": 20385 }, { "epoch": 3.0369377420315757, "grad_norm": 0.723073422908783, "learning_rate": 4.959181225322551e-05, "loss": 0.4798, "num_input_tokens_seen": 11829520, "step": 20390 }, { "epoch": 3.037682454572535, "grad_norm": 1.0576000213623047, "learning_rate": 4.959122725406098e-05, "loss": 0.6959, "num_input_tokens_seen": 11832496, "step": 20395 }, { "epoch": 3.038427167113494, "grad_norm": 1.6270396709442139, "learning_rate": 4.959064183945266e-05, "loss": 0.6707, "num_input_tokens_seen": 11835376, "step": 20400 }, { "epoch": 3.0391718796544533, "grad_norm": 0.8991285562515259, "learning_rate": 4.959005600941043e-05, "loss": 0.6406, "num_input_tokens_seen": 11838384, "step": 20405 }, { "epoch": 3.0399165921954125, "grad_norm": 0.6832647323608398, "learning_rate": 4.958946976394421e-05, "loss": 0.5745, "num_input_tokens_seen": 11841168, "step": 20410 }, { "epoch": 3.0406613047363718, "grad_norm": 1.6767643690109253, "learning_rate": 4.958888310306389e-05, "loss": 0.7552, "num_input_tokens_seen": 11844464, "step": 20415 }, { "epoch": 3.041406017277331, "grad_norm": 1.5482938289642334, "learning_rate": 4.958829602677937e-05, "loss": 0.6569, "num_input_tokens_seen": 11847312, "step": 20420 }, { "epoch": 3.04215072981829, "grad_norm": 0.6484487652778625, "learning_rate": 4.9587708535100584e-05, "loss": 0.5863, "num_input_tokens_seen": 11850256, "step": 20425 }, { "epoch": 3.0428954423592494, "grad_norm": 0.9355227947235107, "learning_rate": 4.958712062803745e-05, "loss": 0.666, "num_input_tokens_seen": 11853136, "step": 20430 }, { "epoch": 3.0436401549002086, "grad_norm": 0.9619458317756653, "learning_rate": 4.958653230559991e-05, "loss": 0.735, "num_input_tokens_seen": 11856304, "step": 20435 }, { "epoch": 3.0443848674411678, "grad_norm": 0.9152141213417053, "learning_rate": 4.958594356779789e-05, "loss": 0.6925, "num_input_tokens_seen": 11859248, "step": 20440 }, { "epoch": 3.045129579982127, "grad_norm": 1.342244029045105, "learning_rate": 4.958535441464134e-05, "loss": 0.6585, "num_input_tokens_seen": 11862032, "step": 20445 }, { "epoch": 3.045874292523086, "grad_norm": 0.7281630039215088, "learning_rate": 4.958476484614022e-05, "loss": 0.6675, "num_input_tokens_seen": 11864912, "step": 20450 }, { "epoch": 3.0466190050640454, "grad_norm": 0.7701177597045898, "learning_rate": 4.958417486230448e-05, "loss": 0.6444, "num_input_tokens_seen": 11867856, "step": 20455 }, { "epoch": 3.0473637176050046, "grad_norm": 1.2265557050704956, "learning_rate": 4.95835844631441e-05, "loss": 0.6297, "num_input_tokens_seen": 11870832, "step": 20460 }, { "epoch": 3.0481084301459638, "grad_norm": 0.9084656238555908, "learning_rate": 4.958299364866903e-05, "loss": 0.7365, "num_input_tokens_seen": 11874640, "step": 20465 }, { "epoch": 3.048853142686923, "grad_norm": 0.6911631226539612, "learning_rate": 4.958240241888928e-05, "loss": 0.6572, "num_input_tokens_seen": 11877296, "step": 20470 }, { "epoch": 3.049597855227882, "grad_norm": 0.9967292547225952, "learning_rate": 4.958181077381482e-05, "loss": 0.5758, "num_input_tokens_seen": 11879792, "step": 20475 }, { "epoch": 3.0503425677688414, "grad_norm": 2.3612351417541504, "learning_rate": 4.958121871345565e-05, "loss": 0.8169, "num_input_tokens_seen": 11882512, "step": 20480 }, { "epoch": 3.0510872803098006, "grad_norm": 1.27973210811615, "learning_rate": 4.958062623782178e-05, "loss": 0.8338, "num_input_tokens_seen": 11885520, "step": 20485 }, { "epoch": 3.05183199285076, "grad_norm": 0.8441765308380127, "learning_rate": 4.958003334692321e-05, "loss": 0.6386, "num_input_tokens_seen": 11888304, "step": 20490 }, { "epoch": 3.052576705391719, "grad_norm": 0.9624778032302856, "learning_rate": 4.957944004076995e-05, "loss": 0.7547, "num_input_tokens_seen": 11891056, "step": 20495 }, { "epoch": 3.053321417932678, "grad_norm": 0.9838938117027283, "learning_rate": 4.957884631937204e-05, "loss": 0.6471, "num_input_tokens_seen": 11893776, "step": 20500 }, { "epoch": 3.054066130473637, "grad_norm": 1.1135700941085815, "learning_rate": 4.9578252182739506e-05, "loss": 0.6688, "num_input_tokens_seen": 11897008, "step": 20505 }, { "epoch": 3.054810843014596, "grad_norm": 1.129950761795044, "learning_rate": 4.957765763088237e-05, "loss": 0.5664, "num_input_tokens_seen": 11900080, "step": 20510 }, { "epoch": 3.0555555555555554, "grad_norm": 0.7314115166664124, "learning_rate": 4.95770626638107e-05, "loss": 0.6765, "num_input_tokens_seen": 11902992, "step": 20515 }, { "epoch": 3.0563002680965146, "grad_norm": 1.0042333602905273, "learning_rate": 4.9576467281534526e-05, "loss": 0.709, "num_input_tokens_seen": 11905840, "step": 20520 }, { "epoch": 3.0570449806374738, "grad_norm": 0.8606306910514832, "learning_rate": 4.9575871484063915e-05, "loss": 0.6883, "num_input_tokens_seen": 11908464, "step": 20525 }, { "epoch": 3.057789693178433, "grad_norm": 0.6896948218345642, "learning_rate": 4.9575275271408944e-05, "loss": 0.5993, "num_input_tokens_seen": 11911312, "step": 20530 }, { "epoch": 3.058534405719392, "grad_norm": 0.8622449040412903, "learning_rate": 4.957467864357967e-05, "loss": 0.6875, "num_input_tokens_seen": 11914192, "step": 20535 }, { "epoch": 3.0592791182603514, "grad_norm": 1.0377299785614014, "learning_rate": 4.9574081600586175e-05, "loss": 0.6412, "num_input_tokens_seen": 11917328, "step": 20540 }, { "epoch": 3.0600238308013106, "grad_norm": 1.2213690280914307, "learning_rate": 4.957348414243855e-05, "loss": 0.8084, "num_input_tokens_seen": 11920048, "step": 20545 }, { "epoch": 3.0607685433422698, "grad_norm": 0.9513378143310547, "learning_rate": 4.9572886269146877e-05, "loss": 0.6764, "num_input_tokens_seen": 11922800, "step": 20550 }, { "epoch": 3.061513255883229, "grad_norm": 0.6886143088340759, "learning_rate": 4.957228798072128e-05, "loss": 0.6598, "num_input_tokens_seen": 11926096, "step": 20555 }, { "epoch": 3.062257968424188, "grad_norm": 0.8922539949417114, "learning_rate": 4.957168927717184e-05, "loss": 0.5646, "num_input_tokens_seen": 11928944, "step": 20560 }, { "epoch": 3.0630026809651474, "grad_norm": 0.8357729315757751, "learning_rate": 4.957109015850868e-05, "loss": 0.6354, "num_input_tokens_seen": 11932240, "step": 20565 }, { "epoch": 3.0637473935061066, "grad_norm": 0.8912544846534729, "learning_rate": 4.957049062474194e-05, "loss": 0.6561, "num_input_tokens_seen": 11935216, "step": 20570 }, { "epoch": 3.064492106047066, "grad_norm": 0.7219530344009399, "learning_rate": 4.956989067588172e-05, "loss": 0.6054, "num_input_tokens_seen": 11938288, "step": 20575 }, { "epoch": 3.065236818588025, "grad_norm": 1.1269547939300537, "learning_rate": 4.956929031193817e-05, "loss": 0.7374, "num_input_tokens_seen": 11940816, "step": 20580 }, { "epoch": 3.065981531128984, "grad_norm": 0.6737085580825806, "learning_rate": 4.956868953292143e-05, "loss": 0.6238, "num_input_tokens_seen": 11943728, "step": 20585 }, { "epoch": 3.0667262436699434, "grad_norm": 0.9526787400245667, "learning_rate": 4.9568088338841664e-05, "loss": 0.6355, "num_input_tokens_seen": 11946544, "step": 20590 }, { "epoch": 3.0674709562109026, "grad_norm": 1.0485178232192993, "learning_rate": 4.9567486729709e-05, "loss": 0.6979, "num_input_tokens_seen": 11949488, "step": 20595 }, { "epoch": 3.068215668751862, "grad_norm": 1.0529197454452515, "learning_rate": 4.956688470553363e-05, "loss": 0.6434, "num_input_tokens_seen": 11952240, "step": 20600 }, { "epoch": 3.068960381292821, "grad_norm": 0.8241246342658997, "learning_rate": 4.95662822663257e-05, "loss": 0.6987, "num_input_tokens_seen": 11954896, "step": 20605 }, { "epoch": 3.06970509383378, "grad_norm": 0.766840398311615, "learning_rate": 4.9565679412095415e-05, "loss": 0.7247, "num_input_tokens_seen": 11957712, "step": 20610 }, { "epoch": 3.0704498063747394, "grad_norm": 1.6165707111358643, "learning_rate": 4.956507614285293e-05, "loss": 0.5495, "num_input_tokens_seen": 11960592, "step": 20615 }, { "epoch": 3.0711945189156986, "grad_norm": 1.1838377714157104, "learning_rate": 4.9564472458608445e-05, "loss": 0.5984, "num_input_tokens_seen": 11963664, "step": 20620 }, { "epoch": 3.071939231456658, "grad_norm": 0.6618584990501404, "learning_rate": 4.956386835937218e-05, "loss": 0.6959, "num_input_tokens_seen": 11966928, "step": 20625 }, { "epoch": 3.072683943997617, "grad_norm": 0.8219633102416992, "learning_rate": 4.9563263845154315e-05, "loss": 0.7733, "num_input_tokens_seen": 11970000, "step": 20630 }, { "epoch": 3.073428656538576, "grad_norm": 1.7079263925552368, "learning_rate": 4.9562658915965075e-05, "loss": 0.6477, "num_input_tokens_seen": 11973040, "step": 20635 }, { "epoch": 3.0741733690795354, "grad_norm": 1.2433277368545532, "learning_rate": 4.956205357181467e-05, "loss": 0.6735, "num_input_tokens_seen": 11976016, "step": 20640 }, { "epoch": 3.0749180816204946, "grad_norm": 1.1153777837753296, "learning_rate": 4.9561447812713345e-05, "loss": 0.6084, "num_input_tokens_seen": 11979120, "step": 20645 }, { "epoch": 3.075662794161454, "grad_norm": 0.9596470594406128, "learning_rate": 4.956084163867132e-05, "loss": 0.5795, "num_input_tokens_seen": 11981776, "step": 20650 }, { "epoch": 3.076407506702413, "grad_norm": 1.3756510019302368, "learning_rate": 4.9560235049698834e-05, "loss": 0.7541, "num_input_tokens_seen": 11984400, "step": 20655 }, { "epoch": 3.0771522192433722, "grad_norm": 0.9084229469299316, "learning_rate": 4.955962804580614e-05, "loss": 0.7442, "num_input_tokens_seen": 11987248, "step": 20660 }, { "epoch": 3.0778969317843314, "grad_norm": 0.6593647599220276, "learning_rate": 4.9559020627003494e-05, "loss": 0.7887, "num_input_tokens_seen": 11990128, "step": 20665 }, { "epoch": 3.0786416443252906, "grad_norm": 0.7568756341934204, "learning_rate": 4.955841279330115e-05, "loss": 0.6029, "num_input_tokens_seen": 11993392, "step": 20670 }, { "epoch": 3.07938635686625, "grad_norm": 1.388264775276184, "learning_rate": 4.9557804544709385e-05, "loss": 0.7413, "num_input_tokens_seen": 11996464, "step": 20675 }, { "epoch": 3.0801310694072086, "grad_norm": 1.4315310716629028, "learning_rate": 4.955719588123847e-05, "loss": 0.5544, "num_input_tokens_seen": 11999472, "step": 20680 }, { "epoch": 3.080875781948168, "grad_norm": 1.3025418519973755, "learning_rate": 4.955658680289869e-05, "loss": 0.7688, "num_input_tokens_seen": 12002416, "step": 20685 }, { "epoch": 3.081620494489127, "grad_norm": 0.8496090769767761, "learning_rate": 4.955597730970034e-05, "loss": 0.7597, "num_input_tokens_seen": 12005456, "step": 20690 }, { "epoch": 3.082365207030086, "grad_norm": 0.6803029179573059, "learning_rate": 4.95553674016537e-05, "loss": 0.5682, "num_input_tokens_seen": 12008464, "step": 20695 }, { "epoch": 3.0831099195710454, "grad_norm": 1.299765944480896, "learning_rate": 4.9554757078769095e-05, "loss": 0.8271, "num_input_tokens_seen": 12011312, "step": 20700 }, { "epoch": 3.0838546321120046, "grad_norm": 0.9387822151184082, "learning_rate": 4.955414634105682e-05, "loss": 0.681, "num_input_tokens_seen": 12014512, "step": 20705 }, { "epoch": 3.084599344652964, "grad_norm": 1.3349173069000244, "learning_rate": 4.95535351885272e-05, "loss": 0.6568, "num_input_tokens_seen": 12017360, "step": 20710 }, { "epoch": 3.085344057193923, "grad_norm": 1.0387376546859741, "learning_rate": 4.955292362119055e-05, "loss": 0.6507, "num_input_tokens_seen": 12020112, "step": 20715 }, { "epoch": 3.086088769734882, "grad_norm": 0.8270565867424011, "learning_rate": 4.955231163905723e-05, "loss": 0.6733, "num_input_tokens_seen": 12022800, "step": 20720 }, { "epoch": 3.0868334822758414, "grad_norm": 1.0110994577407837, "learning_rate": 4.955169924213754e-05, "loss": 0.5835, "num_input_tokens_seen": 12025840, "step": 20725 }, { "epoch": 3.0875781948168006, "grad_norm": 1.3834954500198364, "learning_rate": 4.955108643044185e-05, "loss": 0.8053, "num_input_tokens_seen": 12028784, "step": 20730 }, { "epoch": 3.08832290735776, "grad_norm": 1.0688972473144531, "learning_rate": 4.955047320398051e-05, "loss": 0.6295, "num_input_tokens_seen": 12031408, "step": 20735 }, { "epoch": 3.089067619898719, "grad_norm": 0.8397176265716553, "learning_rate": 4.954985956276388e-05, "loss": 0.6649, "num_input_tokens_seen": 12034224, "step": 20740 }, { "epoch": 3.0898123324396782, "grad_norm": 0.9427405595779419, "learning_rate": 4.954924550680231e-05, "loss": 0.6966, "num_input_tokens_seen": 12036880, "step": 20745 }, { "epoch": 3.0905570449806374, "grad_norm": 0.8099631071090698, "learning_rate": 4.95486310361062e-05, "loss": 0.6549, "num_input_tokens_seen": 12039920, "step": 20750 }, { "epoch": 3.0913017575215966, "grad_norm": 0.7784053087234497, "learning_rate": 4.954801615068592e-05, "loss": 0.6255, "num_input_tokens_seen": 12043088, "step": 20755 }, { "epoch": 3.092046470062556, "grad_norm": 1.8863096237182617, "learning_rate": 4.9547400850551853e-05, "loss": 0.7399, "num_input_tokens_seen": 12045968, "step": 20760 }, { "epoch": 3.092791182603515, "grad_norm": 0.6340493559837341, "learning_rate": 4.9546785135714394e-05, "loss": 0.6843, "num_input_tokens_seen": 12049008, "step": 20765 }, { "epoch": 3.0935358951444742, "grad_norm": 1.6364524364471436, "learning_rate": 4.954616900618395e-05, "loss": 0.7324, "num_input_tokens_seen": 12051856, "step": 20770 }, { "epoch": 3.0942806076854334, "grad_norm": 0.7472470998764038, "learning_rate": 4.954555246197093e-05, "loss": 0.6862, "num_input_tokens_seen": 12054608, "step": 20775 }, { "epoch": 3.0950253202263927, "grad_norm": 1.1659586429595947, "learning_rate": 4.954493550308575e-05, "loss": 0.8515, "num_input_tokens_seen": 12057584, "step": 20780 }, { "epoch": 3.095770032767352, "grad_norm": 0.8140813708305359, "learning_rate": 4.9544318129538824e-05, "loss": 0.5503, "num_input_tokens_seen": 12060272, "step": 20785 }, { "epoch": 3.096514745308311, "grad_norm": 1.3956334590911865, "learning_rate": 4.95437003413406e-05, "loss": 0.6131, "num_input_tokens_seen": 12063120, "step": 20790 }, { "epoch": 3.0972594578492703, "grad_norm": 1.3731465339660645, "learning_rate": 4.9543082138501495e-05, "loss": 0.7639, "num_input_tokens_seen": 12066032, "step": 20795 }, { "epoch": 3.0980041703902295, "grad_norm": 0.991801917552948, "learning_rate": 4.954246352103197e-05, "loss": 0.6652, "num_input_tokens_seen": 12068816, "step": 20800 }, { "epoch": 3.0987488829311887, "grad_norm": 2.298736095428467, "learning_rate": 4.954184448894246e-05, "loss": 0.7961, "num_input_tokens_seen": 12071792, "step": 20805 }, { "epoch": 3.099493595472148, "grad_norm": 0.7354600429534912, "learning_rate": 4.954122504224343e-05, "loss": 0.7281, "num_input_tokens_seen": 12074576, "step": 20810 }, { "epoch": 3.100238308013107, "grad_norm": 0.9737146496772766, "learning_rate": 4.954060518094535e-05, "loss": 0.6452, "num_input_tokens_seen": 12077200, "step": 20815 }, { "epoch": 3.1009830205540663, "grad_norm": 1.1907992362976074, "learning_rate": 4.953998490505868e-05, "loss": 0.6423, "num_input_tokens_seen": 12080048, "step": 20820 }, { "epoch": 3.1017277330950255, "grad_norm": 1.1809041500091553, "learning_rate": 4.953936421459392e-05, "loss": 0.6137, "num_input_tokens_seen": 12082736, "step": 20825 }, { "epoch": 3.1024724456359847, "grad_norm": 1.0609372854232788, "learning_rate": 4.953874310956153e-05, "loss": 0.6736, "num_input_tokens_seen": 12085456, "step": 20830 }, { "epoch": 3.103217158176944, "grad_norm": 1.3131617307662964, "learning_rate": 4.953812158997202e-05, "loss": 0.6768, "num_input_tokens_seen": 12088464, "step": 20835 }, { "epoch": 3.103961870717903, "grad_norm": 0.9394952654838562, "learning_rate": 4.953749965583588e-05, "loss": 0.5856, "num_input_tokens_seen": 12091312, "step": 20840 }, { "epoch": 3.1047065832588623, "grad_norm": 0.9383239150047302, "learning_rate": 4.953687730716363e-05, "loss": 0.5984, "num_input_tokens_seen": 12094256, "step": 20845 }, { "epoch": 3.1054512957998215, "grad_norm": 0.7521320581436157, "learning_rate": 4.9536254543965775e-05, "loss": 0.6029, "num_input_tokens_seen": 12097040, "step": 20850 }, { "epoch": 3.1061960083407802, "grad_norm": 0.725662350654602, "learning_rate": 4.953563136625283e-05, "loss": 0.6365, "num_input_tokens_seen": 12100080, "step": 20855 }, { "epoch": 3.1069407208817394, "grad_norm": 1.9998520612716675, "learning_rate": 4.9535007774035335e-05, "loss": 0.7599, "num_input_tokens_seen": 12102864, "step": 20860 }, { "epoch": 3.1076854334226987, "grad_norm": 0.9465324282646179, "learning_rate": 4.9534383767323825e-05, "loss": 0.7428, "num_input_tokens_seen": 12106064, "step": 20865 }, { "epoch": 3.108430145963658, "grad_norm": 0.9145023822784424, "learning_rate": 4.9533759346128824e-05, "loss": 0.67, "num_input_tokens_seen": 12108784, "step": 20870 }, { "epoch": 3.109174858504617, "grad_norm": 0.8012177348136902, "learning_rate": 4.953313451046091e-05, "loss": 0.6769, "num_input_tokens_seen": 12111344, "step": 20875 }, { "epoch": 3.1099195710455763, "grad_norm": 0.9325191974639893, "learning_rate": 4.9532509260330615e-05, "loss": 0.7471, "num_input_tokens_seen": 12114224, "step": 20880 }, { "epoch": 3.1106642835865355, "grad_norm": 1.1490232944488525, "learning_rate": 4.953188359574851e-05, "loss": 0.6875, "num_input_tokens_seen": 12117328, "step": 20885 }, { "epoch": 3.1114089961274947, "grad_norm": 1.7599592208862305, "learning_rate": 4.953125751672516e-05, "loss": 0.4476, "num_input_tokens_seen": 12120016, "step": 20890 }, { "epoch": 3.112153708668454, "grad_norm": 1.6065974235534668, "learning_rate": 4.953063102327115e-05, "loss": 0.5672, "num_input_tokens_seen": 12122672, "step": 20895 }, { "epoch": 3.112898421209413, "grad_norm": 0.8743595480918884, "learning_rate": 4.953000411539706e-05, "loss": 0.7268, "num_input_tokens_seen": 12125648, "step": 20900 }, { "epoch": 3.1136431337503723, "grad_norm": 0.8250860571861267, "learning_rate": 4.952937679311348e-05, "loss": 0.7796, "num_input_tokens_seen": 12128464, "step": 20905 }, { "epoch": 3.1143878462913315, "grad_norm": 1.115289330482483, "learning_rate": 4.9528749056431015e-05, "loss": 0.6944, "num_input_tokens_seen": 12131376, "step": 20910 }, { "epoch": 3.1151325588322907, "grad_norm": 0.9730908274650574, "learning_rate": 4.9528120905360265e-05, "loss": 0.6316, "num_input_tokens_seen": 12134672, "step": 20915 }, { "epoch": 3.11587727137325, "grad_norm": 1.0511997938156128, "learning_rate": 4.9527492339911836e-05, "loss": 0.693, "num_input_tokens_seen": 12137616, "step": 20920 }, { "epoch": 3.116621983914209, "grad_norm": 1.3080931901931763, "learning_rate": 4.952686336009635e-05, "loss": 0.677, "num_input_tokens_seen": 12140496, "step": 20925 }, { "epoch": 3.1173666964551683, "grad_norm": 1.6263047456741333, "learning_rate": 4.952623396592445e-05, "loss": 0.5407, "num_input_tokens_seen": 12143632, "step": 20930 }, { "epoch": 3.1181114089961275, "grad_norm": 0.8080570101737976, "learning_rate": 4.952560415740674e-05, "loss": 0.5432, "num_input_tokens_seen": 12146448, "step": 20935 }, { "epoch": 3.1188561215370867, "grad_norm": 1.196099877357483, "learning_rate": 4.9524973934553884e-05, "loss": 0.673, "num_input_tokens_seen": 12149296, "step": 20940 }, { "epoch": 3.119600834078046, "grad_norm": 0.9517825245857239, "learning_rate": 4.952434329737651e-05, "loss": 0.7682, "num_input_tokens_seen": 12152144, "step": 20945 }, { "epoch": 3.120345546619005, "grad_norm": 0.7566289305686951, "learning_rate": 4.952371224588529e-05, "loss": 0.6981, "num_input_tokens_seen": 12155248, "step": 20950 }, { "epoch": 3.1210902591599643, "grad_norm": 1.2648576498031616, "learning_rate": 4.952308078009087e-05, "loss": 0.5927, "num_input_tokens_seen": 12158192, "step": 20955 }, { "epoch": 3.1218349717009235, "grad_norm": 1.3264548778533936, "learning_rate": 4.9522448900003925e-05, "loss": 0.6789, "num_input_tokens_seen": 12161232, "step": 20960 }, { "epoch": 3.1225796842418827, "grad_norm": 1.0863243341445923, "learning_rate": 4.952181660563514e-05, "loss": 0.632, "num_input_tokens_seen": 12163984, "step": 20965 }, { "epoch": 3.123324396782842, "grad_norm": 1.1843726634979248, "learning_rate": 4.952118389699517e-05, "loss": 0.7729, "num_input_tokens_seen": 12167152, "step": 20970 }, { "epoch": 3.124069109323801, "grad_norm": 0.9491707682609558, "learning_rate": 4.9520550774094735e-05, "loss": 0.7154, "num_input_tokens_seen": 12169968, "step": 20975 }, { "epoch": 3.1248138218647603, "grad_norm": 0.8319495320320129, "learning_rate": 4.9519917236944504e-05, "loss": 0.5402, "num_input_tokens_seen": 12173072, "step": 20980 }, { "epoch": 3.1255585344057195, "grad_norm": 1.145868182182312, "learning_rate": 4.9519283285555195e-05, "loss": 0.6211, "num_input_tokens_seen": 12176144, "step": 20985 }, { "epoch": 3.1263032469466787, "grad_norm": 0.6079201698303223, "learning_rate": 4.951864891993752e-05, "loss": 0.6617, "num_input_tokens_seen": 12178832, "step": 20990 }, { "epoch": 3.127047959487638, "grad_norm": 0.9527955055236816, "learning_rate": 4.951801414010219e-05, "loss": 0.5047, "num_input_tokens_seen": 12181392, "step": 20995 }, { "epoch": 3.127792672028597, "grad_norm": 0.884604811668396, "learning_rate": 4.9517378946059936e-05, "loss": 0.783, "num_input_tokens_seen": 12184272, "step": 21000 }, { "epoch": 3.1285373845695563, "grad_norm": 1.3110264539718628, "learning_rate": 4.951674333782147e-05, "loss": 0.736, "num_input_tokens_seen": 12187152, "step": 21005 }, { "epoch": 3.1292820971105155, "grad_norm": 0.8190916776657104, "learning_rate": 4.9516107315397554e-05, "loss": 0.6355, "num_input_tokens_seen": 12190352, "step": 21010 }, { "epoch": 3.1300268096514747, "grad_norm": 1.4214941263198853, "learning_rate": 4.951547087879891e-05, "loss": 0.6416, "num_input_tokens_seen": 12193168, "step": 21015 }, { "epoch": 3.1307715221924335, "grad_norm": 2.778391122817993, "learning_rate": 4.951483402803631e-05, "loss": 0.8371, "num_input_tokens_seen": 12195984, "step": 21020 }, { "epoch": 3.131516234733393, "grad_norm": 0.7788795232772827, "learning_rate": 4.95141967631205e-05, "loss": 0.5632, "num_input_tokens_seen": 12198960, "step": 21025 }, { "epoch": 3.132260947274352, "grad_norm": 2.4197006225585938, "learning_rate": 4.951355908406226e-05, "loss": 0.6065, "num_input_tokens_seen": 12201712, "step": 21030 }, { "epoch": 3.133005659815311, "grad_norm": 0.9946309328079224, "learning_rate": 4.951292099087235e-05, "loss": 0.5893, "num_input_tokens_seen": 12204528, "step": 21035 }, { "epoch": 3.1337503723562703, "grad_norm": 1.459686279296875, "learning_rate": 4.951228248356155e-05, "loss": 0.7178, "num_input_tokens_seen": 12207376, "step": 21040 }, { "epoch": 3.1344950848972295, "grad_norm": 1.6202224493026733, "learning_rate": 4.951164356214065e-05, "loss": 0.7317, "num_input_tokens_seen": 12210256, "step": 21045 }, { "epoch": 3.1352397974381887, "grad_norm": 0.7486812472343445, "learning_rate": 4.951100422662045e-05, "loss": 0.7174, "num_input_tokens_seen": 12213488, "step": 21050 }, { "epoch": 3.135984509979148, "grad_norm": 0.9318271279335022, "learning_rate": 4.951036447701174e-05, "loss": 0.5245, "num_input_tokens_seen": 12216272, "step": 21055 }, { "epoch": 3.136729222520107, "grad_norm": 1.1288414001464844, "learning_rate": 4.950972431332534e-05, "loss": 0.7265, "num_input_tokens_seen": 12219120, "step": 21060 }, { "epoch": 3.1374739350610663, "grad_norm": 0.6626919507980347, "learning_rate": 4.9509083735572055e-05, "loss": 0.6925, "num_input_tokens_seen": 12221840, "step": 21065 }, { "epoch": 3.1382186476020255, "grad_norm": 0.7338756918907166, "learning_rate": 4.950844274376271e-05, "loss": 0.6602, "num_input_tokens_seen": 12225136, "step": 21070 }, { "epoch": 3.1389633601429847, "grad_norm": 1.11806058883667, "learning_rate": 4.950780133790813e-05, "loss": 0.7424, "num_input_tokens_seen": 12227856, "step": 21075 }, { "epoch": 3.139708072683944, "grad_norm": 1.0565721988677979, "learning_rate": 4.950715951801916e-05, "loss": 0.6312, "num_input_tokens_seen": 12230704, "step": 21080 }, { "epoch": 3.140452785224903, "grad_norm": 0.8864277601242065, "learning_rate": 4.950651728410663e-05, "loss": 0.594, "num_input_tokens_seen": 12233872, "step": 21085 }, { "epoch": 3.1411974977658623, "grad_norm": 0.9992954134941101, "learning_rate": 4.9505874636181414e-05, "loss": 0.8198, "num_input_tokens_seen": 12236496, "step": 21090 }, { "epoch": 3.1419422103068215, "grad_norm": 0.9091503024101257, "learning_rate": 4.950523157425434e-05, "loss": 0.7214, "num_input_tokens_seen": 12239216, "step": 21095 }, { "epoch": 3.1426869228477807, "grad_norm": 1.1044950485229492, "learning_rate": 4.950458809833629e-05, "loss": 0.6215, "num_input_tokens_seen": 12242032, "step": 21100 }, { "epoch": 3.14343163538874, "grad_norm": 1.016841173171997, "learning_rate": 4.9503944208438124e-05, "loss": 0.6123, "num_input_tokens_seen": 12245232, "step": 21105 }, { "epoch": 3.144176347929699, "grad_norm": 0.8024969100952148, "learning_rate": 4.950329990457073e-05, "loss": 0.6791, "num_input_tokens_seen": 12247856, "step": 21110 }, { "epoch": 3.1449210604706583, "grad_norm": 1.5742751359939575, "learning_rate": 4.950265518674498e-05, "loss": 0.7256, "num_input_tokens_seen": 12250480, "step": 21115 }, { "epoch": 3.1456657730116175, "grad_norm": 0.7707582712173462, "learning_rate": 4.950201005497179e-05, "loss": 0.7639, "num_input_tokens_seen": 12253232, "step": 21120 }, { "epoch": 3.1464104855525767, "grad_norm": 1.834038257598877, "learning_rate": 4.950136450926203e-05, "loss": 0.8014, "num_input_tokens_seen": 12256240, "step": 21125 }, { "epoch": 3.147155198093536, "grad_norm": 1.3489141464233398, "learning_rate": 4.950071854962662e-05, "loss": 0.6152, "num_input_tokens_seen": 12259344, "step": 21130 }, { "epoch": 3.147899910634495, "grad_norm": 1.2577574253082275, "learning_rate": 4.950007217607647e-05, "loss": 0.7294, "num_input_tokens_seen": 12262192, "step": 21135 }, { "epoch": 3.1486446231754543, "grad_norm": 0.8724827766418457, "learning_rate": 4.949942538862251e-05, "loss": 0.7646, "num_input_tokens_seen": 12265136, "step": 21140 }, { "epoch": 3.1493893357164136, "grad_norm": 1.1331112384796143, "learning_rate": 4.949877818727565e-05, "loss": 0.6031, "num_input_tokens_seen": 12268240, "step": 21145 }, { "epoch": 3.1501340482573728, "grad_norm": 0.814460813999176, "learning_rate": 4.949813057204684e-05, "loss": 0.6747, "num_input_tokens_seen": 12271120, "step": 21150 }, { "epoch": 3.150878760798332, "grad_norm": 0.9844933152198792, "learning_rate": 4.9497482542947004e-05, "loss": 0.5026, "num_input_tokens_seen": 12273776, "step": 21155 }, { "epoch": 3.151623473339291, "grad_norm": 0.8982154726982117, "learning_rate": 4.9496834099987106e-05, "loss": 0.6344, "num_input_tokens_seen": 12277008, "step": 21160 }, { "epoch": 3.1523681858802504, "grad_norm": 1.3736013174057007, "learning_rate": 4.949618524317809e-05, "loss": 0.6084, "num_input_tokens_seen": 12279568, "step": 21165 }, { "epoch": 3.1531128984212096, "grad_norm": 1.4773204326629639, "learning_rate": 4.9495535972530924e-05, "loss": 0.6455, "num_input_tokens_seen": 12282352, "step": 21170 }, { "epoch": 3.1538576109621688, "grad_norm": 0.7577702403068542, "learning_rate": 4.949488628805657e-05, "loss": 0.7729, "num_input_tokens_seen": 12285296, "step": 21175 }, { "epoch": 3.154602323503128, "grad_norm": 1.1338977813720703, "learning_rate": 4.9494236189766005e-05, "loss": 0.8467, "num_input_tokens_seen": 12287984, "step": 21180 }, { "epoch": 3.155347036044087, "grad_norm": 1.0787107944488525, "learning_rate": 4.9493585677670216e-05, "loss": 0.6855, "num_input_tokens_seen": 12290928, "step": 21185 }, { "epoch": 3.1560917485850464, "grad_norm": 1.1142164468765259, "learning_rate": 4.94929347517802e-05, "loss": 0.7586, "num_input_tokens_seen": 12294192, "step": 21190 }, { "epoch": 3.156836461126005, "grad_norm": 1.3436623811721802, "learning_rate": 4.9492283412106934e-05, "loss": 0.6726, "num_input_tokens_seen": 12296976, "step": 21195 }, { "epoch": 3.157581173666965, "grad_norm": 1.0924533605575562, "learning_rate": 4.9491631658661436e-05, "loss": 0.7165, "num_input_tokens_seen": 12299984, "step": 21200 }, { "epoch": 3.1583258862079235, "grad_norm": 1.139985203742981, "learning_rate": 4.9490979491454716e-05, "loss": 0.7076, "num_input_tokens_seen": 12303152, "step": 21205 }, { "epoch": 3.1590705987488827, "grad_norm": 1.0045783519744873, "learning_rate": 4.9490326910497786e-05, "loss": 0.5577, "num_input_tokens_seen": 12306032, "step": 21210 }, { "epoch": 3.159815311289842, "grad_norm": 0.878460705280304, "learning_rate": 4.948967391580167e-05, "loss": 0.5561, "num_input_tokens_seen": 12308656, "step": 21215 }, { "epoch": 3.160560023830801, "grad_norm": 1.4868676662445068, "learning_rate": 4.948902050737741e-05, "loss": 0.7142, "num_input_tokens_seen": 12311600, "step": 21220 }, { "epoch": 3.1613047363717603, "grad_norm": 0.8265056014060974, "learning_rate": 4.948836668523604e-05, "loss": 0.6113, "num_input_tokens_seen": 12314416, "step": 21225 }, { "epoch": 3.1620494489127196, "grad_norm": 0.9833069443702698, "learning_rate": 4.9487712449388604e-05, "loss": 0.6953, "num_input_tokens_seen": 12317520, "step": 21230 }, { "epoch": 3.1627941614536788, "grad_norm": 0.71901935338974, "learning_rate": 4.948705779984614e-05, "loss": 0.5065, "num_input_tokens_seen": 12320496, "step": 21235 }, { "epoch": 3.163538873994638, "grad_norm": 0.7040554285049438, "learning_rate": 4.9486402736619736e-05, "loss": 0.7858, "num_input_tokens_seen": 12323856, "step": 21240 }, { "epoch": 3.164283586535597, "grad_norm": 1.163819432258606, "learning_rate": 4.9485747259720435e-05, "loss": 0.7022, "num_input_tokens_seen": 12326736, "step": 21245 }, { "epoch": 3.1650282990765564, "grad_norm": 0.7419182658195496, "learning_rate": 4.9485091369159334e-05, "loss": 0.7516, "num_input_tokens_seen": 12329456, "step": 21250 }, { "epoch": 3.1657730116175156, "grad_norm": 1.2840691804885864, "learning_rate": 4.948443506494749e-05, "loss": 0.7979, "num_input_tokens_seen": 12332240, "step": 21255 }, { "epoch": 3.1665177241584748, "grad_norm": 1.054543137550354, "learning_rate": 4.9483778347096e-05, "loss": 0.6864, "num_input_tokens_seen": 12335120, "step": 21260 }, { "epoch": 3.167262436699434, "grad_norm": 0.9105631113052368, "learning_rate": 4.948312121561596e-05, "loss": 0.6524, "num_input_tokens_seen": 12338384, "step": 21265 }, { "epoch": 3.168007149240393, "grad_norm": 0.9432842135429382, "learning_rate": 4.9482463670518476e-05, "loss": 0.6281, "num_input_tokens_seen": 12341200, "step": 21270 }, { "epoch": 3.1687518617813524, "grad_norm": 0.9398854374885559, "learning_rate": 4.9481805711814645e-05, "loss": 0.6798, "num_input_tokens_seen": 12344080, "step": 21275 }, { "epoch": 3.1694965743223116, "grad_norm": 1.0683854818344116, "learning_rate": 4.948114733951559e-05, "loss": 0.7164, "num_input_tokens_seen": 12347120, "step": 21280 }, { "epoch": 3.170241286863271, "grad_norm": 0.768267810344696, "learning_rate": 4.948048855363243e-05, "loss": 0.5795, "num_input_tokens_seen": 12350128, "step": 21285 }, { "epoch": 3.17098599940423, "grad_norm": 1.44460928440094, "learning_rate": 4.94798293541763e-05, "loss": 0.707, "num_input_tokens_seen": 12352848, "step": 21290 }, { "epoch": 3.171730711945189, "grad_norm": 0.8192169070243835, "learning_rate": 4.9479169741158336e-05, "loss": 0.6825, "num_input_tokens_seen": 12355920, "step": 21295 }, { "epoch": 3.1724754244861484, "grad_norm": 0.6605413556098938, "learning_rate": 4.947850971458968e-05, "loss": 0.653, "num_input_tokens_seen": 12359120, "step": 21300 }, { "epoch": 3.1732201370271076, "grad_norm": 0.8582649230957031, "learning_rate": 4.947784927448147e-05, "loss": 0.7221, "num_input_tokens_seen": 12361808, "step": 21305 }, { "epoch": 3.173964849568067, "grad_norm": 0.6061381101608276, "learning_rate": 4.9477188420844886e-05, "loss": 0.703, "num_input_tokens_seen": 12364816, "step": 21310 }, { "epoch": 3.174709562109026, "grad_norm": 0.9651428461074829, "learning_rate": 4.947652715369108e-05, "loss": 0.7868, "num_input_tokens_seen": 12367696, "step": 21315 }, { "epoch": 3.175454274649985, "grad_norm": 0.9960357546806335, "learning_rate": 4.947586547303121e-05, "loss": 0.7148, "num_input_tokens_seen": 12370704, "step": 21320 }, { "epoch": 3.1761989871909444, "grad_norm": 0.9006998538970947, "learning_rate": 4.947520337887649e-05, "loss": 0.6541, "num_input_tokens_seen": 12373584, "step": 21325 }, { "epoch": 3.1769436997319036, "grad_norm": 1.0191410779953003, "learning_rate": 4.947454087123807e-05, "loss": 0.6326, "num_input_tokens_seen": 12376912, "step": 21330 }, { "epoch": 3.177688412272863, "grad_norm": 1.2213377952575684, "learning_rate": 4.947387795012716e-05, "loss": 0.6415, "num_input_tokens_seen": 12379920, "step": 21335 }, { "epoch": 3.178433124813822, "grad_norm": 0.874164342880249, "learning_rate": 4.947321461555496e-05, "loss": 0.7371, "num_input_tokens_seen": 12382864, "step": 21340 }, { "epoch": 3.179177837354781, "grad_norm": 0.5887818336486816, "learning_rate": 4.947255086753268e-05, "loss": 0.7383, "num_input_tokens_seen": 12385744, "step": 21345 }, { "epoch": 3.1799225498957404, "grad_norm": 1.1374390125274658, "learning_rate": 4.9471886706071504e-05, "loss": 0.5909, "num_input_tokens_seen": 12388624, "step": 21350 }, { "epoch": 3.1806672624366996, "grad_norm": 0.7211999893188477, "learning_rate": 4.9471222131182685e-05, "loss": 0.6762, "num_input_tokens_seen": 12391408, "step": 21355 }, { "epoch": 3.181411974977659, "grad_norm": 1.0063172578811646, "learning_rate": 4.9470557142877446e-05, "loss": 0.5955, "num_input_tokens_seen": 12394320, "step": 21360 }, { "epoch": 3.182156687518618, "grad_norm": 0.6778229475021362, "learning_rate": 4.946989174116701e-05, "loss": 0.6022, "num_input_tokens_seen": 12397264, "step": 21365 }, { "epoch": 3.182901400059577, "grad_norm": 0.943564772605896, "learning_rate": 4.9469225926062625e-05, "loss": 0.6727, "num_input_tokens_seen": 12399984, "step": 21370 }, { "epoch": 3.1836461126005364, "grad_norm": 0.8328053951263428, "learning_rate": 4.946855969757553e-05, "loss": 0.5748, "num_input_tokens_seen": 12402832, "step": 21375 }, { "epoch": 3.184390825141495, "grad_norm": 0.8624258041381836, "learning_rate": 4.9467893055716996e-05, "loss": 0.653, "num_input_tokens_seen": 12405712, "step": 21380 }, { "epoch": 3.1851355376824544, "grad_norm": 1.2701070308685303, "learning_rate": 4.946722600049827e-05, "loss": 0.7319, "num_input_tokens_seen": 12408752, "step": 21385 }, { "epoch": 3.1858802502234136, "grad_norm": 0.8691337704658508, "learning_rate": 4.946655853193063e-05, "loss": 0.6935, "num_input_tokens_seen": 12411760, "step": 21390 }, { "epoch": 3.186624962764373, "grad_norm": 0.6855093836784363, "learning_rate": 4.946589065002535e-05, "loss": 0.5984, "num_input_tokens_seen": 12414224, "step": 21395 }, { "epoch": 3.187369675305332, "grad_norm": 0.8550962805747986, "learning_rate": 4.946522235479372e-05, "loss": 0.7085, "num_input_tokens_seen": 12417296, "step": 21400 }, { "epoch": 3.188114387846291, "grad_norm": 1.0198333263397217, "learning_rate": 4.946455364624702e-05, "loss": 0.7397, "num_input_tokens_seen": 12420240, "step": 21405 }, { "epoch": 3.1888591003872504, "grad_norm": 0.8700436353683472, "learning_rate": 4.9463884524396555e-05, "loss": 0.6125, "num_input_tokens_seen": 12422928, "step": 21410 }, { "epoch": 3.1896038129282096, "grad_norm": 0.6962943077087402, "learning_rate": 4.946321498925362e-05, "loss": 0.748, "num_input_tokens_seen": 12425936, "step": 21415 }, { "epoch": 3.190348525469169, "grad_norm": 0.7808501720428467, "learning_rate": 4.946254504082952e-05, "loss": 0.5738, "num_input_tokens_seen": 12429040, "step": 21420 }, { "epoch": 3.191093238010128, "grad_norm": 0.9232904314994812, "learning_rate": 4.94618746791356e-05, "loss": 0.7009, "num_input_tokens_seen": 12431952, "step": 21425 }, { "epoch": 3.191837950551087, "grad_norm": 0.8177938461303711, "learning_rate": 4.946120390418316e-05, "loss": 0.5664, "num_input_tokens_seen": 12434800, "step": 21430 }, { "epoch": 3.1925826630920464, "grad_norm": 1.6297516822814941, "learning_rate": 4.946053271598355e-05, "loss": 0.5879, "num_input_tokens_seen": 12437744, "step": 21435 }, { "epoch": 3.1933273756330056, "grad_norm": 1.0430718660354614, "learning_rate": 4.94598611145481e-05, "loss": 0.7314, "num_input_tokens_seen": 12440496, "step": 21440 }, { "epoch": 3.194072088173965, "grad_norm": 0.5548598766326904, "learning_rate": 4.945918909988815e-05, "loss": 0.7354, "num_input_tokens_seen": 12443504, "step": 21445 }, { "epoch": 3.194816800714924, "grad_norm": 0.8199238181114197, "learning_rate": 4.945851667201507e-05, "loss": 0.5068, "num_input_tokens_seen": 12446352, "step": 21450 }, { "epoch": 3.1955615132558832, "grad_norm": 0.9645512700080872, "learning_rate": 4.945784383094019e-05, "loss": 0.6152, "num_input_tokens_seen": 12449392, "step": 21455 }, { "epoch": 3.1963062257968424, "grad_norm": 1.0208930969238281, "learning_rate": 4.9457170576674914e-05, "loss": 0.6217, "num_input_tokens_seen": 12452336, "step": 21460 }, { "epoch": 3.1970509383378016, "grad_norm": 0.977733314037323, "learning_rate": 4.945649690923059e-05, "loss": 0.7659, "num_input_tokens_seen": 12455280, "step": 21465 }, { "epoch": 3.197795650878761, "grad_norm": 1.7527837753295898, "learning_rate": 4.94558228286186e-05, "loss": 0.6602, "num_input_tokens_seen": 12458384, "step": 21470 }, { "epoch": 3.19854036341972, "grad_norm": 0.9181679487228394, "learning_rate": 4.945514833485036e-05, "loss": 0.5942, "num_input_tokens_seen": 12461136, "step": 21475 }, { "epoch": 3.1992850759606792, "grad_norm": 1.0390052795410156, "learning_rate": 4.9454473427937225e-05, "loss": 0.594, "num_input_tokens_seen": 12463856, "step": 21480 }, { "epoch": 3.2000297885016384, "grad_norm": 0.9160130620002747, "learning_rate": 4.9453798107890624e-05, "loss": 0.6242, "num_input_tokens_seen": 12466704, "step": 21485 }, { "epoch": 3.2007745010425976, "grad_norm": 0.6140778660774231, "learning_rate": 4.945312237472196e-05, "loss": 0.6177, "num_input_tokens_seen": 12469744, "step": 21490 }, { "epoch": 3.201519213583557, "grad_norm": 0.752117395401001, "learning_rate": 4.945244622844264e-05, "loss": 0.6359, "num_input_tokens_seen": 12472432, "step": 21495 }, { "epoch": 3.202263926124516, "grad_norm": 0.9686890840530396, "learning_rate": 4.9451769669064096e-05, "loss": 0.666, "num_input_tokens_seen": 12475312, "step": 21500 }, { "epoch": 3.2030086386654752, "grad_norm": 1.7189254760742188, "learning_rate": 4.945109269659776e-05, "loss": 0.7141, "num_input_tokens_seen": 12478448, "step": 21505 }, { "epoch": 3.2037533512064345, "grad_norm": 1.3852283954620361, "learning_rate": 4.945041531105505e-05, "loss": 0.677, "num_input_tokens_seen": 12481520, "step": 21510 }, { "epoch": 3.2044980637473937, "grad_norm": 1.1404935121536255, "learning_rate": 4.9449737512447435e-05, "loss": 0.8123, "num_input_tokens_seen": 12484656, "step": 21515 }, { "epoch": 3.205242776288353, "grad_norm": 0.8485738635063171, "learning_rate": 4.9449059300786355e-05, "loss": 0.8786, "num_input_tokens_seen": 12487376, "step": 21520 }, { "epoch": 3.205987488829312, "grad_norm": 1.482823371887207, "learning_rate": 4.944838067608326e-05, "loss": 0.615, "num_input_tokens_seen": 12490480, "step": 21525 }, { "epoch": 3.2067322013702713, "grad_norm": 1.2984919548034668, "learning_rate": 4.944770163834963e-05, "loss": 0.7124, "num_input_tokens_seen": 12493360, "step": 21530 }, { "epoch": 3.2074769139112305, "grad_norm": 0.7577803134918213, "learning_rate": 4.944702218759692e-05, "loss": 0.6549, "num_input_tokens_seen": 12496272, "step": 21535 }, { "epoch": 3.2082216264521897, "grad_norm": 1.111817717552185, "learning_rate": 4.944634232383662e-05, "loss": 0.6811, "num_input_tokens_seen": 12499152, "step": 21540 }, { "epoch": 3.2089663389931484, "grad_norm": 1.592690348625183, "learning_rate": 4.944566204708022e-05, "loss": 0.7629, "num_input_tokens_seen": 12502320, "step": 21545 }, { "epoch": 3.2097110515341076, "grad_norm": 1.3216195106506348, "learning_rate": 4.94449813573392e-05, "loss": 0.524, "num_input_tokens_seen": 12505008, "step": 21550 }, { "epoch": 3.210455764075067, "grad_norm": 0.5107622742652893, "learning_rate": 4.944430025462507e-05, "loss": 0.6525, "num_input_tokens_seen": 12508016, "step": 21555 }, { "epoch": 3.211200476616026, "grad_norm": 0.6287720799446106, "learning_rate": 4.944361873894932e-05, "loss": 0.6798, "num_input_tokens_seen": 12510896, "step": 21560 }, { "epoch": 3.2119451891569852, "grad_norm": 0.8760231733322144, "learning_rate": 4.944293681032348e-05, "loss": 0.5606, "num_input_tokens_seen": 12513872, "step": 21565 }, { "epoch": 3.2126899016979444, "grad_norm": 0.7281593680381775, "learning_rate": 4.9442254468759065e-05, "loss": 0.6098, "num_input_tokens_seen": 12516816, "step": 21570 }, { "epoch": 3.2134346142389036, "grad_norm": 1.1103490591049194, "learning_rate": 4.94415717142676e-05, "loss": 0.6957, "num_input_tokens_seen": 12519632, "step": 21575 }, { "epoch": 3.214179326779863, "grad_norm": 0.9908931255340576, "learning_rate": 4.944088854686062e-05, "loss": 0.7002, "num_input_tokens_seen": 12522352, "step": 21580 }, { "epoch": 3.214924039320822, "grad_norm": 0.8161936402320862, "learning_rate": 4.944020496654968e-05, "loss": 0.5741, "num_input_tokens_seen": 12525104, "step": 21585 }, { "epoch": 3.2156687518617812, "grad_norm": 0.7152025699615479, "learning_rate": 4.943952097334631e-05, "loss": 0.7569, "num_input_tokens_seen": 12527920, "step": 21590 }, { "epoch": 3.2164134644027405, "grad_norm": 0.8795884847640991, "learning_rate": 4.943883656726207e-05, "loss": 0.6186, "num_input_tokens_seen": 12530800, "step": 21595 }, { "epoch": 3.2171581769436997, "grad_norm": 0.836104154586792, "learning_rate": 4.943815174830853e-05, "loss": 0.7229, "num_input_tokens_seen": 12533936, "step": 21600 }, { "epoch": 3.217902889484659, "grad_norm": 1.0965068340301514, "learning_rate": 4.9437466516497255e-05, "loss": 0.761, "num_input_tokens_seen": 12536656, "step": 21605 }, { "epoch": 3.218647602025618, "grad_norm": 0.5705932378768921, "learning_rate": 4.943678087183982e-05, "loss": 0.626, "num_input_tokens_seen": 12539600, "step": 21610 }, { "epoch": 3.2193923145665773, "grad_norm": 1.2111773490905762, "learning_rate": 4.94360948143478e-05, "loss": 0.6382, "num_input_tokens_seen": 12542352, "step": 21615 }, { "epoch": 3.2201370271075365, "grad_norm": 0.8580120205879211, "learning_rate": 4.94354083440328e-05, "loss": 0.5515, "num_input_tokens_seen": 12545296, "step": 21620 }, { "epoch": 3.2208817396484957, "grad_norm": 1.2195864915847778, "learning_rate": 4.9434721460906406e-05, "loss": 0.7333, "num_input_tokens_seen": 12548240, "step": 21625 }, { "epoch": 3.221626452189455, "grad_norm": 0.8575410842895508, "learning_rate": 4.9434034164980233e-05, "loss": 0.6611, "num_input_tokens_seen": 12551152, "step": 21630 }, { "epoch": 3.222371164730414, "grad_norm": 1.0791348218917847, "learning_rate": 4.94333464562659e-05, "loss": 0.6532, "num_input_tokens_seen": 12553840, "step": 21635 }, { "epoch": 3.2231158772713733, "grad_norm": 1.6415756940841675, "learning_rate": 4.9432658334774984e-05, "loss": 0.715, "num_input_tokens_seen": 12556976, "step": 21640 }, { "epoch": 3.2238605898123325, "grad_norm": 0.8030216097831726, "learning_rate": 4.943196980051915e-05, "loss": 0.6907, "num_input_tokens_seen": 12559664, "step": 21645 }, { "epoch": 3.2246053023532917, "grad_norm": 1.2451742887496948, "learning_rate": 4.943128085351002e-05, "loss": 0.7611, "num_input_tokens_seen": 12562704, "step": 21650 }, { "epoch": 3.225350014894251, "grad_norm": 1.684294581413269, "learning_rate": 4.943059149375923e-05, "loss": 0.6858, "num_input_tokens_seen": 12565392, "step": 21655 }, { "epoch": 3.22609472743521, "grad_norm": 0.7746343016624451, "learning_rate": 4.9429901721278426e-05, "loss": 0.6723, "num_input_tokens_seen": 12568208, "step": 21660 }, { "epoch": 3.2268394399761693, "grad_norm": 0.973403811454773, "learning_rate": 4.9429211536079266e-05, "loss": 0.6717, "num_input_tokens_seen": 12571088, "step": 21665 }, { "epoch": 3.2275841525171285, "grad_norm": 0.8627339005470276, "learning_rate": 4.94285209381734e-05, "loss": 0.6587, "num_input_tokens_seen": 12573552, "step": 21670 }, { "epoch": 3.2283288650580877, "grad_norm": 0.9130327105522156, "learning_rate": 4.94278299275725e-05, "loss": 0.5291, "num_input_tokens_seen": 12576752, "step": 21675 }, { "epoch": 3.229073577599047, "grad_norm": 0.915676474571228, "learning_rate": 4.9427138504288245e-05, "loss": 0.534, "num_input_tokens_seen": 12579536, "step": 21680 }, { "epoch": 3.229818290140006, "grad_norm": 0.8716485500335693, "learning_rate": 4.942644666833231e-05, "loss": 0.5401, "num_input_tokens_seen": 12582384, "step": 21685 }, { "epoch": 3.2305630026809653, "grad_norm": 1.1614232063293457, "learning_rate": 4.9425754419716383e-05, "loss": 0.6979, "num_input_tokens_seen": 12585616, "step": 21690 }, { "epoch": 3.2313077152219245, "grad_norm": 0.7489132285118103, "learning_rate": 4.942506175845216e-05, "loss": 0.6427, "num_input_tokens_seen": 12588304, "step": 21695 }, { "epoch": 3.2320524277628837, "grad_norm": 0.8214944005012512, "learning_rate": 4.9424368684551347e-05, "loss": 0.596, "num_input_tokens_seen": 12591120, "step": 21700 }, { "epoch": 3.232797140303843, "grad_norm": 0.7231096029281616, "learning_rate": 4.942367519802565e-05, "loss": 0.6351, "num_input_tokens_seen": 12593648, "step": 21705 }, { "epoch": 3.233541852844802, "grad_norm": 0.7285727858543396, "learning_rate": 4.9422981298886776e-05, "loss": 0.6537, "num_input_tokens_seen": 12596496, "step": 21710 }, { "epoch": 3.2342865653857613, "grad_norm": 1.6143229007720947, "learning_rate": 4.942228698714646e-05, "loss": 0.6421, "num_input_tokens_seen": 12599248, "step": 21715 }, { "epoch": 3.23503127792672, "grad_norm": 2.1276497840881348, "learning_rate": 4.942159226281643e-05, "loss": 0.7647, "num_input_tokens_seen": 12601904, "step": 21720 }, { "epoch": 3.2357759904676793, "grad_norm": 1.218921422958374, "learning_rate": 4.942089712590842e-05, "loss": 0.7762, "num_input_tokens_seen": 12604880, "step": 21725 }, { "epoch": 3.2365207030086385, "grad_norm": 1.0122599601745605, "learning_rate": 4.9420201576434165e-05, "loss": 0.6211, "num_input_tokens_seen": 12607824, "step": 21730 }, { "epoch": 3.2372654155495977, "grad_norm": 1.323164701461792, "learning_rate": 4.941950561440543e-05, "loss": 0.6615, "num_input_tokens_seen": 12610800, "step": 21735 }, { "epoch": 3.238010128090557, "grad_norm": 0.7523544430732727, "learning_rate": 4.9418809239833964e-05, "loss": 0.673, "num_input_tokens_seen": 12613616, "step": 21740 }, { "epoch": 3.238754840631516, "grad_norm": 0.9791186451911926, "learning_rate": 4.9418112452731534e-05, "loss": 0.5897, "num_input_tokens_seen": 12616880, "step": 21745 }, { "epoch": 3.2394995531724753, "grad_norm": 0.8537806272506714, "learning_rate": 4.941741525310991e-05, "loss": 0.7952, "num_input_tokens_seen": 12619792, "step": 21750 }, { "epoch": 3.2402442657134345, "grad_norm": 1.1201260089874268, "learning_rate": 4.9416717640980884e-05, "loss": 0.6097, "num_input_tokens_seen": 12622576, "step": 21755 }, { "epoch": 3.2409889782543937, "grad_norm": 0.7442435622215271, "learning_rate": 4.941601961635621e-05, "loss": 0.6258, "num_input_tokens_seen": 12625456, "step": 21760 }, { "epoch": 3.241733690795353, "grad_norm": 1.269144892692566, "learning_rate": 4.941532117924772e-05, "loss": 0.6692, "num_input_tokens_seen": 12628400, "step": 21765 }, { "epoch": 3.242478403336312, "grad_norm": 0.9490901231765747, "learning_rate": 4.941462232966718e-05, "loss": 0.7821, "num_input_tokens_seen": 12631248, "step": 21770 }, { "epoch": 3.2432231158772713, "grad_norm": 0.8960041999816895, "learning_rate": 4.9413923067626413e-05, "loss": 0.6583, "num_input_tokens_seen": 12634192, "step": 21775 }, { "epoch": 3.2439678284182305, "grad_norm": 1.7358137369155884, "learning_rate": 4.941322339313723e-05, "loss": 0.6502, "num_input_tokens_seen": 12636880, "step": 21780 }, { "epoch": 3.2447125409591897, "grad_norm": 1.1369962692260742, "learning_rate": 4.941252330621145e-05, "loss": 0.6051, "num_input_tokens_seen": 12640016, "step": 21785 }, { "epoch": 3.245457253500149, "grad_norm": 0.8449194431304932, "learning_rate": 4.94118228068609e-05, "loss": 0.677, "num_input_tokens_seen": 12643120, "step": 21790 }, { "epoch": 3.246201966041108, "grad_norm": 1.0180302858352661, "learning_rate": 4.9411121895097414e-05, "loss": 0.5484, "num_input_tokens_seen": 12646000, "step": 21795 }, { "epoch": 3.2469466785820673, "grad_norm": 1.8905175924301147, "learning_rate": 4.941042057093284e-05, "loss": 0.7162, "num_input_tokens_seen": 12648784, "step": 21800 }, { "epoch": 3.2476913911230265, "grad_norm": 0.8586224317550659, "learning_rate": 4.940971883437901e-05, "loss": 0.725, "num_input_tokens_seen": 12651632, "step": 21805 }, { "epoch": 3.2484361036639857, "grad_norm": 1.2299206256866455, "learning_rate": 4.94090166854478e-05, "loss": 0.7088, "num_input_tokens_seen": 12654256, "step": 21810 }, { "epoch": 3.249180816204945, "grad_norm": 1.3623181581497192, "learning_rate": 4.940831412415105e-05, "loss": 0.6808, "num_input_tokens_seen": 12657104, "step": 21815 }, { "epoch": 3.249925528745904, "grad_norm": 0.5652124881744385, "learning_rate": 4.9407611150500646e-05, "loss": 0.5681, "num_input_tokens_seen": 12659728, "step": 21820 }, { "epoch": 3.2506702412868633, "grad_norm": 1.41843581199646, "learning_rate": 4.940690776450846e-05, "loss": 0.693, "num_input_tokens_seen": 12662768, "step": 21825 }, { "epoch": 3.2514149538278225, "grad_norm": 0.8704192042350769, "learning_rate": 4.940620396618637e-05, "loss": 0.7097, "num_input_tokens_seen": 12665776, "step": 21830 }, { "epoch": 3.2521596663687817, "grad_norm": 1.751592755317688, "learning_rate": 4.940549975554627e-05, "loss": 0.7269, "num_input_tokens_seen": 12668592, "step": 21835 }, { "epoch": 3.252904378909741, "grad_norm": 1.1247718334197998, "learning_rate": 4.940479513260006e-05, "loss": 0.6864, "num_input_tokens_seen": 12671664, "step": 21840 }, { "epoch": 3.2536490914507, "grad_norm": 0.907329261302948, "learning_rate": 4.940409009735964e-05, "loss": 0.7057, "num_input_tokens_seen": 12674512, "step": 21845 }, { "epoch": 3.2543938039916593, "grad_norm": 0.6941994428634644, "learning_rate": 4.940338464983691e-05, "loss": 0.6339, "num_input_tokens_seen": 12677520, "step": 21850 }, { "epoch": 3.2551385165326185, "grad_norm": 0.7655116319656372, "learning_rate": 4.940267879004381e-05, "loss": 0.6694, "num_input_tokens_seen": 12680304, "step": 21855 }, { "epoch": 3.2558832290735777, "grad_norm": 1.1684750318527222, "learning_rate": 4.9401972517992254e-05, "loss": 0.6148, "num_input_tokens_seen": 12683280, "step": 21860 }, { "epoch": 3.256627941614537, "grad_norm": 0.7452406883239746, "learning_rate": 4.9401265833694166e-05, "loss": 0.7977, "num_input_tokens_seen": 12686352, "step": 21865 }, { "epoch": 3.257372654155496, "grad_norm": 0.7545926570892334, "learning_rate": 4.940055873716149e-05, "loss": 0.7298, "num_input_tokens_seen": 12689424, "step": 21870 }, { "epoch": 3.2581173666964554, "grad_norm": 0.8004284501075745, "learning_rate": 4.939985122840619e-05, "loss": 0.8021, "num_input_tokens_seen": 12692304, "step": 21875 }, { "epoch": 3.2588620792374146, "grad_norm": 0.6870838403701782, "learning_rate": 4.939914330744019e-05, "loss": 0.7175, "num_input_tokens_seen": 12695184, "step": 21880 }, { "epoch": 3.2596067917783733, "grad_norm": 1.3235695362091064, "learning_rate": 4.939843497427547e-05, "loss": 0.6867, "num_input_tokens_seen": 12697808, "step": 21885 }, { "epoch": 3.260351504319333, "grad_norm": 0.7059760093688965, "learning_rate": 4.939772622892398e-05, "loss": 0.705, "num_input_tokens_seen": 12700720, "step": 21890 }, { "epoch": 3.2610962168602917, "grad_norm": 0.738317608833313, "learning_rate": 4.93970170713977e-05, "loss": 0.7094, "num_input_tokens_seen": 12703472, "step": 21895 }, { "epoch": 3.2618409294012514, "grad_norm": 1.3260459899902344, "learning_rate": 4.9396307501708625e-05, "loss": 0.764, "num_input_tokens_seen": 12707152, "step": 21900 }, { "epoch": 3.26258564194221, "grad_norm": 1.9450379610061646, "learning_rate": 4.939559751986872e-05, "loss": 0.6488, "num_input_tokens_seen": 12710192, "step": 21905 }, { "epoch": 3.2633303544831693, "grad_norm": 1.4539836645126343, "learning_rate": 4.939488712588999e-05, "loss": 0.7954, "num_input_tokens_seen": 12713264, "step": 21910 }, { "epoch": 3.2640750670241285, "grad_norm": 1.1267868280410767, "learning_rate": 4.939417631978444e-05, "loss": 0.7809, "num_input_tokens_seen": 12716112, "step": 21915 }, { "epoch": 3.2648197795650877, "grad_norm": 1.7278023958206177, "learning_rate": 4.939346510156407e-05, "loss": 0.672, "num_input_tokens_seen": 12718736, "step": 21920 }, { "epoch": 3.265564492106047, "grad_norm": 0.7639793753623962, "learning_rate": 4.93927534712409e-05, "loss": 0.6516, "num_input_tokens_seen": 12721680, "step": 21925 }, { "epoch": 3.266309204647006, "grad_norm": 0.7032055854797363, "learning_rate": 4.939204142882696e-05, "loss": 0.713, "num_input_tokens_seen": 12724496, "step": 21930 }, { "epoch": 3.2670539171879653, "grad_norm": 0.939254105091095, "learning_rate": 4.939132897433426e-05, "loss": 0.6781, "num_input_tokens_seen": 12727440, "step": 21935 }, { "epoch": 3.2677986297289245, "grad_norm": 1.4950993061065674, "learning_rate": 4.939061610777486e-05, "loss": 0.7413, "num_input_tokens_seen": 12730320, "step": 21940 }, { "epoch": 3.2685433422698837, "grad_norm": 1.287174105644226, "learning_rate": 4.938990282916078e-05, "loss": 0.6549, "num_input_tokens_seen": 12733264, "step": 21945 }, { "epoch": 3.269288054810843, "grad_norm": 0.786065399646759, "learning_rate": 4.938918913850408e-05, "loss": 0.7006, "num_input_tokens_seen": 12736144, "step": 21950 }, { "epoch": 3.270032767351802, "grad_norm": 1.3499566316604614, "learning_rate": 4.938847503581682e-05, "loss": 0.8109, "num_input_tokens_seen": 12739184, "step": 21955 }, { "epoch": 3.2707774798927614, "grad_norm": 0.7839632630348206, "learning_rate": 4.938776052111106e-05, "loss": 0.6728, "num_input_tokens_seen": 12742224, "step": 21960 }, { "epoch": 3.2715221924337206, "grad_norm": 0.7127974033355713, "learning_rate": 4.9387045594398875e-05, "loss": 0.7626, "num_input_tokens_seen": 12745232, "step": 21965 }, { "epoch": 3.2722669049746798, "grad_norm": 0.7879330515861511, "learning_rate": 4.9386330255692346e-05, "loss": 0.6549, "num_input_tokens_seen": 12747984, "step": 21970 }, { "epoch": 3.273011617515639, "grad_norm": 0.962639331817627, "learning_rate": 4.938561450500354e-05, "loss": 0.5861, "num_input_tokens_seen": 12751184, "step": 21975 }, { "epoch": 3.273756330056598, "grad_norm": 0.7285923361778259, "learning_rate": 4.938489834234457e-05, "loss": 0.591, "num_input_tokens_seen": 12753744, "step": 21980 }, { "epoch": 3.2745010425975574, "grad_norm": 0.5416396856307983, "learning_rate": 4.9384181767727524e-05, "loss": 0.5828, "num_input_tokens_seen": 12756848, "step": 21985 }, { "epoch": 3.2752457551385166, "grad_norm": 2.0628280639648438, "learning_rate": 4.9383464781164515e-05, "loss": 0.6897, "num_input_tokens_seen": 12759728, "step": 21990 }, { "epoch": 3.2759904676794758, "grad_norm": 0.6425840258598328, "learning_rate": 4.938274738266764e-05, "loss": 0.6259, "num_input_tokens_seen": 12762576, "step": 21995 }, { "epoch": 3.276735180220435, "grad_norm": 0.6927850842475891, "learning_rate": 4.938202957224903e-05, "loss": 0.5925, "num_input_tokens_seen": 12765520, "step": 22000 }, { "epoch": 3.277479892761394, "grad_norm": 0.9059906601905823, "learning_rate": 4.938131134992082e-05, "loss": 0.6207, "num_input_tokens_seen": 12768336, "step": 22005 }, { "epoch": 3.2782246053023534, "grad_norm": 1.0520962476730347, "learning_rate": 4.938059271569513e-05, "loss": 0.7131, "num_input_tokens_seen": 12771184, "step": 22010 }, { "epoch": 3.2789693178433126, "grad_norm": 0.7658782601356506, "learning_rate": 4.937987366958411e-05, "loss": 0.5754, "num_input_tokens_seen": 12774320, "step": 22015 }, { "epoch": 3.279714030384272, "grad_norm": 0.8037909269332886, "learning_rate": 4.93791542115999e-05, "loss": 0.7309, "num_input_tokens_seen": 12777008, "step": 22020 }, { "epoch": 3.280458742925231, "grad_norm": 0.7955263257026672, "learning_rate": 4.937843434175466e-05, "loss": 0.5709, "num_input_tokens_seen": 12780240, "step": 22025 }, { "epoch": 3.28120345546619, "grad_norm": 0.7366654276847839, "learning_rate": 4.937771406006054e-05, "loss": 0.6388, "num_input_tokens_seen": 12783056, "step": 22030 }, { "epoch": 3.2819481680071494, "grad_norm": 1.0432031154632568, "learning_rate": 4.937699336652973e-05, "loss": 0.7211, "num_input_tokens_seen": 12785712, "step": 22035 }, { "epoch": 3.2826928805481086, "grad_norm": 1.0108262300491333, "learning_rate": 4.937627226117438e-05, "loss": 0.6825, "num_input_tokens_seen": 12788496, "step": 22040 }, { "epoch": 3.283437593089068, "grad_norm": 1.078060507774353, "learning_rate": 4.9375550744006695e-05, "loss": 0.5928, "num_input_tokens_seen": 12791088, "step": 22045 }, { "epoch": 3.284182305630027, "grad_norm": 0.7649821043014526, "learning_rate": 4.9374828815038856e-05, "loss": 0.5682, "num_input_tokens_seen": 12794064, "step": 22050 }, { "epoch": 3.284927018170986, "grad_norm": 3.0462470054626465, "learning_rate": 4.937410647428304e-05, "loss": 0.7942, "num_input_tokens_seen": 12796848, "step": 22055 }, { "epoch": 3.285671730711945, "grad_norm": 1.1706695556640625, "learning_rate": 4.9373383721751486e-05, "loss": 0.7505, "num_input_tokens_seen": 12799824, "step": 22060 }, { "epoch": 3.2864164432529046, "grad_norm": 1.2282005548477173, "learning_rate": 4.9372660557456384e-05, "loss": 0.7168, "num_input_tokens_seen": 12802928, "step": 22065 }, { "epoch": 3.2871611557938634, "grad_norm": 1.9013917446136475, "learning_rate": 4.937193698140995e-05, "loss": 0.6035, "num_input_tokens_seen": 12805456, "step": 22070 }, { "epoch": 3.2879058683348226, "grad_norm": 0.9182218313217163, "learning_rate": 4.9371212993624405e-05, "loss": 0.7527, "num_input_tokens_seen": 12808112, "step": 22075 }, { "epoch": 3.2886505808757818, "grad_norm": 0.7694931626319885, "learning_rate": 4.9370488594112e-05, "loss": 0.6443, "num_input_tokens_seen": 12811088, "step": 22080 }, { "epoch": 3.289395293416741, "grad_norm": 1.2398368120193481, "learning_rate": 4.936976378288495e-05, "loss": 0.7056, "num_input_tokens_seen": 12813872, "step": 22085 }, { "epoch": 3.2901400059577, "grad_norm": 1.106133222579956, "learning_rate": 4.93690385599555e-05, "loss": 0.7226, "num_input_tokens_seen": 12816688, "step": 22090 }, { "epoch": 3.2908847184986594, "grad_norm": 0.959875226020813, "learning_rate": 4.9368312925335925e-05, "loss": 0.7617, "num_input_tokens_seen": 12819760, "step": 22095 }, { "epoch": 3.2916294310396186, "grad_norm": 1.2112839221954346, "learning_rate": 4.9367586879038466e-05, "loss": 0.7224, "num_input_tokens_seen": 12822736, "step": 22100 }, { "epoch": 3.292374143580578, "grad_norm": 1.0624728202819824, "learning_rate": 4.93668604210754e-05, "loss": 0.6237, "num_input_tokens_seen": 12825360, "step": 22105 }, { "epoch": 3.293118856121537, "grad_norm": 1.8846067190170288, "learning_rate": 4.936613355145898e-05, "loss": 0.7298, "num_input_tokens_seen": 12828400, "step": 22110 }, { "epoch": 3.293863568662496, "grad_norm": 0.9454411864280701, "learning_rate": 4.936540627020151e-05, "loss": 0.7606, "num_input_tokens_seen": 12831312, "step": 22115 }, { "epoch": 3.2946082812034554, "grad_norm": 0.9262951016426086, "learning_rate": 4.936467857731526e-05, "loss": 0.686, "num_input_tokens_seen": 12834128, "step": 22120 }, { "epoch": 3.2953529937444146, "grad_norm": 1.3383797407150269, "learning_rate": 4.9363950472812524e-05, "loss": 0.6888, "num_input_tokens_seen": 12837360, "step": 22125 }, { "epoch": 3.296097706285374, "grad_norm": 1.048649549484253, "learning_rate": 4.936322195670561e-05, "loss": 0.7279, "num_input_tokens_seen": 12840496, "step": 22130 }, { "epoch": 3.296842418826333, "grad_norm": 1.0989018678665161, "learning_rate": 4.936249302900682e-05, "loss": 0.7543, "num_input_tokens_seen": 12843472, "step": 22135 }, { "epoch": 3.297587131367292, "grad_norm": 0.8586545586585999, "learning_rate": 4.936176368972848e-05, "loss": 0.7005, "num_input_tokens_seen": 12846576, "step": 22140 }, { "epoch": 3.2983318439082514, "grad_norm": 0.7836444973945618, "learning_rate": 4.93610339388829e-05, "loss": 0.8017, "num_input_tokens_seen": 12849264, "step": 22145 }, { "epoch": 3.2990765564492106, "grad_norm": 0.8910703659057617, "learning_rate": 4.936030377648241e-05, "loss": 0.68, "num_input_tokens_seen": 12852176, "step": 22150 }, { "epoch": 3.29982126899017, "grad_norm": 0.9692017436027527, "learning_rate": 4.935957320253934e-05, "loss": 0.6214, "num_input_tokens_seen": 12854992, "step": 22155 }, { "epoch": 3.300565981531129, "grad_norm": 1.5261391401290894, "learning_rate": 4.9358842217066044e-05, "loss": 0.6147, "num_input_tokens_seen": 12857872, "step": 22160 }, { "epoch": 3.301310694072088, "grad_norm": 0.8927021026611328, "learning_rate": 4.935811082007487e-05, "loss": 0.6282, "num_input_tokens_seen": 12860752, "step": 22165 }, { "epoch": 3.3020554066130474, "grad_norm": 0.7838546633720398, "learning_rate": 4.935737901157816e-05, "loss": 0.6389, "num_input_tokens_seen": 12863440, "step": 22170 }, { "epoch": 3.3028001191540066, "grad_norm": 1.588353157043457, "learning_rate": 4.935664679158829e-05, "loss": 0.7284, "num_input_tokens_seen": 12866224, "step": 22175 }, { "epoch": 3.303544831694966, "grad_norm": 1.2108731269836426, "learning_rate": 4.935591416011763e-05, "loss": 0.6437, "num_input_tokens_seen": 12869040, "step": 22180 }, { "epoch": 3.304289544235925, "grad_norm": 0.7888093590736389, "learning_rate": 4.9355181117178564e-05, "loss": 0.6083, "num_input_tokens_seen": 12871888, "step": 22185 }, { "epoch": 3.3050342567768842, "grad_norm": 0.7545643448829651, "learning_rate": 4.935444766278345e-05, "loss": 0.6491, "num_input_tokens_seen": 12874576, "step": 22190 }, { "epoch": 3.3057789693178434, "grad_norm": 0.600063681602478, "learning_rate": 4.93537137969447e-05, "loss": 0.666, "num_input_tokens_seen": 12877456, "step": 22195 }, { "epoch": 3.3065236818588026, "grad_norm": 0.7930237650871277, "learning_rate": 4.935297951967471e-05, "loss": 0.723, "num_input_tokens_seen": 12880592, "step": 22200 }, { "epoch": 3.307268394399762, "grad_norm": 0.9464765787124634, "learning_rate": 4.9352244830985886e-05, "loss": 0.5373, "num_input_tokens_seen": 12883472, "step": 22205 }, { "epoch": 3.308013106940721, "grad_norm": 0.7356382608413696, "learning_rate": 4.935150973089063e-05, "loss": 0.7952, "num_input_tokens_seen": 12886032, "step": 22210 }, { "epoch": 3.3087578194816802, "grad_norm": 0.9226855039596558, "learning_rate": 4.935077421940137e-05, "loss": 0.6068, "num_input_tokens_seen": 12889360, "step": 22215 }, { "epoch": 3.3095025320226394, "grad_norm": 1.241784691810608, "learning_rate": 4.935003829653053e-05, "loss": 0.6719, "num_input_tokens_seen": 12892048, "step": 22220 }, { "epoch": 3.310247244563598, "grad_norm": 1.3872463703155518, "learning_rate": 4.934930196229054e-05, "loss": 0.6606, "num_input_tokens_seen": 12894896, "step": 22225 }, { "epoch": 3.310991957104558, "grad_norm": 1.0289336442947388, "learning_rate": 4.9348565216693845e-05, "loss": 0.6142, "num_input_tokens_seen": 12897648, "step": 22230 }, { "epoch": 3.3117366696455166, "grad_norm": 1.0789363384246826, "learning_rate": 4.9347828059752874e-05, "loss": 0.6647, "num_input_tokens_seen": 12900272, "step": 22235 }, { "epoch": 3.3124813821864763, "grad_norm": 0.9780856966972351, "learning_rate": 4.934709049148011e-05, "loss": 0.7188, "num_input_tokens_seen": 12903344, "step": 22240 }, { "epoch": 3.313226094727435, "grad_norm": 0.8488086462020874, "learning_rate": 4.934635251188799e-05, "loss": 0.6432, "num_input_tokens_seen": 12905904, "step": 22245 }, { "epoch": 3.313970807268394, "grad_norm": 0.8170868158340454, "learning_rate": 4.934561412098899e-05, "loss": 0.7194, "num_input_tokens_seen": 12908752, "step": 22250 }, { "epoch": 3.3147155198093534, "grad_norm": 0.8261061310768127, "learning_rate": 4.934487531879558e-05, "loss": 0.6986, "num_input_tokens_seen": 12911664, "step": 22255 }, { "epoch": 3.3154602323503126, "grad_norm": 0.9550371170043945, "learning_rate": 4.934413610532025e-05, "loss": 0.7612, "num_input_tokens_seen": 12914512, "step": 22260 }, { "epoch": 3.316204944891272, "grad_norm": 0.8708732724189758, "learning_rate": 4.9343396480575474e-05, "loss": 0.6872, "num_input_tokens_seen": 12917232, "step": 22265 }, { "epoch": 3.316949657432231, "grad_norm": 1.2539385557174683, "learning_rate": 4.9342656444573764e-05, "loss": 0.7384, "num_input_tokens_seen": 12920016, "step": 22270 }, { "epoch": 3.3176943699731902, "grad_norm": 0.8119682669639587, "learning_rate": 4.934191599732762e-05, "loss": 0.7475, "num_input_tokens_seen": 12923024, "step": 22275 }, { "epoch": 3.3184390825141494, "grad_norm": 0.751532793045044, "learning_rate": 4.934117513884953e-05, "loss": 0.6124, "num_input_tokens_seen": 12925872, "step": 22280 }, { "epoch": 3.3191837950551086, "grad_norm": 0.8448248505592346, "learning_rate": 4.934043386915203e-05, "loss": 0.5356, "num_input_tokens_seen": 12928720, "step": 22285 }, { "epoch": 3.319928507596068, "grad_norm": 1.2821427583694458, "learning_rate": 4.933969218824764e-05, "loss": 0.627, "num_input_tokens_seen": 12931600, "step": 22290 }, { "epoch": 3.320673220137027, "grad_norm": 1.4622122049331665, "learning_rate": 4.933895009614889e-05, "loss": 0.5735, "num_input_tokens_seen": 12934224, "step": 22295 }, { "epoch": 3.3214179326779862, "grad_norm": 0.9311002492904663, "learning_rate": 4.933820759286831e-05, "loss": 0.6302, "num_input_tokens_seen": 12937296, "step": 22300 }, { "epoch": 3.3221626452189454, "grad_norm": 0.950517475605011, "learning_rate": 4.933746467841846e-05, "loss": 0.6993, "num_input_tokens_seen": 12940112, "step": 22305 }, { "epoch": 3.3229073577599046, "grad_norm": 0.6548509001731873, "learning_rate": 4.9336721352811864e-05, "loss": 0.7859, "num_input_tokens_seen": 12943248, "step": 22310 }, { "epoch": 3.323652070300864, "grad_norm": 1.202799677848816, "learning_rate": 4.933597761606111e-05, "loss": 0.6156, "num_input_tokens_seen": 12945968, "step": 22315 }, { "epoch": 3.324396782841823, "grad_norm": 1.088860034942627, "learning_rate": 4.9335233468178744e-05, "loss": 0.6311, "num_input_tokens_seen": 12949040, "step": 22320 }, { "epoch": 3.3251414953827823, "grad_norm": 1.2402228116989136, "learning_rate": 4.9334488909177336e-05, "loss": 0.785, "num_input_tokens_seen": 12951504, "step": 22325 }, { "epoch": 3.3258862079237415, "grad_norm": 1.382397174835205, "learning_rate": 4.9333743939069476e-05, "loss": 0.7979, "num_input_tokens_seen": 12954608, "step": 22330 }, { "epoch": 3.3266309204647007, "grad_norm": 1.038985252380371, "learning_rate": 4.9332998557867735e-05, "loss": 0.7605, "num_input_tokens_seen": 12957232, "step": 22335 }, { "epoch": 3.32737563300566, "grad_norm": 0.9785682559013367, "learning_rate": 4.933225276558473e-05, "loss": 0.7055, "num_input_tokens_seen": 12959856, "step": 22340 }, { "epoch": 3.328120345546619, "grad_norm": 1.7013144493103027, "learning_rate": 4.933150656223303e-05, "loss": 0.7481, "num_input_tokens_seen": 12962928, "step": 22345 }, { "epoch": 3.3288650580875783, "grad_norm": 0.8529320955276489, "learning_rate": 4.933075994782527e-05, "loss": 0.5577, "num_input_tokens_seen": 12965648, "step": 22350 }, { "epoch": 3.3296097706285375, "grad_norm": 0.8040059208869934, "learning_rate": 4.933001292237404e-05, "loss": 0.6523, "num_input_tokens_seen": 12968528, "step": 22355 }, { "epoch": 3.3303544831694967, "grad_norm": 0.7914401888847351, "learning_rate": 4.9329265485891966e-05, "loss": 0.8133, "num_input_tokens_seen": 12971184, "step": 22360 }, { "epoch": 3.331099195710456, "grad_norm": 0.9966319799423218, "learning_rate": 4.9328517638391684e-05, "loss": 0.6445, "num_input_tokens_seen": 12974128, "step": 22365 }, { "epoch": 3.331843908251415, "grad_norm": 1.0179842710494995, "learning_rate": 4.932776937988582e-05, "loss": 0.6276, "num_input_tokens_seen": 12976976, "step": 22370 }, { "epoch": 3.3325886207923743, "grad_norm": 0.9854918718338013, "learning_rate": 4.932702071038703e-05, "loss": 0.7212, "num_input_tokens_seen": 12980112, "step": 22375 }, { "epoch": 3.3333333333333335, "grad_norm": 1.6103025674819946, "learning_rate": 4.932627162990794e-05, "loss": 0.7657, "num_input_tokens_seen": 12983184, "step": 22380 }, { "epoch": 3.3340780458742927, "grad_norm": 0.6918246150016785, "learning_rate": 4.932552213846121e-05, "loss": 0.6135, "num_input_tokens_seen": 12985968, "step": 22385 }, { "epoch": 3.334822758415252, "grad_norm": 0.7242661714553833, "learning_rate": 4.932477223605951e-05, "loss": 0.5902, "num_input_tokens_seen": 12989200, "step": 22390 }, { "epoch": 3.335567470956211, "grad_norm": 0.9744375348091125, "learning_rate": 4.932402192271551e-05, "loss": 0.5909, "num_input_tokens_seen": 12991952, "step": 22395 }, { "epoch": 3.33631218349717, "grad_norm": 2.5676448345184326, "learning_rate": 4.9323271198441886e-05, "loss": 0.8267, "num_input_tokens_seen": 12994832, "step": 22400 }, { "epoch": 3.3370568960381295, "grad_norm": 1.264367938041687, "learning_rate": 4.932252006325131e-05, "loss": 0.6436, "num_input_tokens_seen": 12997616, "step": 22405 }, { "epoch": 3.3378016085790883, "grad_norm": 1.089745283126831, "learning_rate": 4.932176851715647e-05, "loss": 0.7088, "num_input_tokens_seen": 13000304, "step": 22410 }, { "epoch": 3.338546321120048, "grad_norm": 0.9070781469345093, "learning_rate": 4.932101656017008e-05, "loss": 0.7948, "num_input_tokens_seen": 13003056, "step": 22415 }, { "epoch": 3.3392910336610067, "grad_norm": 1.2930351495742798, "learning_rate": 4.9320264192304835e-05, "loss": 0.5337, "num_input_tokens_seen": 13006160, "step": 22420 }, { "epoch": 3.340035746201966, "grad_norm": 1.09007728099823, "learning_rate": 4.931951141357344e-05, "loss": 0.8235, "num_input_tokens_seen": 13008848, "step": 22425 }, { "epoch": 3.340780458742925, "grad_norm": 0.6341335773468018, "learning_rate": 4.931875822398862e-05, "loss": 0.648, "num_input_tokens_seen": 13011536, "step": 22430 }, { "epoch": 3.3415251712838843, "grad_norm": 0.8502675294876099, "learning_rate": 4.93180046235631e-05, "loss": 0.7191, "num_input_tokens_seen": 13014384, "step": 22435 }, { "epoch": 3.3422698838248435, "grad_norm": 0.6305221319198608, "learning_rate": 4.9317250612309594e-05, "loss": 0.7313, "num_input_tokens_seen": 13017456, "step": 22440 }, { "epoch": 3.3430145963658027, "grad_norm": 0.49202194809913635, "learning_rate": 4.9316496190240866e-05, "loss": 0.5575, "num_input_tokens_seen": 13020432, "step": 22445 }, { "epoch": 3.343759308906762, "grad_norm": 0.9042345881462097, "learning_rate": 4.931574135736965e-05, "loss": 0.6587, "num_input_tokens_seen": 13023312, "step": 22450 }, { "epoch": 3.344504021447721, "grad_norm": 0.9253267049789429, "learning_rate": 4.931498611370869e-05, "loss": 0.5956, "num_input_tokens_seen": 13026096, "step": 22455 }, { "epoch": 3.3452487339886803, "grad_norm": 0.7069793343544006, "learning_rate": 4.9314230459270756e-05, "loss": 0.646, "num_input_tokens_seen": 13028592, "step": 22460 }, { "epoch": 3.3459934465296395, "grad_norm": 0.674115002155304, "learning_rate": 4.9313474394068604e-05, "loss": 0.6596, "num_input_tokens_seen": 13031888, "step": 22465 }, { "epoch": 3.3467381590705987, "grad_norm": 0.6236036419868469, "learning_rate": 4.931271791811502e-05, "loss": 0.7239, "num_input_tokens_seen": 13034672, "step": 22470 }, { "epoch": 3.347482871611558, "grad_norm": 1.4736920595169067, "learning_rate": 4.931196103142278e-05, "loss": 0.72, "num_input_tokens_seen": 13037520, "step": 22475 }, { "epoch": 3.348227584152517, "grad_norm": 0.6844878792762756, "learning_rate": 4.9311203734004665e-05, "loss": 0.6361, "num_input_tokens_seen": 13040688, "step": 22480 }, { "epoch": 3.3489722966934763, "grad_norm": 0.7239832878112793, "learning_rate": 4.931044602587346e-05, "loss": 0.6583, "num_input_tokens_seen": 13043504, "step": 22485 }, { "epoch": 3.3497170092344355, "grad_norm": 0.7663785219192505, "learning_rate": 4.930968790704199e-05, "loss": 0.7425, "num_input_tokens_seen": 13046512, "step": 22490 }, { "epoch": 3.3504617217753947, "grad_norm": 0.9557367563247681, "learning_rate": 4.930892937752305e-05, "loss": 0.6071, "num_input_tokens_seen": 13049392, "step": 22495 }, { "epoch": 3.351206434316354, "grad_norm": 0.4867874085903168, "learning_rate": 4.930817043732945e-05, "loss": 0.5115, "num_input_tokens_seen": 13052496, "step": 22500 }, { "epoch": 3.351951146857313, "grad_norm": 1.0589605569839478, "learning_rate": 4.930741108647402e-05, "loss": 0.7032, "num_input_tokens_seen": 13055312, "step": 22505 }, { "epoch": 3.3526958593982723, "grad_norm": 1.5798628330230713, "learning_rate": 4.9306651324969583e-05, "loss": 0.7313, "num_input_tokens_seen": 13058288, "step": 22510 }, { "epoch": 3.3534405719392315, "grad_norm": 1.3804413080215454, "learning_rate": 4.9305891152828976e-05, "loss": 0.6691, "num_input_tokens_seen": 13061168, "step": 22515 }, { "epoch": 3.3541852844801907, "grad_norm": 0.9376453161239624, "learning_rate": 4.930513057006504e-05, "loss": 0.6133, "num_input_tokens_seen": 13064112, "step": 22520 }, { "epoch": 3.35492999702115, "grad_norm": 1.3209911584854126, "learning_rate": 4.930436957669063e-05, "loss": 0.7055, "num_input_tokens_seen": 13067088, "step": 22525 }, { "epoch": 3.355674709562109, "grad_norm": 0.6904616951942444, "learning_rate": 4.93036081727186e-05, "loss": 0.6088, "num_input_tokens_seen": 13070096, "step": 22530 }, { "epoch": 3.3564194221030683, "grad_norm": 1.0244487524032593, "learning_rate": 4.93028463581618e-05, "loss": 0.6832, "num_input_tokens_seen": 13073040, "step": 22535 }, { "epoch": 3.3571641346440275, "grad_norm": 1.1452484130859375, "learning_rate": 4.930208413303312e-05, "loss": 0.6673, "num_input_tokens_seen": 13075792, "step": 22540 }, { "epoch": 3.3579088471849867, "grad_norm": 1.1717109680175781, "learning_rate": 4.930132149734542e-05, "loss": 0.6526, "num_input_tokens_seen": 13078736, "step": 22545 }, { "epoch": 3.358653559725946, "grad_norm": 1.119733214378357, "learning_rate": 4.93005584511116e-05, "loss": 0.7682, "num_input_tokens_seen": 13081488, "step": 22550 }, { "epoch": 3.359398272266905, "grad_norm": 0.8624585866928101, "learning_rate": 4.929979499434454e-05, "loss": 0.73, "num_input_tokens_seen": 13084208, "step": 22555 }, { "epoch": 3.3601429848078643, "grad_norm": 1.122865080833435, "learning_rate": 4.929903112705714e-05, "loss": 0.5841, "num_input_tokens_seen": 13087152, "step": 22560 }, { "epoch": 3.3608876973488235, "grad_norm": 1.4828976392745972, "learning_rate": 4.9298266849262306e-05, "loss": 0.6188, "num_input_tokens_seen": 13089872, "step": 22565 }, { "epoch": 3.3616324098897827, "grad_norm": 1.331849217414856, "learning_rate": 4.929750216097295e-05, "loss": 0.5892, "num_input_tokens_seen": 13092944, "step": 22570 }, { "epoch": 3.3623771224307415, "grad_norm": 0.7565273642539978, "learning_rate": 4.929673706220199e-05, "loss": 0.6949, "num_input_tokens_seen": 13095760, "step": 22575 }, { "epoch": 3.363121834971701, "grad_norm": 0.9139488339424133, "learning_rate": 4.929597155296235e-05, "loss": 0.6275, "num_input_tokens_seen": 13098864, "step": 22580 }, { "epoch": 3.36386654751266, "grad_norm": 0.9249659180641174, "learning_rate": 4.929520563326697e-05, "loss": 0.7572, "num_input_tokens_seen": 13102096, "step": 22585 }, { "epoch": 3.3646112600536195, "grad_norm": 0.9406024813652039, "learning_rate": 4.929443930312878e-05, "loss": 0.627, "num_input_tokens_seen": 13104752, "step": 22590 }, { "epoch": 3.3653559725945783, "grad_norm": 0.9489037990570068, "learning_rate": 4.929367256256072e-05, "loss": 0.5907, "num_input_tokens_seen": 13108080, "step": 22595 }, { "epoch": 3.3661006851355375, "grad_norm": 1.3011515140533447, "learning_rate": 4.929290541157576e-05, "loss": 0.6331, "num_input_tokens_seen": 13111152, "step": 22600 }, { "epoch": 3.3668453976764967, "grad_norm": 0.8427655100822449, "learning_rate": 4.929213785018686e-05, "loss": 0.792, "num_input_tokens_seen": 13114160, "step": 22605 }, { "epoch": 3.367590110217456, "grad_norm": 0.8896921873092651, "learning_rate": 4.9291369878406975e-05, "loss": 0.6973, "num_input_tokens_seen": 13117520, "step": 22610 }, { "epoch": 3.368334822758415, "grad_norm": 1.3169187307357788, "learning_rate": 4.929060149624909e-05, "loss": 0.681, "num_input_tokens_seen": 13120272, "step": 22615 }, { "epoch": 3.3690795352993743, "grad_norm": 2.048678398132324, "learning_rate": 4.928983270372617e-05, "loss": 0.7627, "num_input_tokens_seen": 13123056, "step": 22620 }, { "epoch": 3.3698242478403335, "grad_norm": 0.9390818476676941, "learning_rate": 4.928906350085122e-05, "loss": 0.6762, "num_input_tokens_seen": 13126064, "step": 22625 }, { "epoch": 3.3705689603812927, "grad_norm": 0.9981181025505066, "learning_rate": 4.928829388763723e-05, "loss": 0.6792, "num_input_tokens_seen": 13128720, "step": 22630 }, { "epoch": 3.371313672922252, "grad_norm": 0.9451253414154053, "learning_rate": 4.928752386409719e-05, "loss": 0.6855, "num_input_tokens_seen": 13131600, "step": 22635 }, { "epoch": 3.372058385463211, "grad_norm": 1.436547040939331, "learning_rate": 4.9286753430244126e-05, "loss": 0.6183, "num_input_tokens_seen": 13134576, "step": 22640 }, { "epoch": 3.3728030980041703, "grad_norm": 0.9186355471611023, "learning_rate": 4.928598258609105e-05, "loss": 0.6578, "num_input_tokens_seen": 13137488, "step": 22645 }, { "epoch": 3.3735478105451295, "grad_norm": 1.034777045249939, "learning_rate": 4.928521133165098e-05, "loss": 0.7202, "num_input_tokens_seen": 13140240, "step": 22650 }, { "epoch": 3.3742925230860887, "grad_norm": 0.7903726696968079, "learning_rate": 4.928443966693694e-05, "loss": 0.6547, "num_input_tokens_seen": 13143344, "step": 22655 }, { "epoch": 3.375037235627048, "grad_norm": 0.8085973858833313, "learning_rate": 4.928366759196198e-05, "loss": 0.7331, "num_input_tokens_seen": 13146640, "step": 22660 }, { "epoch": 3.375781948168007, "grad_norm": 1.4680836200714111, "learning_rate": 4.9282895106739136e-05, "loss": 0.6824, "num_input_tokens_seen": 13149616, "step": 22665 }, { "epoch": 3.3765266607089663, "grad_norm": 0.946913480758667, "learning_rate": 4.928212221128146e-05, "loss": 0.8151, "num_input_tokens_seen": 13152496, "step": 22670 }, { "epoch": 3.3772713732499255, "grad_norm": 1.3838986158370972, "learning_rate": 4.928134890560201e-05, "loss": 0.7826, "num_input_tokens_seen": 13155408, "step": 22675 }, { "epoch": 3.3780160857908847, "grad_norm": 0.7432546615600586, "learning_rate": 4.928057518971384e-05, "loss": 0.6797, "num_input_tokens_seen": 13158128, "step": 22680 }, { "epoch": 3.378760798331844, "grad_norm": 0.8296151757240295, "learning_rate": 4.9279801063630035e-05, "loss": 0.703, "num_input_tokens_seen": 13161296, "step": 22685 }, { "epoch": 3.379505510872803, "grad_norm": 1.6030012369155884, "learning_rate": 4.9279026527363666e-05, "loss": 0.6824, "num_input_tokens_seen": 13164176, "step": 22690 }, { "epoch": 3.3802502234137624, "grad_norm": 1.4849964380264282, "learning_rate": 4.927825158092783e-05, "loss": 0.7708, "num_input_tokens_seen": 13166768, "step": 22695 }, { "epoch": 3.3809949359547216, "grad_norm": 0.943882405757904, "learning_rate": 4.9277476224335603e-05, "loss": 0.6387, "num_input_tokens_seen": 13169648, "step": 22700 }, { "epoch": 3.3817396484956808, "grad_norm": 1.3049439191818237, "learning_rate": 4.927670045760009e-05, "loss": 0.8323, "num_input_tokens_seen": 13172240, "step": 22705 }, { "epoch": 3.38248436103664, "grad_norm": 0.9189729690551758, "learning_rate": 4.927592428073439e-05, "loss": 0.6211, "num_input_tokens_seen": 13175824, "step": 22710 }, { "epoch": 3.383229073577599, "grad_norm": 0.8541845083236694, "learning_rate": 4.927514769375163e-05, "loss": 0.7594, "num_input_tokens_seen": 13178608, "step": 22715 }, { "epoch": 3.3839737861185584, "grad_norm": 0.7512338161468506, "learning_rate": 4.9274370696664916e-05, "loss": 0.934, "num_input_tokens_seen": 13181424, "step": 22720 }, { "epoch": 3.3847184986595176, "grad_norm": 1.436119556427002, "learning_rate": 4.9273593289487384e-05, "loss": 0.698, "num_input_tokens_seen": 13184048, "step": 22725 }, { "epoch": 3.3854632112004768, "grad_norm": 0.8406791090965271, "learning_rate": 4.9272815472232165e-05, "loss": 0.6947, "num_input_tokens_seen": 13187120, "step": 22730 }, { "epoch": 3.386207923741436, "grad_norm": 0.9495931267738342, "learning_rate": 4.9272037244912394e-05, "loss": 0.6501, "num_input_tokens_seen": 13190256, "step": 22735 }, { "epoch": 3.386952636282395, "grad_norm": 1.1056545972824097, "learning_rate": 4.927125860754123e-05, "loss": 0.6939, "num_input_tokens_seen": 13193200, "step": 22740 }, { "epoch": 3.3876973488233544, "grad_norm": 0.6866962313652039, "learning_rate": 4.9270479560131813e-05, "loss": 0.6375, "num_input_tokens_seen": 13196016, "step": 22745 }, { "epoch": 3.388442061364313, "grad_norm": 1.0137395858764648, "learning_rate": 4.926970010269731e-05, "loss": 0.6469, "num_input_tokens_seen": 13198960, "step": 22750 }, { "epoch": 3.389186773905273, "grad_norm": 1.7927347421646118, "learning_rate": 4.92689202352509e-05, "loss": 0.8425, "num_input_tokens_seen": 13201872, "step": 22755 }, { "epoch": 3.3899314864462315, "grad_norm": 0.7724321484565735, "learning_rate": 4.926813995780574e-05, "loss": 0.6799, "num_input_tokens_seen": 13205072, "step": 22760 }, { "epoch": 3.390676198987191, "grad_norm": 0.9651437997817993, "learning_rate": 4.926735927037503e-05, "loss": 0.8154, "num_input_tokens_seen": 13208048, "step": 22765 }, { "epoch": 3.39142091152815, "grad_norm": 0.6845628023147583, "learning_rate": 4.9266578172971934e-05, "loss": 0.5879, "num_input_tokens_seen": 13211216, "step": 22770 }, { "epoch": 3.392165624069109, "grad_norm": 1.0862926244735718, "learning_rate": 4.926579666560968e-05, "loss": 0.5526, "num_input_tokens_seen": 13213936, "step": 22775 }, { "epoch": 3.3929103366100684, "grad_norm": 1.1697440147399902, "learning_rate": 4.926501474830144e-05, "loss": 0.6611, "num_input_tokens_seen": 13216752, "step": 22780 }, { "epoch": 3.3936550491510276, "grad_norm": 2.0288097858428955, "learning_rate": 4.926423242106044e-05, "loss": 0.708, "num_input_tokens_seen": 13219472, "step": 22785 }, { "epoch": 3.3943997616919868, "grad_norm": 0.7982820868492126, "learning_rate": 4.92634496838999e-05, "loss": 0.6796, "num_input_tokens_seen": 13222416, "step": 22790 }, { "epoch": 3.395144474232946, "grad_norm": 0.8398354649543762, "learning_rate": 4.9262666536833035e-05, "loss": 0.6376, "num_input_tokens_seen": 13225072, "step": 22795 }, { "epoch": 3.395889186773905, "grad_norm": 1.060076117515564, "learning_rate": 4.926188297987308e-05, "loss": 0.651, "num_input_tokens_seen": 13227984, "step": 22800 }, { "epoch": 3.3966338993148644, "grad_norm": 0.6321725845336914, "learning_rate": 4.926109901303327e-05, "loss": 0.6685, "num_input_tokens_seen": 13230992, "step": 22805 }, { "epoch": 3.3973786118558236, "grad_norm": 0.6263721585273743, "learning_rate": 4.9260314636326846e-05, "loss": 0.5959, "num_input_tokens_seen": 13233968, "step": 22810 }, { "epoch": 3.3981233243967828, "grad_norm": 1.052808165550232, "learning_rate": 4.925952984976707e-05, "loss": 0.6854, "num_input_tokens_seen": 13237200, "step": 22815 }, { "epoch": 3.398868036937742, "grad_norm": 0.6987687349319458, "learning_rate": 4.925874465336719e-05, "loss": 0.6054, "num_input_tokens_seen": 13239984, "step": 22820 }, { "epoch": 3.399612749478701, "grad_norm": 1.111486792564392, "learning_rate": 4.9257959047140476e-05, "loss": 0.6861, "num_input_tokens_seen": 13243024, "step": 22825 }, { "epoch": 3.4003574620196604, "grad_norm": 1.4832335710525513, "learning_rate": 4.9257173031100196e-05, "loss": 0.5425, "num_input_tokens_seen": 13246000, "step": 22830 }, { "epoch": 3.4011021745606196, "grad_norm": 0.8778698444366455, "learning_rate": 4.925638660525963e-05, "loss": 0.5603, "num_input_tokens_seen": 13248784, "step": 22835 }, { "epoch": 3.401846887101579, "grad_norm": 1.0095781087875366, "learning_rate": 4.925559976963207e-05, "loss": 0.6586, "num_input_tokens_seen": 13251600, "step": 22840 }, { "epoch": 3.402591599642538, "grad_norm": 2.2333178520202637, "learning_rate": 4.9254812524230806e-05, "loss": 0.7588, "num_input_tokens_seen": 13254512, "step": 22845 }, { "epoch": 3.403336312183497, "grad_norm": 0.9481454491615295, "learning_rate": 4.925402486906913e-05, "loss": 0.6193, "num_input_tokens_seen": 13257648, "step": 22850 }, { "epoch": 3.4040810247244564, "grad_norm": 0.7106925845146179, "learning_rate": 4.925323680416036e-05, "loss": 0.6236, "num_input_tokens_seen": 13260624, "step": 22855 }, { "epoch": 3.4048257372654156, "grad_norm": 0.896912157535553, "learning_rate": 4.92524483295178e-05, "loss": 0.614, "num_input_tokens_seen": 13263760, "step": 22860 }, { "epoch": 3.405570449806375, "grad_norm": 1.3797876834869385, "learning_rate": 4.925165944515477e-05, "loss": 0.658, "num_input_tokens_seen": 13266384, "step": 22865 }, { "epoch": 3.406315162347334, "grad_norm": 0.8444094657897949, "learning_rate": 4.9250870151084614e-05, "loss": 0.6333, "num_input_tokens_seen": 13269040, "step": 22870 }, { "epoch": 3.407059874888293, "grad_norm": 0.8152040243148804, "learning_rate": 4.9250080447320644e-05, "loss": 0.6146, "num_input_tokens_seen": 13272208, "step": 22875 }, { "epoch": 3.4078045874292524, "grad_norm": 1.254452109336853, "learning_rate": 4.924929033387622e-05, "loss": 0.6821, "num_input_tokens_seen": 13275088, "step": 22880 }, { "epoch": 3.4085492999702116, "grad_norm": 1.3848954439163208, "learning_rate": 4.9248499810764675e-05, "loss": 0.723, "num_input_tokens_seen": 13278448, "step": 22885 }, { "epoch": 3.409294012511171, "grad_norm": 2.273284912109375, "learning_rate": 4.9247708877999375e-05, "loss": 0.6781, "num_input_tokens_seen": 13281136, "step": 22890 }, { "epoch": 3.41003872505213, "grad_norm": 1.218007206916809, "learning_rate": 4.9246917535593675e-05, "loss": 0.6629, "num_input_tokens_seen": 13284176, "step": 22895 }, { "epoch": 3.410783437593089, "grad_norm": 1.0040862560272217, "learning_rate": 4.924612578356095e-05, "loss": 0.6554, "num_input_tokens_seen": 13287024, "step": 22900 }, { "epoch": 3.4115281501340484, "grad_norm": 1.0740562677383423, "learning_rate": 4.9245333621914566e-05, "loss": 0.5458, "num_input_tokens_seen": 13290096, "step": 22905 }, { "epoch": 3.4122728626750076, "grad_norm": 1.3064794540405273, "learning_rate": 4.9244541050667916e-05, "loss": 0.5509, "num_input_tokens_seen": 13292880, "step": 22910 }, { "epoch": 3.413017575215967, "grad_norm": 0.8690316677093506, "learning_rate": 4.9243748069834386e-05, "loss": 0.6165, "num_input_tokens_seen": 13295696, "step": 22915 }, { "epoch": 3.413762287756926, "grad_norm": 1.4264835119247437, "learning_rate": 4.924295467942737e-05, "loss": 0.677, "num_input_tokens_seen": 13298448, "step": 22920 }, { "epoch": 3.414507000297885, "grad_norm": 1.4115862846374512, "learning_rate": 4.924216087946028e-05, "loss": 0.6649, "num_input_tokens_seen": 13301040, "step": 22925 }, { "epoch": 3.4152517128388444, "grad_norm": 0.8369256854057312, "learning_rate": 4.924136666994652e-05, "loss": 0.6674, "num_input_tokens_seen": 13304112, "step": 22930 }, { "epoch": 3.415996425379803, "grad_norm": 1.1620659828186035, "learning_rate": 4.9240572050899505e-05, "loss": 0.6237, "num_input_tokens_seen": 13306960, "step": 22935 }, { "epoch": 3.4167411379207624, "grad_norm": 0.6990060806274414, "learning_rate": 4.923977702233266e-05, "loss": 0.7567, "num_input_tokens_seen": 13309520, "step": 22940 }, { "epoch": 3.4174858504617216, "grad_norm": 0.8992708325386047, "learning_rate": 4.923898158425942e-05, "loss": 0.6744, "num_input_tokens_seen": 13312368, "step": 22945 }, { "epoch": 3.418230563002681, "grad_norm": 0.8744785785675049, "learning_rate": 4.923818573669322e-05, "loss": 0.618, "num_input_tokens_seen": 13315280, "step": 22950 }, { "epoch": 3.41897527554364, "grad_norm": 0.9733996391296387, "learning_rate": 4.923738947964751e-05, "loss": 0.6348, "num_input_tokens_seen": 13318000, "step": 22955 }, { "epoch": 3.419719988084599, "grad_norm": 0.8570157289505005, "learning_rate": 4.923659281313574e-05, "loss": 0.8013, "num_input_tokens_seen": 13320688, "step": 22960 }, { "epoch": 3.4204647006255584, "grad_norm": 1.6145237684249878, "learning_rate": 4.9235795737171365e-05, "loss": 0.71, "num_input_tokens_seen": 13323600, "step": 22965 }, { "epoch": 3.4212094131665176, "grad_norm": 0.8358142971992493, "learning_rate": 4.923499825176786e-05, "loss": 0.6427, "num_input_tokens_seen": 13326416, "step": 22970 }, { "epoch": 3.421954125707477, "grad_norm": 1.581695556640625, "learning_rate": 4.923420035693868e-05, "loss": 0.7025, "num_input_tokens_seen": 13329712, "step": 22975 }, { "epoch": 3.422698838248436, "grad_norm": 1.0461816787719727, "learning_rate": 4.923340205269732e-05, "loss": 0.5735, "num_input_tokens_seen": 13332528, "step": 22980 }, { "epoch": 3.423443550789395, "grad_norm": 0.92613285779953, "learning_rate": 4.923260333905726e-05, "loss": 0.7729, "num_input_tokens_seen": 13335760, "step": 22985 }, { "epoch": 3.4241882633303544, "grad_norm": 1.2298967838287354, "learning_rate": 4.9231804216031995e-05, "loss": 0.7797, "num_input_tokens_seen": 13338992, "step": 22990 }, { "epoch": 3.4249329758713136, "grad_norm": 0.9010134339332581, "learning_rate": 4.923100468363503e-05, "loss": 0.7502, "num_input_tokens_seen": 13341712, "step": 22995 }, { "epoch": 3.425677688412273, "grad_norm": 0.936186671257019, "learning_rate": 4.923020474187987e-05, "loss": 0.671, "num_input_tokens_seen": 13344528, "step": 23000 }, { "epoch": 3.426422400953232, "grad_norm": 2.2493526935577393, "learning_rate": 4.922940439078002e-05, "loss": 0.6105, "num_input_tokens_seen": 13347280, "step": 23005 }, { "epoch": 3.4271671134941912, "grad_norm": 1.0121581554412842, "learning_rate": 4.922860363034901e-05, "loss": 0.7096, "num_input_tokens_seen": 13350352, "step": 23010 }, { "epoch": 3.4279118260351504, "grad_norm": 0.7580313086509705, "learning_rate": 4.922780246060037e-05, "loss": 0.7117, "num_input_tokens_seen": 13353456, "step": 23015 }, { "epoch": 3.4286565385761096, "grad_norm": 3.3671157360076904, "learning_rate": 4.922700088154764e-05, "loss": 0.711, "num_input_tokens_seen": 13356368, "step": 23020 }, { "epoch": 3.429401251117069, "grad_norm": 0.8645957708358765, "learning_rate": 4.9226198893204335e-05, "loss": 0.636, "num_input_tokens_seen": 13359408, "step": 23025 }, { "epoch": 3.430145963658028, "grad_norm": 0.6632853746414185, "learning_rate": 4.922539649558403e-05, "loss": 0.6398, "num_input_tokens_seen": 13362064, "step": 23030 }, { "epoch": 3.4308906761989872, "grad_norm": 0.6428794860839844, "learning_rate": 4.9224593688700274e-05, "loss": 0.5427, "num_input_tokens_seen": 13365136, "step": 23035 }, { "epoch": 3.4316353887399464, "grad_norm": 1.0651277303695679, "learning_rate": 4.922379047256663e-05, "loss": 0.7532, "num_input_tokens_seen": 13368112, "step": 23040 }, { "epoch": 3.4323801012809056, "grad_norm": 0.9576681852340698, "learning_rate": 4.922298684719666e-05, "loss": 0.6793, "num_input_tokens_seen": 13371152, "step": 23045 }, { "epoch": 3.433124813821865, "grad_norm": 0.6562487483024597, "learning_rate": 4.922218281260395e-05, "loss": 0.6553, "num_input_tokens_seen": 13373840, "step": 23050 }, { "epoch": 3.433869526362824, "grad_norm": 1.6298795938491821, "learning_rate": 4.9221378368802085e-05, "loss": 0.706, "num_input_tokens_seen": 13376944, "step": 23055 }, { "epoch": 3.4346142389037833, "grad_norm": 0.7446618676185608, "learning_rate": 4.9220573515804644e-05, "loss": 0.7812, "num_input_tokens_seen": 13379600, "step": 23060 }, { "epoch": 3.4353589514447425, "grad_norm": 0.7881981730461121, "learning_rate": 4.921976825362523e-05, "loss": 0.6563, "num_input_tokens_seen": 13382256, "step": 23065 }, { "epoch": 3.4361036639857017, "grad_norm": 1.2366758584976196, "learning_rate": 4.921896258227745e-05, "loss": 0.72, "num_input_tokens_seen": 13385168, "step": 23070 }, { "epoch": 3.436848376526661, "grad_norm": 0.6093248724937439, "learning_rate": 4.921815650177491e-05, "loss": 0.7722, "num_input_tokens_seen": 13387984, "step": 23075 }, { "epoch": 3.43759308906762, "grad_norm": 0.9677644371986389, "learning_rate": 4.9217350012131223e-05, "loss": 0.6487, "num_input_tokens_seen": 13390832, "step": 23080 }, { "epoch": 3.4383378016085793, "grad_norm": 0.8900024890899658, "learning_rate": 4.9216543113360035e-05, "loss": 0.6922, "num_input_tokens_seen": 13393872, "step": 23085 }, { "epoch": 3.4390825141495385, "grad_norm": 1.2627824544906616, "learning_rate": 4.9215735805474956e-05, "loss": 0.6933, "num_input_tokens_seen": 13397008, "step": 23090 }, { "epoch": 3.4398272266904977, "grad_norm": 0.8775407671928406, "learning_rate": 4.921492808848963e-05, "loss": 0.6772, "num_input_tokens_seen": 13399824, "step": 23095 }, { "epoch": 3.4405719392314564, "grad_norm": 0.6894403100013733, "learning_rate": 4.921411996241771e-05, "loss": 0.7131, "num_input_tokens_seen": 13402896, "step": 23100 }, { "epoch": 3.441316651772416, "grad_norm": 1.045196533203125, "learning_rate": 4.921331142727284e-05, "loss": 0.7328, "num_input_tokens_seen": 13406096, "step": 23105 }, { "epoch": 3.442061364313375, "grad_norm": 1.067272663116455, "learning_rate": 4.921250248306869e-05, "loss": 0.7238, "num_input_tokens_seen": 13409200, "step": 23110 }, { "epoch": 3.442806076854334, "grad_norm": 0.7378498315811157, "learning_rate": 4.9211693129818915e-05, "loss": 0.6632, "num_input_tokens_seen": 13412304, "step": 23115 }, { "epoch": 3.4435507893952932, "grad_norm": 1.3268409967422485, "learning_rate": 4.9210883367537184e-05, "loss": 0.6914, "num_input_tokens_seen": 13414960, "step": 23120 }, { "epoch": 3.4442955019362524, "grad_norm": 0.6851729154586792, "learning_rate": 4.9210073196237196e-05, "loss": 0.6754, "num_input_tokens_seen": 13417680, "step": 23125 }, { "epoch": 3.4450402144772116, "grad_norm": 0.6823292374610901, "learning_rate": 4.9209262615932624e-05, "loss": 0.584, "num_input_tokens_seen": 13420240, "step": 23130 }, { "epoch": 3.445784927018171, "grad_norm": 0.6855947375297546, "learning_rate": 4.9208451626637164e-05, "loss": 0.58, "num_input_tokens_seen": 13423216, "step": 23135 }, { "epoch": 3.44652963955913, "grad_norm": 0.5553902387619019, "learning_rate": 4.920764022836452e-05, "loss": 0.7919, "num_input_tokens_seen": 13426192, "step": 23140 }, { "epoch": 3.4472743521000893, "grad_norm": 1.0060981512069702, "learning_rate": 4.920682842112839e-05, "loss": 0.6198, "num_input_tokens_seen": 13429232, "step": 23145 }, { "epoch": 3.4480190646410485, "grad_norm": 0.6581115126609802, "learning_rate": 4.920601620494251e-05, "loss": 0.6221, "num_input_tokens_seen": 13432048, "step": 23150 }, { "epoch": 3.4487637771820077, "grad_norm": 0.7623167634010315, "learning_rate": 4.920520357982058e-05, "loss": 0.5945, "num_input_tokens_seen": 13434960, "step": 23155 }, { "epoch": 3.449508489722967, "grad_norm": 0.8283389806747437, "learning_rate": 4.9204390545776334e-05, "loss": 0.8105, "num_input_tokens_seen": 13437712, "step": 23160 }, { "epoch": 3.450253202263926, "grad_norm": 1.1756608486175537, "learning_rate": 4.920357710282352e-05, "loss": 0.6557, "num_input_tokens_seen": 13440496, "step": 23165 }, { "epoch": 3.4509979148048853, "grad_norm": 1.8917138576507568, "learning_rate": 4.9202763250975864e-05, "loss": 0.6827, "num_input_tokens_seen": 13443632, "step": 23170 }, { "epoch": 3.4517426273458445, "grad_norm": 1.29500412940979, "learning_rate": 4.920194899024712e-05, "loss": 0.6116, "num_input_tokens_seen": 13446608, "step": 23175 }, { "epoch": 3.4524873398868037, "grad_norm": 1.7640279531478882, "learning_rate": 4.920113432065105e-05, "loss": 0.7243, "num_input_tokens_seen": 13449520, "step": 23180 }, { "epoch": 3.453232052427763, "grad_norm": 0.614048957824707, "learning_rate": 4.920031924220141e-05, "loss": 0.5473, "num_input_tokens_seen": 13452240, "step": 23185 }, { "epoch": 3.453976764968722, "grad_norm": 0.8601254820823669, "learning_rate": 4.919950375491197e-05, "loss": 0.5367, "num_input_tokens_seen": 13455248, "step": 23190 }, { "epoch": 3.4547214775096813, "grad_norm": 0.8913120627403259, "learning_rate": 4.919868785879651e-05, "loss": 0.5957, "num_input_tokens_seen": 13458320, "step": 23195 }, { "epoch": 3.4554661900506405, "grad_norm": 1.557077169418335, "learning_rate": 4.919787155386882e-05, "loss": 0.7056, "num_input_tokens_seen": 13460912, "step": 23200 }, { "epoch": 3.4562109025915997, "grad_norm": 0.6750434041023254, "learning_rate": 4.919705484014268e-05, "loss": 0.6282, "num_input_tokens_seen": 13463568, "step": 23205 }, { "epoch": 3.456955615132559, "grad_norm": 1.2537615299224854, "learning_rate": 4.919623771763189e-05, "loss": 0.8078, "num_input_tokens_seen": 13466448, "step": 23210 }, { "epoch": 3.457700327673518, "grad_norm": 0.8428911566734314, "learning_rate": 4.919542018635025e-05, "loss": 0.6156, "num_input_tokens_seen": 13469392, "step": 23215 }, { "epoch": 3.4584450402144773, "grad_norm": 0.6543729305267334, "learning_rate": 4.919460224631158e-05, "loss": 0.6412, "num_input_tokens_seen": 13472464, "step": 23220 }, { "epoch": 3.4591897527554365, "grad_norm": 1.9343074560165405, "learning_rate": 4.91937838975297e-05, "loss": 0.6816, "num_input_tokens_seen": 13475408, "step": 23225 }, { "epoch": 3.4599344652963957, "grad_norm": 0.6737511157989502, "learning_rate": 4.9192965140018435e-05, "loss": 0.5922, "num_input_tokens_seen": 13478288, "step": 23230 }, { "epoch": 3.460679177837355, "grad_norm": 0.8001711964607239, "learning_rate": 4.919214597379161e-05, "loss": 0.6081, "num_input_tokens_seen": 13481232, "step": 23235 }, { "epoch": 3.461423890378314, "grad_norm": 1.00631582736969, "learning_rate": 4.919132639886306e-05, "loss": 0.6375, "num_input_tokens_seen": 13484048, "step": 23240 }, { "epoch": 3.4621686029192733, "grad_norm": 1.1822209358215332, "learning_rate": 4.919050641524663e-05, "loss": 0.8365, "num_input_tokens_seen": 13486832, "step": 23245 }, { "epoch": 3.4629133154602325, "grad_norm": 1.0900142192840576, "learning_rate": 4.9189686022956195e-05, "loss": 0.5274, "num_input_tokens_seen": 13489456, "step": 23250 }, { "epoch": 3.4636580280011917, "grad_norm": 1.2544772624969482, "learning_rate": 4.91888652220056e-05, "loss": 0.7287, "num_input_tokens_seen": 13492272, "step": 23255 }, { "epoch": 3.464402740542151, "grad_norm": 1.2010281085968018, "learning_rate": 4.91880440124087e-05, "loss": 0.7917, "num_input_tokens_seen": 13495088, "step": 23260 }, { "epoch": 3.4651474530831097, "grad_norm": 0.5674706101417542, "learning_rate": 4.918722239417939e-05, "loss": 0.605, "num_input_tokens_seen": 13497904, "step": 23265 }, { "epoch": 3.4658921656240693, "grad_norm": 0.9017976522445679, "learning_rate": 4.918640036733154e-05, "loss": 0.6575, "num_input_tokens_seen": 13500880, "step": 23270 }, { "epoch": 3.466636878165028, "grad_norm": 0.933401346206665, "learning_rate": 4.9185577931879034e-05, "loss": 0.6914, "num_input_tokens_seen": 13503952, "step": 23275 }, { "epoch": 3.4673815907059877, "grad_norm": 0.9265634417533875, "learning_rate": 4.9184755087835766e-05, "loss": 0.6032, "num_input_tokens_seen": 13507056, "step": 23280 }, { "epoch": 3.4681263032469465, "grad_norm": 1.6073955297470093, "learning_rate": 4.9183931835215645e-05, "loss": 0.6457, "num_input_tokens_seen": 13509776, "step": 23285 }, { "epoch": 3.4688710157879057, "grad_norm": 0.9962862133979797, "learning_rate": 4.918310817403258e-05, "loss": 0.824, "num_input_tokens_seen": 13512848, "step": 23290 }, { "epoch": 3.469615728328865, "grad_norm": 1.000915765762329, "learning_rate": 4.918228410430048e-05, "loss": 0.7656, "num_input_tokens_seen": 13515888, "step": 23295 }, { "epoch": 3.470360440869824, "grad_norm": 0.8177543878555298, "learning_rate": 4.918145962603326e-05, "loss": 0.6621, "num_input_tokens_seen": 13518352, "step": 23300 }, { "epoch": 3.4711051534107833, "grad_norm": 0.8387886881828308, "learning_rate": 4.918063473924486e-05, "loss": 0.5661, "num_input_tokens_seen": 13521200, "step": 23305 }, { "epoch": 3.4718498659517425, "grad_norm": 1.5128434896469116, "learning_rate": 4.917980944394922e-05, "loss": 0.621, "num_input_tokens_seen": 13523984, "step": 23310 }, { "epoch": 3.4725945784927017, "grad_norm": 0.7182210087776184, "learning_rate": 4.9178983740160264e-05, "loss": 0.7813, "num_input_tokens_seen": 13526608, "step": 23315 }, { "epoch": 3.473339291033661, "grad_norm": 0.9990628957748413, "learning_rate": 4.9178157627891956e-05, "loss": 0.778, "num_input_tokens_seen": 13529392, "step": 23320 }, { "epoch": 3.47408400357462, "grad_norm": 0.9219609498977661, "learning_rate": 4.917733110715825e-05, "loss": 0.632, "num_input_tokens_seen": 13532208, "step": 23325 }, { "epoch": 3.4748287161155793, "grad_norm": 0.8505948185920715, "learning_rate": 4.9176504177973105e-05, "loss": 0.7266, "num_input_tokens_seen": 13535024, "step": 23330 }, { "epoch": 3.4755734286565385, "grad_norm": 0.6697025299072266, "learning_rate": 4.91756768403505e-05, "loss": 0.5213, "num_input_tokens_seen": 13538160, "step": 23335 }, { "epoch": 3.4763181411974977, "grad_norm": 0.7485787868499756, "learning_rate": 4.9174849094304396e-05, "loss": 0.7269, "num_input_tokens_seen": 13541360, "step": 23340 }, { "epoch": 3.477062853738457, "grad_norm": 1.2806953191757202, "learning_rate": 4.91740209398488e-05, "loss": 0.6306, "num_input_tokens_seen": 13544208, "step": 23345 }, { "epoch": 3.477807566279416, "grad_norm": 0.700909435749054, "learning_rate": 4.917319237699768e-05, "loss": 0.711, "num_input_tokens_seen": 13547216, "step": 23350 }, { "epoch": 3.4785522788203753, "grad_norm": 0.74336838722229, "learning_rate": 4.9172363405765044e-05, "loss": 0.6667, "num_input_tokens_seen": 13549840, "step": 23355 }, { "epoch": 3.4792969913613345, "grad_norm": 0.8219333291053772, "learning_rate": 4.91715340261649e-05, "loss": 0.6281, "num_input_tokens_seen": 13552592, "step": 23360 }, { "epoch": 3.4800417039022937, "grad_norm": 0.7230933904647827, "learning_rate": 4.917070423821125e-05, "loss": 0.6421, "num_input_tokens_seen": 13555408, "step": 23365 }, { "epoch": 3.480786416443253, "grad_norm": 0.8874855637550354, "learning_rate": 4.9169874041918116e-05, "loss": 0.6297, "num_input_tokens_seen": 13558384, "step": 23370 }, { "epoch": 3.481531128984212, "grad_norm": 0.9313823580741882, "learning_rate": 4.916904343729954e-05, "loss": 0.7241, "num_input_tokens_seen": 13561488, "step": 23375 }, { "epoch": 3.4822758415251713, "grad_norm": 1.2384238243103027, "learning_rate": 4.916821242436952e-05, "loss": 0.6265, "num_input_tokens_seen": 13564432, "step": 23380 }, { "epoch": 3.4830205540661305, "grad_norm": 1.283765196800232, "learning_rate": 4.916738100314213e-05, "loss": 0.7359, "num_input_tokens_seen": 13567472, "step": 23385 }, { "epoch": 3.4837652666070897, "grad_norm": 0.7475413084030151, "learning_rate": 4.916654917363139e-05, "loss": 0.5615, "num_input_tokens_seen": 13570448, "step": 23390 }, { "epoch": 3.484509979148049, "grad_norm": 0.9998888969421387, "learning_rate": 4.916571693585137e-05, "loss": 0.8425, "num_input_tokens_seen": 13573584, "step": 23395 }, { "epoch": 3.485254691689008, "grad_norm": 1.5323115587234497, "learning_rate": 4.9164884289816115e-05, "loss": 0.5864, "num_input_tokens_seen": 13576624, "step": 23400 }, { "epoch": 3.4859994042299673, "grad_norm": 0.8344225287437439, "learning_rate": 4.916405123553971e-05, "loss": 0.6299, "num_input_tokens_seen": 13579664, "step": 23405 }, { "epoch": 3.4867441167709265, "grad_norm": 0.8824899792671204, "learning_rate": 4.9163217773036214e-05, "loss": 0.6053, "num_input_tokens_seen": 13582736, "step": 23410 }, { "epoch": 3.4874888293118858, "grad_norm": 0.6210979223251343, "learning_rate": 4.916238390231971e-05, "loss": 0.8166, "num_input_tokens_seen": 13585328, "step": 23415 }, { "epoch": 3.488233541852845, "grad_norm": 0.8549705147743225, "learning_rate": 4.916154962340429e-05, "loss": 0.5978, "num_input_tokens_seen": 13588112, "step": 23420 }, { "epoch": 3.488978254393804, "grad_norm": 0.921721875667572, "learning_rate": 4.916071493630405e-05, "loss": 0.6956, "num_input_tokens_seen": 13590960, "step": 23425 }, { "epoch": 3.4897229669347634, "grad_norm": 0.8661309480667114, "learning_rate": 4.915987984103309e-05, "loss": 0.626, "num_input_tokens_seen": 13593776, "step": 23430 }, { "epoch": 3.4904676794757226, "grad_norm": 0.8048628568649292, "learning_rate": 4.9159044337605495e-05, "loss": 0.6974, "num_input_tokens_seen": 13596688, "step": 23435 }, { "epoch": 3.4912123920166813, "grad_norm": 0.8095446825027466, "learning_rate": 4.915820842603542e-05, "loss": 0.6312, "num_input_tokens_seen": 13599792, "step": 23440 }, { "epoch": 3.491957104557641, "grad_norm": 0.9097794890403748, "learning_rate": 4.9157372106336965e-05, "loss": 0.7915, "num_input_tokens_seen": 13602864, "step": 23445 }, { "epoch": 3.4927018170985997, "grad_norm": 1.4968324899673462, "learning_rate": 4.915653537852425e-05, "loss": 0.6527, "num_input_tokens_seen": 13605584, "step": 23450 }, { "epoch": 3.4934465296395594, "grad_norm": 0.8658677339553833, "learning_rate": 4.915569824261143e-05, "loss": 0.6579, "num_input_tokens_seen": 13608464, "step": 23455 }, { "epoch": 3.494191242180518, "grad_norm": 1.1036434173583984, "learning_rate": 4.915486069861264e-05, "loss": 0.6205, "num_input_tokens_seen": 13611472, "step": 23460 }, { "epoch": 3.4949359547214773, "grad_norm": 1.3062201738357544, "learning_rate": 4.915402274654202e-05, "loss": 0.74, "num_input_tokens_seen": 13614608, "step": 23465 }, { "epoch": 3.4956806672624365, "grad_norm": 2.0380969047546387, "learning_rate": 4.915318438641374e-05, "loss": 0.6561, "num_input_tokens_seen": 13617520, "step": 23470 }, { "epoch": 3.4964253798033957, "grad_norm": 1.0388485193252563, "learning_rate": 4.915234561824196e-05, "loss": 0.7467, "num_input_tokens_seen": 13620784, "step": 23475 }, { "epoch": 3.497170092344355, "grad_norm": 0.8803352117538452, "learning_rate": 4.915150644204084e-05, "loss": 0.7678, "num_input_tokens_seen": 13623696, "step": 23480 }, { "epoch": 3.497914804885314, "grad_norm": 1.21907377243042, "learning_rate": 4.915066685782457e-05, "loss": 0.6493, "num_input_tokens_seen": 13626640, "step": 23485 }, { "epoch": 3.4986595174262733, "grad_norm": 1.180420994758606, "learning_rate": 4.914982686560733e-05, "loss": 0.5702, "num_input_tokens_seen": 13629584, "step": 23490 }, { "epoch": 3.4994042299672325, "grad_norm": 1.3456614017486572, "learning_rate": 4.914898646540331e-05, "loss": 0.7141, "num_input_tokens_seen": 13632560, "step": 23495 }, { "epoch": 3.5001489425081918, "grad_norm": 0.9508736729621887, "learning_rate": 4.914814565722671e-05, "loss": 0.6539, "num_input_tokens_seen": 13635376, "step": 23500 }, { "epoch": 3.500893655049151, "grad_norm": 0.6436952948570251, "learning_rate": 4.914730444109173e-05, "loss": 0.7655, "num_input_tokens_seen": 13638128, "step": 23505 }, { "epoch": 3.50163836759011, "grad_norm": 1.0449858903884888, "learning_rate": 4.9146462817012586e-05, "loss": 0.5992, "num_input_tokens_seen": 13640848, "step": 23510 }, { "epoch": 3.5023830801310694, "grad_norm": 1.161637306213379, "learning_rate": 4.9145620785003485e-05, "loss": 0.8927, "num_input_tokens_seen": 13643888, "step": 23515 }, { "epoch": 3.5031277926720286, "grad_norm": 0.9956120252609253, "learning_rate": 4.9144778345078665e-05, "loss": 0.7244, "num_input_tokens_seen": 13646864, "step": 23520 }, { "epoch": 3.5038725052129878, "grad_norm": 1.200086236000061, "learning_rate": 4.914393549725236e-05, "loss": 0.7648, "num_input_tokens_seen": 13649872, "step": 23525 }, { "epoch": 3.504617217753947, "grad_norm": 0.9242076277732849, "learning_rate": 4.91430922415388e-05, "loss": 0.579, "num_input_tokens_seen": 13652560, "step": 23530 }, { "epoch": 3.505361930294906, "grad_norm": 1.891145944595337, "learning_rate": 4.914224857795224e-05, "loss": 0.8282, "num_input_tokens_seen": 13656048, "step": 23535 }, { "epoch": 3.5061066428358654, "grad_norm": 1.1873526573181152, "learning_rate": 4.914140450650692e-05, "loss": 0.7098, "num_input_tokens_seen": 13658800, "step": 23540 }, { "epoch": 3.5068513553768246, "grad_norm": 2.9349260330200195, "learning_rate": 4.9140560027217106e-05, "loss": 0.6803, "num_input_tokens_seen": 13661584, "step": 23545 }, { "epoch": 3.5075960679177838, "grad_norm": 0.8135993480682373, "learning_rate": 4.9139715140097075e-05, "loss": 0.4575, "num_input_tokens_seen": 13664464, "step": 23550 }, { "epoch": 3.508340780458743, "grad_norm": 0.6642447113990784, "learning_rate": 4.9138869845161086e-05, "loss": 0.5783, "num_input_tokens_seen": 13667312, "step": 23555 }, { "epoch": 3.509085492999702, "grad_norm": 1.1793392896652222, "learning_rate": 4.913802414242342e-05, "loss": 0.628, "num_input_tokens_seen": 13670224, "step": 23560 }, { "epoch": 3.5098302055406614, "grad_norm": 0.6176695227622986, "learning_rate": 4.913717803189838e-05, "loss": 0.5259, "num_input_tokens_seen": 13673008, "step": 23565 }, { "epoch": 3.5105749180816206, "grad_norm": 1.1732382774353027, "learning_rate": 4.913633151360024e-05, "loss": 0.6417, "num_input_tokens_seen": 13676048, "step": 23570 }, { "epoch": 3.51131963062258, "grad_norm": 1.7096449136734009, "learning_rate": 4.913548458754331e-05, "loss": 0.7871, "num_input_tokens_seen": 13678960, "step": 23575 }, { "epoch": 3.512064343163539, "grad_norm": 0.9681121110916138, "learning_rate": 4.91346372537419e-05, "loss": 0.6105, "num_input_tokens_seen": 13682160, "step": 23580 }, { "epoch": 3.512809055704498, "grad_norm": 0.776328444480896, "learning_rate": 4.913378951221033e-05, "loss": 0.7857, "num_input_tokens_seen": 13685264, "step": 23585 }, { "epoch": 3.5135537682454574, "grad_norm": 0.8377776145935059, "learning_rate": 4.9132941362962905e-05, "loss": 0.7795, "num_input_tokens_seen": 13688176, "step": 23590 }, { "epoch": 3.5142984807864166, "grad_norm": 0.9906920194625854, "learning_rate": 4.913209280601396e-05, "loss": 0.5598, "num_input_tokens_seen": 13691088, "step": 23595 }, { "epoch": 3.515043193327376, "grad_norm": 0.9563015103340149, "learning_rate": 4.913124384137784e-05, "loss": 0.652, "num_input_tokens_seen": 13693936, "step": 23600 }, { "epoch": 3.5157879058683346, "grad_norm": 0.6865262389183044, "learning_rate": 4.9130394469068886e-05, "loss": 0.6669, "num_input_tokens_seen": 13697008, "step": 23605 }, { "epoch": 3.516532618409294, "grad_norm": 0.68526291847229, "learning_rate": 4.9129544689101437e-05, "loss": 0.6282, "num_input_tokens_seen": 13699984, "step": 23610 }, { "epoch": 3.517277330950253, "grad_norm": 0.8655549883842468, "learning_rate": 4.912869450148986e-05, "loss": 0.5919, "num_input_tokens_seen": 13702768, "step": 23615 }, { "epoch": 3.5180220434912126, "grad_norm": 1.0724411010742188, "learning_rate": 4.9127843906248504e-05, "loss": 0.7941, "num_input_tokens_seen": 13705744, "step": 23620 }, { "epoch": 3.5187667560321714, "grad_norm": 0.6898152232170105, "learning_rate": 4.912699290339175e-05, "loss": 0.5612, "num_input_tokens_seen": 13708464, "step": 23625 }, { "epoch": 3.519511468573131, "grad_norm": 0.8235632181167603, "learning_rate": 4.912614149293398e-05, "loss": 0.6416, "num_input_tokens_seen": 13711536, "step": 23630 }, { "epoch": 3.5202561811140898, "grad_norm": 0.7098702788352966, "learning_rate": 4.9125289674889566e-05, "loss": 0.4646, "num_input_tokens_seen": 13714512, "step": 23635 }, { "epoch": 3.5210008936550494, "grad_norm": 1.211647868156433, "learning_rate": 4.91244374492729e-05, "loss": 0.9808, "num_input_tokens_seen": 13717360, "step": 23640 }, { "epoch": 3.521745606196008, "grad_norm": 1.2723743915557861, "learning_rate": 4.912358481609838e-05, "loss": 0.7325, "num_input_tokens_seen": 13720432, "step": 23645 }, { "epoch": 3.5224903187369674, "grad_norm": 1.0374394655227661, "learning_rate": 4.912273177538041e-05, "loss": 0.6955, "num_input_tokens_seen": 13723536, "step": 23650 }, { "epoch": 3.5232350312779266, "grad_norm": 0.6717837452888489, "learning_rate": 4.912187832713342e-05, "loss": 0.6818, "num_input_tokens_seen": 13726320, "step": 23655 }, { "epoch": 3.523979743818886, "grad_norm": 1.422906517982483, "learning_rate": 4.91210244713718e-05, "loss": 0.6441, "num_input_tokens_seen": 13728944, "step": 23660 }, { "epoch": 3.524724456359845, "grad_norm": 1.165557622909546, "learning_rate": 4.912017020810999e-05, "loss": 0.7068, "num_input_tokens_seen": 13731888, "step": 23665 }, { "epoch": 3.525469168900804, "grad_norm": 1.0165079832077026, "learning_rate": 4.911931553736242e-05, "loss": 0.6127, "num_input_tokens_seen": 13735024, "step": 23670 }, { "epoch": 3.5262138814417634, "grad_norm": 0.46001508831977844, "learning_rate": 4.9118460459143524e-05, "loss": 0.7283, "num_input_tokens_seen": 13737840, "step": 23675 }, { "epoch": 3.5269585939827226, "grad_norm": 0.8455125093460083, "learning_rate": 4.9117604973467756e-05, "loss": 0.6086, "num_input_tokens_seen": 13740368, "step": 23680 }, { "epoch": 3.527703306523682, "grad_norm": 1.0307295322418213, "learning_rate": 4.9116749080349556e-05, "loss": 0.6398, "num_input_tokens_seen": 13743536, "step": 23685 }, { "epoch": 3.528448019064641, "grad_norm": 1.057430624961853, "learning_rate": 4.911589277980339e-05, "loss": 0.6776, "num_input_tokens_seen": 13746448, "step": 23690 }, { "epoch": 3.5291927316056, "grad_norm": 1.030432105064392, "learning_rate": 4.911503607184375e-05, "loss": 0.6806, "num_input_tokens_seen": 13749232, "step": 23695 }, { "epoch": 3.5299374441465594, "grad_norm": 1.0467523336410522, "learning_rate": 4.911417895648506e-05, "loss": 0.6559, "num_input_tokens_seen": 13752048, "step": 23700 }, { "epoch": 3.5306821566875186, "grad_norm": 0.6152336597442627, "learning_rate": 4.9113321433741835e-05, "loss": 0.7842, "num_input_tokens_seen": 13754832, "step": 23705 }, { "epoch": 3.531426869228478, "grad_norm": 0.8937909007072449, "learning_rate": 4.9112463503628545e-05, "loss": 0.6124, "num_input_tokens_seen": 13757744, "step": 23710 }, { "epoch": 3.532171581769437, "grad_norm": 0.8919682502746582, "learning_rate": 4.91116051661597e-05, "loss": 0.5968, "num_input_tokens_seen": 13760976, "step": 23715 }, { "epoch": 3.532916294310396, "grad_norm": 0.9209782481193542, "learning_rate": 4.911074642134979e-05, "loss": 0.7308, "num_input_tokens_seen": 13763920, "step": 23720 }, { "epoch": 3.5336610068513554, "grad_norm": 1.9580495357513428, "learning_rate": 4.9109887269213315e-05, "loss": 0.8286, "num_input_tokens_seen": 13766448, "step": 23725 }, { "epoch": 3.5344057193923146, "grad_norm": 1.5499849319458008, "learning_rate": 4.910902770976481e-05, "loss": 0.7557, "num_input_tokens_seen": 13769808, "step": 23730 }, { "epoch": 3.535150431933274, "grad_norm": 1.0457093715667725, "learning_rate": 4.910816774301878e-05, "loss": 0.6486, "num_input_tokens_seen": 13772752, "step": 23735 }, { "epoch": 3.535895144474233, "grad_norm": 1.2999619245529175, "learning_rate": 4.910730736898976e-05, "loss": 0.8381, "num_input_tokens_seen": 13775728, "step": 23740 }, { "epoch": 3.5366398570151922, "grad_norm": 1.3706350326538086, "learning_rate": 4.9106446587692276e-05, "loss": 0.6083, "num_input_tokens_seen": 13778992, "step": 23745 }, { "epoch": 3.5373845695561514, "grad_norm": 1.0850993394851685, "learning_rate": 4.910558539914088e-05, "loss": 0.8737, "num_input_tokens_seen": 13782128, "step": 23750 }, { "epoch": 3.5381292820971106, "grad_norm": 0.8469459414482117, "learning_rate": 4.910472380335013e-05, "loss": 0.5688, "num_input_tokens_seen": 13784976, "step": 23755 }, { "epoch": 3.53887399463807, "grad_norm": 1.1904217004776, "learning_rate": 4.9103861800334567e-05, "loss": 0.8332, "num_input_tokens_seen": 13787824, "step": 23760 }, { "epoch": 3.539618707179029, "grad_norm": 0.8241473436355591, "learning_rate": 4.9102999390108753e-05, "loss": 0.6513, "num_input_tokens_seen": 13790704, "step": 23765 }, { "epoch": 3.5403634197199882, "grad_norm": 0.6867820024490356, "learning_rate": 4.910213657268726e-05, "loss": 0.5412, "num_input_tokens_seen": 13793744, "step": 23770 }, { "epoch": 3.5411081322609474, "grad_norm": 0.6993001699447632, "learning_rate": 4.910127334808466e-05, "loss": 0.6932, "num_input_tokens_seen": 13796688, "step": 23775 }, { "epoch": 3.541852844801906, "grad_norm": 0.6578481197357178, "learning_rate": 4.9100409716315556e-05, "loss": 0.6391, "num_input_tokens_seen": 13799664, "step": 23780 }, { "epoch": 3.542597557342866, "grad_norm": 1.6197820901870728, "learning_rate": 4.909954567739452e-05, "loss": 0.6146, "num_input_tokens_seen": 13802352, "step": 23785 }, { "epoch": 3.5433422698838246, "grad_norm": 1.2926127910614014, "learning_rate": 4.909868123133615e-05, "loss": 0.6468, "num_input_tokens_seen": 13804976, "step": 23790 }, { "epoch": 3.5440869824247843, "grad_norm": 1.114148497581482, "learning_rate": 4.909781637815506e-05, "loss": 0.7279, "num_input_tokens_seen": 13808048, "step": 23795 }, { "epoch": 3.544831694965743, "grad_norm": 0.8564876317977905, "learning_rate": 4.909695111786584e-05, "loss": 0.6376, "num_input_tokens_seen": 13810928, "step": 23800 }, { "epoch": 3.5455764075067027, "grad_norm": 1.140711784362793, "learning_rate": 4.9096085450483134e-05, "loss": 0.6711, "num_input_tokens_seen": 13813808, "step": 23805 }, { "epoch": 3.5463211200476614, "grad_norm": 1.0100550651550293, "learning_rate": 4.909521937602155e-05, "loss": 0.6033, "num_input_tokens_seen": 13816720, "step": 23810 }, { "epoch": 3.5470658325886206, "grad_norm": 1.8620644807815552, "learning_rate": 4.909435289449573e-05, "loss": 0.7768, "num_input_tokens_seen": 13819504, "step": 23815 }, { "epoch": 3.54781054512958, "grad_norm": 0.949131190776825, "learning_rate": 4.90934860059203e-05, "loss": 0.6824, "num_input_tokens_seen": 13822672, "step": 23820 }, { "epoch": 3.548555257670539, "grad_norm": 1.1875109672546387, "learning_rate": 4.909261871030991e-05, "loss": 0.7317, "num_input_tokens_seen": 13825520, "step": 23825 }, { "epoch": 3.5492999702114982, "grad_norm": 0.9751918911933899, "learning_rate": 4.9091751007679224e-05, "loss": 0.7576, "num_input_tokens_seen": 13828432, "step": 23830 }, { "epoch": 3.5500446827524574, "grad_norm": 1.0867801904678345, "learning_rate": 4.9090882898042876e-05, "loss": 0.5559, "num_input_tokens_seen": 13831696, "step": 23835 }, { "epoch": 3.5507893952934166, "grad_norm": 1.8129321336746216, "learning_rate": 4.909001438141556e-05, "loss": 0.6892, "num_input_tokens_seen": 13834640, "step": 23840 }, { "epoch": 3.551534107834376, "grad_norm": 0.8100690245628357, "learning_rate": 4.908914545781192e-05, "loss": 0.6763, "num_input_tokens_seen": 13837808, "step": 23845 }, { "epoch": 3.552278820375335, "grad_norm": 1.0119112730026245, "learning_rate": 4.9088276127246666e-05, "loss": 0.5631, "num_input_tokens_seen": 13840816, "step": 23850 }, { "epoch": 3.5530235329162942, "grad_norm": 1.0588678121566772, "learning_rate": 4.9087406389734465e-05, "loss": 0.6374, "num_input_tokens_seen": 13843632, "step": 23855 }, { "epoch": 3.5537682454572534, "grad_norm": 1.0867652893066406, "learning_rate": 4.908653624529001e-05, "loss": 0.8143, "num_input_tokens_seen": 13846640, "step": 23860 }, { "epoch": 3.5545129579982127, "grad_norm": 0.7405241131782532, "learning_rate": 4.908566569392801e-05, "loss": 0.5332, "num_input_tokens_seen": 13849424, "step": 23865 }, { "epoch": 3.555257670539172, "grad_norm": 1.019147276878357, "learning_rate": 4.908479473566316e-05, "loss": 0.6268, "num_input_tokens_seen": 13852176, "step": 23870 }, { "epoch": 3.556002383080131, "grad_norm": 0.8443236351013184, "learning_rate": 4.9083923370510184e-05, "loss": 0.6802, "num_input_tokens_seen": 13855216, "step": 23875 }, { "epoch": 3.5567470956210903, "grad_norm": 0.8268356919288635, "learning_rate": 4.908305159848381e-05, "loss": 0.6102, "num_input_tokens_seen": 13858096, "step": 23880 }, { "epoch": 3.5574918081620495, "grad_norm": 0.7938128113746643, "learning_rate": 4.908217941959875e-05, "loss": 0.5413, "num_input_tokens_seen": 13861328, "step": 23885 }, { "epoch": 3.5582365207030087, "grad_norm": 0.965840220451355, "learning_rate": 4.908130683386974e-05, "loss": 0.7728, "num_input_tokens_seen": 13864240, "step": 23890 }, { "epoch": 3.558981233243968, "grad_norm": 0.7203136086463928, "learning_rate": 4.9080433841311526e-05, "loss": 0.6618, "num_input_tokens_seen": 13866992, "step": 23895 }, { "epoch": 3.559725945784927, "grad_norm": 0.744190514087677, "learning_rate": 4.9079560441938865e-05, "loss": 0.7189, "num_input_tokens_seen": 13869712, "step": 23900 }, { "epoch": 3.5604706583258863, "grad_norm": 1.1877285242080688, "learning_rate": 4.90786866357665e-05, "loss": 0.7707, "num_input_tokens_seen": 13872528, "step": 23905 }, { "epoch": 3.5612153708668455, "grad_norm": 1.2143163681030273, "learning_rate": 4.90778124228092e-05, "loss": 0.6831, "num_input_tokens_seen": 13875184, "step": 23910 }, { "epoch": 3.5619600834078047, "grad_norm": 1.057498574256897, "learning_rate": 4.907693780308172e-05, "loss": 0.7126, "num_input_tokens_seen": 13878096, "step": 23915 }, { "epoch": 3.562704795948764, "grad_norm": 0.8058739304542542, "learning_rate": 4.907606277659885e-05, "loss": 0.696, "num_input_tokens_seen": 13881008, "step": 23920 }, { "epoch": 3.563449508489723, "grad_norm": 1.661925196647644, "learning_rate": 4.907518734337538e-05, "loss": 0.7013, "num_input_tokens_seen": 13883824, "step": 23925 }, { "epoch": 3.5641942210306823, "grad_norm": 1.8034673929214478, "learning_rate": 4.907431150342608e-05, "loss": 0.8355, "num_input_tokens_seen": 13886384, "step": 23930 }, { "epoch": 3.5649389335716415, "grad_norm": 0.9187120795249939, "learning_rate": 4.907343525676575e-05, "loss": 0.6672, "num_input_tokens_seen": 13889360, "step": 23935 }, { "epoch": 3.5656836461126007, "grad_norm": 1.6738066673278809, "learning_rate": 4.9072558603409216e-05, "loss": 0.7375, "num_input_tokens_seen": 13892592, "step": 23940 }, { "epoch": 3.5664283586535594, "grad_norm": 0.6203508973121643, "learning_rate": 4.907168154337125e-05, "loss": 0.5855, "num_input_tokens_seen": 13895632, "step": 23945 }, { "epoch": 3.567173071194519, "grad_norm": 1.1817169189453125, "learning_rate": 4.90708040766667e-05, "loss": 0.6102, "num_input_tokens_seen": 13898800, "step": 23950 }, { "epoch": 3.567917783735478, "grad_norm": 1.8680170774459839, "learning_rate": 4.906992620331038e-05, "loss": 0.726, "num_input_tokens_seen": 13901680, "step": 23955 }, { "epoch": 3.5686624962764375, "grad_norm": 1.3825454711914062, "learning_rate": 4.906904792331712e-05, "loss": 0.6766, "num_input_tokens_seen": 13904464, "step": 23960 }, { "epoch": 3.5694072088173963, "grad_norm": 0.9366575479507446, "learning_rate": 4.906816923670176e-05, "loss": 0.6493, "num_input_tokens_seen": 13907280, "step": 23965 }, { "epoch": 3.570151921358356, "grad_norm": 1.126386284828186, "learning_rate": 4.906729014347914e-05, "loss": 0.6293, "num_input_tokens_seen": 13910064, "step": 23970 }, { "epoch": 3.5708966338993147, "grad_norm": 1.0882951021194458, "learning_rate": 4.9066410643664113e-05, "loss": 0.7933, "num_input_tokens_seen": 13913072, "step": 23975 }, { "epoch": 3.5716413464402743, "grad_norm": 0.8289983868598938, "learning_rate": 4.906553073727154e-05, "loss": 0.6527, "num_input_tokens_seen": 13915824, "step": 23980 }, { "epoch": 3.572386058981233, "grad_norm": 0.6803069114685059, "learning_rate": 4.9064650424316284e-05, "loss": 0.6867, "num_input_tokens_seen": 13919152, "step": 23985 }, { "epoch": 3.5731307715221923, "grad_norm": 1.0289605855941772, "learning_rate": 4.906376970481321e-05, "loss": 0.7269, "num_input_tokens_seen": 13921840, "step": 23990 }, { "epoch": 3.5738754840631515, "grad_norm": 0.9937456250190735, "learning_rate": 4.9062888578777214e-05, "loss": 0.6504, "num_input_tokens_seen": 13924496, "step": 23995 }, { "epoch": 3.5746201966041107, "grad_norm": 0.7619805932044983, "learning_rate": 4.906200704622317e-05, "loss": 0.6194, "num_input_tokens_seen": 13927408, "step": 24000 }, { "epoch": 3.57536490914507, "grad_norm": 1.109909176826477, "learning_rate": 4.906112510716597e-05, "loss": 0.6385, "num_input_tokens_seen": 13930224, "step": 24005 }, { "epoch": 3.576109621686029, "grad_norm": 0.6727430820465088, "learning_rate": 4.906024276162052e-05, "loss": 0.6272, "num_input_tokens_seen": 13933072, "step": 24010 }, { "epoch": 3.5768543342269883, "grad_norm": 0.7564928531646729, "learning_rate": 4.905936000960172e-05, "loss": 0.6129, "num_input_tokens_seen": 13936208, "step": 24015 }, { "epoch": 3.5775990467679475, "grad_norm": 1.5620869398117065, "learning_rate": 4.905847685112448e-05, "loss": 0.7028, "num_input_tokens_seen": 13939216, "step": 24020 }, { "epoch": 3.5783437593089067, "grad_norm": 1.055530071258545, "learning_rate": 4.905759328620373e-05, "loss": 0.8329, "num_input_tokens_seen": 13942384, "step": 24025 }, { "epoch": 3.579088471849866, "grad_norm": 1.019470453262329, "learning_rate": 4.90567093148544e-05, "loss": 0.6822, "num_input_tokens_seen": 13945136, "step": 24030 }, { "epoch": 3.579833184390825, "grad_norm": 0.7011270523071289, "learning_rate": 4.9055824937091406e-05, "loss": 0.7127, "num_input_tokens_seen": 13947728, "step": 24035 }, { "epoch": 3.5805778969317843, "grad_norm": 0.8443698883056641, "learning_rate": 4.9054940152929704e-05, "loss": 0.7254, "num_input_tokens_seen": 13950672, "step": 24040 }, { "epoch": 3.5813226094727435, "grad_norm": 0.923148512840271, "learning_rate": 4.9054054962384235e-05, "loss": 0.5753, "num_input_tokens_seen": 13953648, "step": 24045 }, { "epoch": 3.5820673220137027, "grad_norm": 0.7439378499984741, "learning_rate": 4.905316936546995e-05, "loss": 0.6113, "num_input_tokens_seen": 13956560, "step": 24050 }, { "epoch": 3.582812034554662, "grad_norm": 0.7631959319114685, "learning_rate": 4.9052283362201823e-05, "loss": 0.6719, "num_input_tokens_seen": 13959504, "step": 24055 }, { "epoch": 3.583556747095621, "grad_norm": 1.8597095012664795, "learning_rate": 4.9051396952594806e-05, "loss": 0.5851, "num_input_tokens_seen": 13962384, "step": 24060 }, { "epoch": 3.5843014596365803, "grad_norm": 0.9662265181541443, "learning_rate": 4.905051013666389e-05, "loss": 0.6797, "num_input_tokens_seen": 13965200, "step": 24065 }, { "epoch": 3.5850461721775395, "grad_norm": 0.8549129366874695, "learning_rate": 4.904962291442404e-05, "loss": 0.7301, "num_input_tokens_seen": 13967952, "step": 24070 }, { "epoch": 3.5857908847184987, "grad_norm": 1.0781782865524292, "learning_rate": 4.904873528589027e-05, "loss": 0.6488, "num_input_tokens_seen": 13970928, "step": 24075 }, { "epoch": 3.586535597259458, "grad_norm": 0.9497019648551941, "learning_rate": 4.9047847251077544e-05, "loss": 0.6434, "num_input_tokens_seen": 13973872, "step": 24080 }, { "epoch": 3.587280309800417, "grad_norm": 2.014631748199463, "learning_rate": 4.904695881000089e-05, "loss": 0.7036, "num_input_tokens_seen": 13976592, "step": 24085 }, { "epoch": 3.5880250223413763, "grad_norm": 1.0084490776062012, "learning_rate": 4.90460699626753e-05, "loss": 0.6913, "num_input_tokens_seen": 13979472, "step": 24090 }, { "epoch": 3.5887697348823355, "grad_norm": 1.3088754415512085, "learning_rate": 4.90451807091158e-05, "loss": 0.7726, "num_input_tokens_seen": 13982288, "step": 24095 }, { "epoch": 3.5895144474232947, "grad_norm": 1.0954151153564453, "learning_rate": 4.904429104933741e-05, "loss": 0.5225, "num_input_tokens_seen": 13985136, "step": 24100 }, { "epoch": 3.590259159964254, "grad_norm": 0.9396993517875671, "learning_rate": 4.904340098335516e-05, "loss": 0.9432, "num_input_tokens_seen": 13988016, "step": 24105 }, { "epoch": 3.591003872505213, "grad_norm": 1.2422703504562378, "learning_rate": 4.904251051118408e-05, "loss": 0.8636, "num_input_tokens_seen": 13991120, "step": 24110 }, { "epoch": 3.5917485850461723, "grad_norm": 1.0376317501068115, "learning_rate": 4.904161963283923e-05, "loss": 0.5915, "num_input_tokens_seen": 13993968, "step": 24115 }, { "epoch": 3.592493297587131, "grad_norm": 1.8094185590744019, "learning_rate": 4.9040728348335655e-05, "loss": 0.7633, "num_input_tokens_seen": 13996752, "step": 24120 }, { "epoch": 3.5932380101280907, "grad_norm": 0.6609119176864624, "learning_rate": 4.90398366576884e-05, "loss": 0.5192, "num_input_tokens_seen": 13999696, "step": 24125 }, { "epoch": 3.5939827226690495, "grad_norm": 0.732974648475647, "learning_rate": 4.903894456091254e-05, "loss": 0.6723, "num_input_tokens_seen": 14002832, "step": 24130 }, { "epoch": 3.594727435210009, "grad_norm": 0.9114761352539062, "learning_rate": 4.903805205802314e-05, "loss": 0.6561, "num_input_tokens_seen": 14005712, "step": 24135 }, { "epoch": 3.595472147750968, "grad_norm": 0.5603374242782593, "learning_rate": 4.903715914903529e-05, "loss": 0.7253, "num_input_tokens_seen": 14009040, "step": 24140 }, { "epoch": 3.5962168602919276, "grad_norm": 1.0841175317764282, "learning_rate": 4.9036265833964057e-05, "loss": 0.7427, "num_input_tokens_seen": 14011600, "step": 24145 }, { "epoch": 3.5969615728328863, "grad_norm": 0.7855088710784912, "learning_rate": 4.903537211282455e-05, "loss": 0.6675, "num_input_tokens_seen": 14014416, "step": 24150 }, { "epoch": 3.597706285373846, "grad_norm": 1.0257692337036133, "learning_rate": 4.9034477985631854e-05, "loss": 0.6637, "num_input_tokens_seen": 14017104, "step": 24155 }, { "epoch": 3.5984509979148047, "grad_norm": 0.8605470657348633, "learning_rate": 4.903358345240109e-05, "loss": 0.576, "num_input_tokens_seen": 14019856, "step": 24160 }, { "epoch": 3.599195710455764, "grad_norm": 0.9056673049926758, "learning_rate": 4.903268851314735e-05, "loss": 0.7211, "num_input_tokens_seen": 14022608, "step": 24165 }, { "epoch": 3.599940422996723, "grad_norm": 0.8931896090507507, "learning_rate": 4.903179316788577e-05, "loss": 0.677, "num_input_tokens_seen": 14025392, "step": 24170 }, { "epoch": 3.6006851355376823, "grad_norm": 1.688758373260498, "learning_rate": 4.903089741663146e-05, "loss": 0.7765, "num_input_tokens_seen": 14028208, "step": 24175 }, { "epoch": 3.6014298480786415, "grad_norm": 0.934891402721405, "learning_rate": 4.9030001259399563e-05, "loss": 0.818, "num_input_tokens_seen": 14031024, "step": 24180 }, { "epoch": 3.6021745606196007, "grad_norm": 0.9534233212471008, "learning_rate": 4.9029104696205225e-05, "loss": 0.6024, "num_input_tokens_seen": 14033936, "step": 24185 }, { "epoch": 3.60291927316056, "grad_norm": 1.264015793800354, "learning_rate": 4.9028207727063576e-05, "loss": 0.7368, "num_input_tokens_seen": 14036976, "step": 24190 }, { "epoch": 3.603663985701519, "grad_norm": 0.9171027541160583, "learning_rate": 4.902731035198979e-05, "loss": 0.7119, "num_input_tokens_seen": 14039920, "step": 24195 }, { "epoch": 3.6044086982424783, "grad_norm": 0.8776971697807312, "learning_rate": 4.902641257099901e-05, "loss": 0.7665, "num_input_tokens_seen": 14042896, "step": 24200 }, { "epoch": 3.6051534107834375, "grad_norm": 0.7032585740089417, "learning_rate": 4.9025514384106414e-05, "loss": 0.5719, "num_input_tokens_seen": 14045616, "step": 24205 }, { "epoch": 3.6058981233243967, "grad_norm": 0.7810484170913696, "learning_rate": 4.902461579132717e-05, "loss": 0.5772, "num_input_tokens_seen": 14048592, "step": 24210 }, { "epoch": 3.606642835865356, "grad_norm": 0.7244249582290649, "learning_rate": 4.902371679267646e-05, "loss": 0.6576, "num_input_tokens_seen": 14051568, "step": 24215 }, { "epoch": 3.607387548406315, "grad_norm": 1.2368348836898804, "learning_rate": 4.9022817388169464e-05, "loss": 0.6117, "num_input_tokens_seen": 14054448, "step": 24220 }, { "epoch": 3.6081322609472744, "grad_norm": 1.2188447713851929, "learning_rate": 4.9021917577821386e-05, "loss": 0.7393, "num_input_tokens_seen": 14057488, "step": 24225 }, { "epoch": 3.6088769734882336, "grad_norm": 0.9930869340896606, "learning_rate": 4.902101736164742e-05, "loss": 0.5188, "num_input_tokens_seen": 14060400, "step": 24230 }, { "epoch": 3.6096216860291928, "grad_norm": 1.1840139627456665, "learning_rate": 4.902011673966279e-05, "loss": 0.839, "num_input_tokens_seen": 14063152, "step": 24235 }, { "epoch": 3.610366398570152, "grad_norm": 0.9108840823173523, "learning_rate": 4.90192157118827e-05, "loss": 0.5917, "num_input_tokens_seen": 14066288, "step": 24240 }, { "epoch": 3.611111111111111, "grad_norm": 1.3727706670761108, "learning_rate": 4.901831427832237e-05, "loss": 0.7633, "num_input_tokens_seen": 14069168, "step": 24245 }, { "epoch": 3.6118558236520704, "grad_norm": 1.3294966220855713, "learning_rate": 4.9017412438997026e-05, "loss": 0.7793, "num_input_tokens_seen": 14072176, "step": 24250 }, { "epoch": 3.6126005361930296, "grad_norm": 1.6180381774902344, "learning_rate": 4.901651019392191e-05, "loss": 0.704, "num_input_tokens_seen": 14075056, "step": 24255 }, { "epoch": 3.6133452487339888, "grad_norm": 1.919084072113037, "learning_rate": 4.901560754311227e-05, "loss": 0.7493, "num_input_tokens_seen": 14078224, "step": 24260 }, { "epoch": 3.614089961274948, "grad_norm": 0.8087077140808105, "learning_rate": 4.901470448658335e-05, "loss": 0.6665, "num_input_tokens_seen": 14081360, "step": 24265 }, { "epoch": 3.614834673815907, "grad_norm": 0.9701362252235413, "learning_rate": 4.9013801024350406e-05, "loss": 0.7199, "num_input_tokens_seen": 14084336, "step": 24270 }, { "epoch": 3.6155793863568664, "grad_norm": 0.809296727180481, "learning_rate": 4.9012897156428694e-05, "loss": 0.603, "num_input_tokens_seen": 14087312, "step": 24275 }, { "epoch": 3.6163240988978256, "grad_norm": 0.9642226099967957, "learning_rate": 4.901199288283349e-05, "loss": 0.7016, "num_input_tokens_seen": 14090128, "step": 24280 }, { "epoch": 3.617068811438785, "grad_norm": 0.9399383068084717, "learning_rate": 4.901108820358008e-05, "loss": 0.7472, "num_input_tokens_seen": 14092656, "step": 24285 }, { "epoch": 3.617813523979744, "grad_norm": 2.442458152770996, "learning_rate": 4.901018311868373e-05, "loss": 0.7214, "num_input_tokens_seen": 14095664, "step": 24290 }, { "epoch": 3.6185582365207027, "grad_norm": 1.276788592338562, "learning_rate": 4.9009277628159744e-05, "loss": 0.5955, "num_input_tokens_seen": 14098800, "step": 24295 }, { "epoch": 3.6193029490616624, "grad_norm": 1.0871130228042603, "learning_rate": 4.900837173202341e-05, "loss": 0.6463, "num_input_tokens_seen": 14101584, "step": 24300 }, { "epoch": 3.620047661602621, "grad_norm": 0.9733688831329346, "learning_rate": 4.900746543029003e-05, "loss": 0.5823, "num_input_tokens_seen": 14104528, "step": 24305 }, { "epoch": 3.620792374143581, "grad_norm": 1.009658694267273, "learning_rate": 4.900655872297494e-05, "loss": 0.6212, "num_input_tokens_seen": 14107184, "step": 24310 }, { "epoch": 3.6215370866845396, "grad_norm": 1.3706121444702148, "learning_rate": 4.900565161009343e-05, "loss": 0.8881, "num_input_tokens_seen": 14110128, "step": 24315 }, { "epoch": 3.622281799225499, "grad_norm": 0.8751162886619568, "learning_rate": 4.9004744091660826e-05, "loss": 0.6542, "num_input_tokens_seen": 14112688, "step": 24320 }, { "epoch": 3.623026511766458, "grad_norm": 1.9556621313095093, "learning_rate": 4.900383616769247e-05, "loss": 0.7199, "num_input_tokens_seen": 14115536, "step": 24325 }, { "epoch": 3.6237712243074176, "grad_norm": 1.2951865196228027, "learning_rate": 4.900292783820371e-05, "loss": 0.7633, "num_input_tokens_seen": 14118256, "step": 24330 }, { "epoch": 3.6245159368483764, "grad_norm": 0.8881608247756958, "learning_rate": 4.9002019103209875e-05, "loss": 0.6995, "num_input_tokens_seen": 14121200, "step": 24335 }, { "epoch": 3.6252606493893356, "grad_norm": 1.6181221008300781, "learning_rate": 4.9001109962726323e-05, "loss": 0.6755, "num_input_tokens_seen": 14124144, "step": 24340 }, { "epoch": 3.6260053619302948, "grad_norm": 0.6802491545677185, "learning_rate": 4.9000200416768405e-05, "loss": 0.6889, "num_input_tokens_seen": 14127152, "step": 24345 }, { "epoch": 3.626750074471254, "grad_norm": 1.0675691366195679, "learning_rate": 4.89992904653515e-05, "loss": 0.6399, "num_input_tokens_seen": 14129872, "step": 24350 }, { "epoch": 3.627494787012213, "grad_norm": 1.8535106182098389, "learning_rate": 4.899838010849097e-05, "loss": 0.7092, "num_input_tokens_seen": 14132752, "step": 24355 }, { "epoch": 3.6282394995531724, "grad_norm": 0.7270225286483765, "learning_rate": 4.89974693462022e-05, "loss": 0.5392, "num_input_tokens_seen": 14135696, "step": 24360 }, { "epoch": 3.6289842120941316, "grad_norm": 0.6765105724334717, "learning_rate": 4.899655817850058e-05, "loss": 0.638, "num_input_tokens_seen": 14138512, "step": 24365 }, { "epoch": 3.629728924635091, "grad_norm": 0.7963419556617737, "learning_rate": 4.899564660540149e-05, "loss": 0.6878, "num_input_tokens_seen": 14141104, "step": 24370 }, { "epoch": 3.63047363717605, "grad_norm": 0.8912367820739746, "learning_rate": 4.899473462692035e-05, "loss": 0.7008, "num_input_tokens_seen": 14143952, "step": 24375 }, { "epoch": 3.631218349717009, "grad_norm": 1.075903058052063, "learning_rate": 4.899382224307255e-05, "loss": 0.8359, "num_input_tokens_seen": 14146576, "step": 24380 }, { "epoch": 3.6319630622579684, "grad_norm": 0.8870188593864441, "learning_rate": 4.8992909453873505e-05, "loss": 0.7305, "num_input_tokens_seen": 14149392, "step": 24385 }, { "epoch": 3.6327077747989276, "grad_norm": 1.2199615240097046, "learning_rate": 4.899199625933865e-05, "loss": 0.5867, "num_input_tokens_seen": 14152496, "step": 24390 }, { "epoch": 3.633452487339887, "grad_norm": 0.8770727515220642, "learning_rate": 4.899108265948339e-05, "loss": 0.7449, "num_input_tokens_seen": 14155312, "step": 24395 }, { "epoch": 3.634197199880846, "grad_norm": 1.2070237398147583, "learning_rate": 4.899016865432318e-05, "loss": 0.6675, "num_input_tokens_seen": 14158288, "step": 24400 }, { "epoch": 3.634941912421805, "grad_norm": 0.8139351010322571, "learning_rate": 4.898925424387345e-05, "loss": 0.5442, "num_input_tokens_seen": 14161296, "step": 24405 }, { "epoch": 3.6356866249627644, "grad_norm": 0.9710365533828735, "learning_rate": 4.8988339428149656e-05, "loss": 0.6002, "num_input_tokens_seen": 14163952, "step": 24410 }, { "epoch": 3.6364313375037236, "grad_norm": 0.9147529006004333, "learning_rate": 4.898742420716724e-05, "loss": 0.752, "num_input_tokens_seen": 14166768, "step": 24415 }, { "epoch": 3.637176050044683, "grad_norm": 0.9824113249778748, "learning_rate": 4.898650858094168e-05, "loss": 0.7076, "num_input_tokens_seen": 14169648, "step": 24420 }, { "epoch": 3.637920762585642, "grad_norm": 0.7766646146774292, "learning_rate": 4.898559254948843e-05, "loss": 0.7392, "num_input_tokens_seen": 14172656, "step": 24425 }, { "epoch": 3.638665475126601, "grad_norm": 0.9403411149978638, "learning_rate": 4.898467611282297e-05, "loss": 0.6373, "num_input_tokens_seen": 14175632, "step": 24430 }, { "epoch": 3.6394101876675604, "grad_norm": 1.0431715250015259, "learning_rate": 4.8983759270960796e-05, "loss": 0.7038, "num_input_tokens_seen": 14178416, "step": 24435 }, { "epoch": 3.6401549002085196, "grad_norm": 0.906886875629425, "learning_rate": 4.8982842023917374e-05, "loss": 0.8429, "num_input_tokens_seen": 14181424, "step": 24440 }, { "epoch": 3.640899612749479, "grad_norm": 1.0893563032150269, "learning_rate": 4.898192437170822e-05, "loss": 0.5018, "num_input_tokens_seen": 14184272, "step": 24445 }, { "epoch": 3.641644325290438, "grad_norm": 1.8420217037200928, "learning_rate": 4.898100631434882e-05, "loss": 0.6452, "num_input_tokens_seen": 14187216, "step": 24450 }, { "epoch": 3.6423890378313972, "grad_norm": 0.6488367319107056, "learning_rate": 4.898008785185469e-05, "loss": 0.7196, "num_input_tokens_seen": 14190256, "step": 24455 }, { "epoch": 3.6431337503723564, "grad_norm": 0.9387704133987427, "learning_rate": 4.8979168984241354e-05, "loss": 0.7021, "num_input_tokens_seen": 14193360, "step": 24460 }, { "epoch": 3.6438784629133156, "grad_norm": 1.2729402780532837, "learning_rate": 4.8978249711524324e-05, "loss": 0.5774, "num_input_tokens_seen": 14196240, "step": 24465 }, { "epoch": 3.6446231754542744, "grad_norm": 0.611290693283081, "learning_rate": 4.8977330033719147e-05, "loss": 0.6052, "num_input_tokens_seen": 14199152, "step": 24470 }, { "epoch": 3.645367887995234, "grad_norm": 0.9702829122543335, "learning_rate": 4.897640995084133e-05, "loss": 0.7081, "num_input_tokens_seen": 14202032, "step": 24475 }, { "epoch": 3.646112600536193, "grad_norm": 0.9719698429107666, "learning_rate": 4.8975489462906456e-05, "loss": 0.7376, "num_input_tokens_seen": 14204816, "step": 24480 }, { "epoch": 3.6468573130771524, "grad_norm": 0.6216227412223816, "learning_rate": 4.897456856993004e-05, "loss": 0.6369, "num_input_tokens_seen": 14207792, "step": 24485 }, { "epoch": 3.647602025618111, "grad_norm": 0.5603209137916565, "learning_rate": 4.897364727192766e-05, "loss": 0.698, "num_input_tokens_seen": 14210992, "step": 24490 }, { "epoch": 3.648346738159071, "grad_norm": 1.0517123937606812, "learning_rate": 4.897272556891487e-05, "loss": 0.6302, "num_input_tokens_seen": 14213552, "step": 24495 }, { "epoch": 3.6490914507000296, "grad_norm": 0.7706148624420166, "learning_rate": 4.897180346090726e-05, "loss": 0.6874, "num_input_tokens_seen": 14216368, "step": 24500 }, { "epoch": 3.6498361632409893, "grad_norm": 0.7884291410446167, "learning_rate": 4.8970880947920386e-05, "loss": 0.6715, "num_input_tokens_seen": 14219216, "step": 24505 }, { "epoch": 3.650580875781948, "grad_norm": 1.2991814613342285, "learning_rate": 4.8969958029969834e-05, "loss": 0.6144, "num_input_tokens_seen": 14222416, "step": 24510 }, { "epoch": 3.651325588322907, "grad_norm": 0.9406715035438538, "learning_rate": 4.896903470707121e-05, "loss": 0.5923, "num_input_tokens_seen": 14225104, "step": 24515 }, { "epoch": 3.6520703008638664, "grad_norm": 0.8699218034744263, "learning_rate": 4.89681109792401e-05, "loss": 0.7654, "num_input_tokens_seen": 14228144, "step": 24520 }, { "epoch": 3.6528150134048256, "grad_norm": 0.795149564743042, "learning_rate": 4.896718684649213e-05, "loss": 0.7596, "num_input_tokens_seen": 14231056, "step": 24525 }, { "epoch": 3.653559725945785, "grad_norm": 1.3139818906784058, "learning_rate": 4.8966262308842885e-05, "loss": 0.7224, "num_input_tokens_seen": 14234000, "step": 24530 }, { "epoch": 3.654304438486744, "grad_norm": 0.8851135969161987, "learning_rate": 4.8965337366308e-05, "loss": 0.704, "num_input_tokens_seen": 14236912, "step": 24535 }, { "epoch": 3.6550491510277032, "grad_norm": 0.9990304708480835, "learning_rate": 4.896441201890309e-05, "loss": 0.6664, "num_input_tokens_seen": 14239696, "step": 24540 }, { "epoch": 3.6557938635686624, "grad_norm": 1.184546709060669, "learning_rate": 4.896348626664381e-05, "loss": 0.8434, "num_input_tokens_seen": 14242576, "step": 24545 }, { "epoch": 3.6565385761096216, "grad_norm": 0.6817115545272827, "learning_rate": 4.896256010954578e-05, "loss": 0.6021, "num_input_tokens_seen": 14245648, "step": 24550 }, { "epoch": 3.657283288650581, "grad_norm": 0.7537308931350708, "learning_rate": 4.896163354762464e-05, "loss": 0.6383, "num_input_tokens_seen": 14248464, "step": 24555 }, { "epoch": 3.65802800119154, "grad_norm": 0.8630800843238831, "learning_rate": 4.8960706580896066e-05, "loss": 0.7621, "num_input_tokens_seen": 14251568, "step": 24560 }, { "epoch": 3.6587727137324992, "grad_norm": 0.9613097310066223, "learning_rate": 4.8959779209375703e-05, "loss": 0.7033, "num_input_tokens_seen": 14254544, "step": 24565 }, { "epoch": 3.6595174262734584, "grad_norm": 1.2549314498901367, "learning_rate": 4.895885143307922e-05, "loss": 0.5921, "num_input_tokens_seen": 14257296, "step": 24570 }, { "epoch": 3.6602621388144176, "grad_norm": 1.0013065338134766, "learning_rate": 4.8957923252022304e-05, "loss": 0.6361, "num_input_tokens_seen": 14260368, "step": 24575 }, { "epoch": 3.661006851355377, "grad_norm": 0.6735069751739502, "learning_rate": 4.8956994666220615e-05, "loss": 0.667, "num_input_tokens_seen": 14263216, "step": 24580 }, { "epoch": 3.661751563896336, "grad_norm": 1.0215997695922852, "learning_rate": 4.895606567568985e-05, "loss": 0.7536, "num_input_tokens_seen": 14266192, "step": 24585 }, { "epoch": 3.6624962764372953, "grad_norm": 1.0906718969345093, "learning_rate": 4.8955136280445704e-05, "loss": 0.7592, "num_input_tokens_seen": 14268976, "step": 24590 }, { "epoch": 3.6632409889782545, "grad_norm": 1.2516080141067505, "learning_rate": 4.895420648050388e-05, "loss": 0.6154, "num_input_tokens_seen": 14271664, "step": 24595 }, { "epoch": 3.6639857015192137, "grad_norm": 1.2399680614471436, "learning_rate": 4.895327627588008e-05, "loss": 0.7148, "num_input_tokens_seen": 14274416, "step": 24600 }, { "epoch": 3.664730414060173, "grad_norm": 0.8508558869361877, "learning_rate": 4.8952345666590025e-05, "loss": 0.6871, "num_input_tokens_seen": 14277456, "step": 24605 }, { "epoch": 3.665475126601132, "grad_norm": 0.9090257883071899, "learning_rate": 4.895141465264943e-05, "loss": 0.4754, "num_input_tokens_seen": 14280624, "step": 24610 }, { "epoch": 3.6662198391420913, "grad_norm": 0.6178355813026428, "learning_rate": 4.895048323407403e-05, "loss": 0.5435, "num_input_tokens_seen": 14283536, "step": 24615 }, { "epoch": 3.6669645516830505, "grad_norm": 0.6851806640625, "learning_rate": 4.894955141087956e-05, "loss": 0.6509, "num_input_tokens_seen": 14286640, "step": 24620 }, { "epoch": 3.6677092642240097, "grad_norm": 0.6911941170692444, "learning_rate": 4.894861918308176e-05, "loss": 0.577, "num_input_tokens_seen": 14289584, "step": 24625 }, { "epoch": 3.668453976764969, "grad_norm": 0.9416460990905762, "learning_rate": 4.894768655069638e-05, "loss": 0.6935, "num_input_tokens_seen": 14292528, "step": 24630 }, { "epoch": 3.669198689305928, "grad_norm": 1.3422495126724243, "learning_rate": 4.8946753513739166e-05, "loss": 0.6359, "num_input_tokens_seen": 14295440, "step": 24635 }, { "epoch": 3.6699434018468873, "grad_norm": 1.8943743705749512, "learning_rate": 4.89458200722259e-05, "loss": 0.8222, "num_input_tokens_seen": 14298672, "step": 24640 }, { "epoch": 3.670688114387846, "grad_norm": 1.1180813312530518, "learning_rate": 4.894488622617234e-05, "loss": 0.6812, "num_input_tokens_seen": 14301296, "step": 24645 }, { "epoch": 3.6714328269288057, "grad_norm": 1.57412588596344, "learning_rate": 4.894395197559426e-05, "loss": 0.711, "num_input_tokens_seen": 14304208, "step": 24650 }, { "epoch": 3.6721775394697644, "grad_norm": 0.8392574787139893, "learning_rate": 4.8943017320507444e-05, "loss": 0.5712, "num_input_tokens_seen": 14307504, "step": 24655 }, { "epoch": 3.672922252010724, "grad_norm": 0.8073023557662964, "learning_rate": 4.894208226092769e-05, "loss": 0.6416, "num_input_tokens_seen": 14310736, "step": 24660 }, { "epoch": 3.673666964551683, "grad_norm": 0.8128201961517334, "learning_rate": 4.894114679687079e-05, "loss": 0.6219, "num_input_tokens_seen": 14313776, "step": 24665 }, { "epoch": 3.6744116770926425, "grad_norm": 1.513104796409607, "learning_rate": 4.8940210928352545e-05, "loss": 0.6249, "num_input_tokens_seen": 14316784, "step": 24670 }, { "epoch": 3.6751563896336013, "grad_norm": 1.1180232763290405, "learning_rate": 4.893927465538877e-05, "loss": 0.7102, "num_input_tokens_seen": 14319440, "step": 24675 }, { "epoch": 3.675901102174561, "grad_norm": 1.4777765274047852, "learning_rate": 4.8938337977995286e-05, "loss": 0.7197, "num_input_tokens_seen": 14322384, "step": 24680 }, { "epoch": 3.6766458147155197, "grad_norm": 0.9005042910575867, "learning_rate": 4.89374008961879e-05, "loss": 0.5719, "num_input_tokens_seen": 14325200, "step": 24685 }, { "epoch": 3.677390527256479, "grad_norm": 0.7407094836235046, "learning_rate": 4.8936463409982466e-05, "loss": 0.7411, "num_input_tokens_seen": 14328144, "step": 24690 }, { "epoch": 3.678135239797438, "grad_norm": 1.0773181915283203, "learning_rate": 4.89355255193948e-05, "loss": 0.7206, "num_input_tokens_seen": 14330864, "step": 24695 }, { "epoch": 3.6788799523383973, "grad_norm": 2.3716163635253906, "learning_rate": 4.893458722444076e-05, "loss": 0.6049, "num_input_tokens_seen": 14333808, "step": 24700 }, { "epoch": 3.6796246648793565, "grad_norm": 0.8805814385414124, "learning_rate": 4.89336485251362e-05, "loss": 0.6112, "num_input_tokens_seen": 14336816, "step": 24705 }, { "epoch": 3.6803693774203157, "grad_norm": 0.8580710887908936, "learning_rate": 4.893270942149697e-05, "loss": 0.7173, "num_input_tokens_seen": 14340016, "step": 24710 }, { "epoch": 3.681114089961275, "grad_norm": 0.8060891032218933, "learning_rate": 4.8931769913538945e-05, "loss": 0.5035, "num_input_tokens_seen": 14343664, "step": 24715 }, { "epoch": 3.681858802502234, "grad_norm": 0.9199968576431274, "learning_rate": 4.893083000127798e-05, "loss": 0.7412, "num_input_tokens_seen": 14346480, "step": 24720 }, { "epoch": 3.6826035150431933, "grad_norm": 1.049452781677246, "learning_rate": 4.8929889684729966e-05, "loss": 0.6481, "num_input_tokens_seen": 14349136, "step": 24725 }, { "epoch": 3.6833482275841525, "grad_norm": 0.9737989902496338, "learning_rate": 4.892894896391079e-05, "loss": 0.6459, "num_input_tokens_seen": 14351760, "step": 24730 }, { "epoch": 3.6840929401251117, "grad_norm": 1.4393017292022705, "learning_rate": 4.892800783883635e-05, "loss": 0.6161, "num_input_tokens_seen": 14354640, "step": 24735 }, { "epoch": 3.684837652666071, "grad_norm": 0.7293903231620789, "learning_rate": 4.892706630952253e-05, "loss": 0.6333, "num_input_tokens_seen": 14357680, "step": 24740 }, { "epoch": 3.68558236520703, "grad_norm": 1.0424762964248657, "learning_rate": 4.892612437598524e-05, "loss": 0.7088, "num_input_tokens_seen": 14360464, "step": 24745 }, { "epoch": 3.6863270777479893, "grad_norm": 1.530538558959961, "learning_rate": 4.8925182038240395e-05, "loss": 0.8015, "num_input_tokens_seen": 14363408, "step": 24750 }, { "epoch": 3.6870717902889485, "grad_norm": 0.7360979318618774, "learning_rate": 4.892423929630392e-05, "loss": 0.7053, "num_input_tokens_seen": 14366512, "step": 24755 }, { "epoch": 3.6878165028299077, "grad_norm": 0.7427245378494263, "learning_rate": 4.892329615019173e-05, "loss": 0.674, "num_input_tokens_seen": 14369424, "step": 24760 }, { "epoch": 3.688561215370867, "grad_norm": 0.9378321170806885, "learning_rate": 4.892235259991977e-05, "loss": 0.6545, "num_input_tokens_seen": 14372400, "step": 24765 }, { "epoch": 3.689305927911826, "grad_norm": 1.277305245399475, "learning_rate": 4.8921408645503986e-05, "loss": 0.6002, "num_input_tokens_seen": 14375280, "step": 24770 }, { "epoch": 3.6900506404527853, "grad_norm": 0.8980659246444702, "learning_rate": 4.892046428696031e-05, "loss": 0.7658, "num_input_tokens_seen": 14378288, "step": 24775 }, { "epoch": 3.6907953529937445, "grad_norm": 0.9715303182601929, "learning_rate": 4.8919519524304704e-05, "loss": 0.6142, "num_input_tokens_seen": 14381200, "step": 24780 }, { "epoch": 3.6915400655347037, "grad_norm": 0.8890687823295593, "learning_rate": 4.891857435755312e-05, "loss": 0.7372, "num_input_tokens_seen": 14384560, "step": 24785 }, { "epoch": 3.692284778075663, "grad_norm": 0.9887111186981201, "learning_rate": 4.891762878672153e-05, "loss": 0.587, "num_input_tokens_seen": 14387408, "step": 24790 }, { "epoch": 3.693029490616622, "grad_norm": 0.4952554702758789, "learning_rate": 4.891668281182592e-05, "loss": 0.7671, "num_input_tokens_seen": 14390512, "step": 24795 }, { "epoch": 3.6937742031575813, "grad_norm": 0.7685219049453735, "learning_rate": 4.8915736432882254e-05, "loss": 0.9066, "num_input_tokens_seen": 14393296, "step": 24800 }, { "epoch": 3.6945189156985405, "grad_norm": 1.1310131549835205, "learning_rate": 4.891478964990653e-05, "loss": 0.628, "num_input_tokens_seen": 14395984, "step": 24805 }, { "epoch": 3.6952636282394993, "grad_norm": 0.9939931631088257, "learning_rate": 4.891384246291474e-05, "loss": 0.7138, "num_input_tokens_seen": 14399152, "step": 24810 }, { "epoch": 3.696008340780459, "grad_norm": 0.7888880372047424, "learning_rate": 4.891289487192289e-05, "loss": 0.6604, "num_input_tokens_seen": 14402064, "step": 24815 }, { "epoch": 3.6967530533214177, "grad_norm": 0.6890326142311096, "learning_rate": 4.891194687694698e-05, "loss": 0.5801, "num_input_tokens_seen": 14404688, "step": 24820 }, { "epoch": 3.6974977658623773, "grad_norm": 1.0522589683532715, "learning_rate": 4.8910998478003034e-05, "loss": 0.6266, "num_input_tokens_seen": 14407600, "step": 24825 }, { "epoch": 3.698242478403336, "grad_norm": 2.161825180053711, "learning_rate": 4.891004967510707e-05, "loss": 0.7473, "num_input_tokens_seen": 14410416, "step": 24830 }, { "epoch": 3.6989871909442957, "grad_norm": 0.843725860118866, "learning_rate": 4.890910046827511e-05, "loss": 0.6666, "num_input_tokens_seen": 14413296, "step": 24835 }, { "epoch": 3.6997319034852545, "grad_norm": 1.6484770774841309, "learning_rate": 4.890815085752322e-05, "loss": 0.635, "num_input_tokens_seen": 14415792, "step": 24840 }, { "epoch": 3.700476616026214, "grad_norm": 0.7156805992126465, "learning_rate": 4.890720084286739e-05, "loss": 0.5476, "num_input_tokens_seen": 14418896, "step": 24845 }, { "epoch": 3.701221328567173, "grad_norm": 0.7262691855430603, "learning_rate": 4.890625042432372e-05, "loss": 0.5967, "num_input_tokens_seen": 14421584, "step": 24850 }, { "epoch": 3.701966041108132, "grad_norm": 1.5465763807296753, "learning_rate": 4.890529960190825e-05, "loss": 0.698, "num_input_tokens_seen": 14424528, "step": 24855 }, { "epoch": 3.7027107536490913, "grad_norm": 1.2469186782836914, "learning_rate": 4.8904348375637025e-05, "loss": 0.7423, "num_input_tokens_seen": 14427888, "step": 24860 }, { "epoch": 3.7034554661900505, "grad_norm": 1.166243076324463, "learning_rate": 4.8903396745526144e-05, "loss": 0.6949, "num_input_tokens_seen": 14430768, "step": 24865 }, { "epoch": 3.7042001787310097, "grad_norm": 1.2938261032104492, "learning_rate": 4.8902444711591656e-05, "loss": 0.6453, "num_input_tokens_seen": 14433872, "step": 24870 }, { "epoch": 3.704944891271969, "grad_norm": 0.7912776470184326, "learning_rate": 4.8901492273849666e-05, "loss": 0.687, "num_input_tokens_seen": 14436624, "step": 24875 }, { "epoch": 3.705689603812928, "grad_norm": 1.1000041961669922, "learning_rate": 4.890053943231625e-05, "loss": 0.6147, "num_input_tokens_seen": 14439952, "step": 24880 }, { "epoch": 3.7064343163538873, "grad_norm": 1.3301968574523926, "learning_rate": 4.889958618700752e-05, "loss": 0.7804, "num_input_tokens_seen": 14443056, "step": 24885 }, { "epoch": 3.7071790288948465, "grad_norm": 0.632992684841156, "learning_rate": 4.8898632537939567e-05, "loss": 0.6708, "num_input_tokens_seen": 14445808, "step": 24890 }, { "epoch": 3.7079237414358057, "grad_norm": 1.1464635133743286, "learning_rate": 4.889767848512851e-05, "loss": 0.6464, "num_input_tokens_seen": 14448624, "step": 24895 }, { "epoch": 3.708668453976765, "grad_norm": 1.0255006551742554, "learning_rate": 4.889672402859046e-05, "loss": 0.6817, "num_input_tokens_seen": 14451472, "step": 24900 }, { "epoch": 3.709413166517724, "grad_norm": 0.8732730150222778, "learning_rate": 4.8895769168341546e-05, "loss": 0.6953, "num_input_tokens_seen": 14454320, "step": 24905 }, { "epoch": 3.7101578790586833, "grad_norm": 1.3209837675094604, "learning_rate": 4.8894813904397895e-05, "loss": 0.6799, "num_input_tokens_seen": 14457424, "step": 24910 }, { "epoch": 3.7109025915996425, "grad_norm": 0.8685429096221924, "learning_rate": 4.889385823677565e-05, "loss": 0.595, "num_input_tokens_seen": 14460336, "step": 24915 }, { "epoch": 3.7116473041406017, "grad_norm": 1.0024490356445312, "learning_rate": 4.889290216549096e-05, "loss": 0.5959, "num_input_tokens_seen": 14463280, "step": 24920 }, { "epoch": 3.712392016681561, "grad_norm": 1.4600019454956055, "learning_rate": 4.889194569055996e-05, "loss": 0.6485, "num_input_tokens_seen": 14466160, "step": 24925 }, { "epoch": 3.71313672922252, "grad_norm": 1.2315114736557007, "learning_rate": 4.8890988811998835e-05, "loss": 0.6863, "num_input_tokens_seen": 14468880, "step": 24930 }, { "epoch": 3.7138814417634793, "grad_norm": 0.7723066806793213, "learning_rate": 4.889003152982373e-05, "loss": 0.8222, "num_input_tokens_seen": 14471632, "step": 24935 }, { "epoch": 3.7146261543044385, "grad_norm": 1.6224658489227295, "learning_rate": 4.888907384405082e-05, "loss": 0.6979, "num_input_tokens_seen": 14474608, "step": 24940 }, { "epoch": 3.7153708668453977, "grad_norm": 1.2245643138885498, "learning_rate": 4.888811575469629e-05, "loss": 0.8699, "num_input_tokens_seen": 14477680, "step": 24945 }, { "epoch": 3.716115579386357, "grad_norm": 1.2911463975906372, "learning_rate": 4.8887157261776316e-05, "loss": 0.5422, "num_input_tokens_seen": 14480496, "step": 24950 }, { "epoch": 3.716860291927316, "grad_norm": 0.8319194912910461, "learning_rate": 4.888619836530711e-05, "loss": 0.6484, "num_input_tokens_seen": 14483472, "step": 24955 }, { "epoch": 3.7176050044682754, "grad_norm": 2.0465879440307617, "learning_rate": 4.8885239065304855e-05, "loss": 0.8199, "num_input_tokens_seen": 14486224, "step": 24960 }, { "epoch": 3.7183497170092346, "grad_norm": 1.1584410667419434, "learning_rate": 4.8884279361785754e-05, "loss": 0.7259, "num_input_tokens_seen": 14489456, "step": 24965 }, { "epoch": 3.7190944295501938, "grad_norm": 0.7716915607452393, "learning_rate": 4.888331925476604e-05, "loss": 0.7974, "num_input_tokens_seen": 14492784, "step": 24970 }, { "epoch": 3.719839142091153, "grad_norm": 0.8974360227584839, "learning_rate": 4.8882358744261914e-05, "loss": 0.6387, "num_input_tokens_seen": 14495664, "step": 24975 }, { "epoch": 3.720583854632112, "grad_norm": 0.8965444564819336, "learning_rate": 4.888139783028961e-05, "loss": 0.5926, "num_input_tokens_seen": 14498640, "step": 24980 }, { "epoch": 3.721328567173071, "grad_norm": 0.7630810737609863, "learning_rate": 4.888043651286537e-05, "loss": 0.7371, "num_input_tokens_seen": 14501488, "step": 24985 }, { "epoch": 3.7220732797140306, "grad_norm": 1.3138418197631836, "learning_rate": 4.887947479200542e-05, "loss": 0.6603, "num_input_tokens_seen": 14504528, "step": 24990 }, { "epoch": 3.7228179922549893, "grad_norm": 1.064481496810913, "learning_rate": 4.887851266772601e-05, "loss": 0.6633, "num_input_tokens_seen": 14507472, "step": 24995 }, { "epoch": 3.723562704795949, "grad_norm": 0.8751208782196045, "learning_rate": 4.8877550140043404e-05, "loss": 0.6565, "num_input_tokens_seen": 14510256, "step": 25000 }, { "epoch": 3.7243074173369077, "grad_norm": 0.8235897421836853, "learning_rate": 4.887658720897385e-05, "loss": 0.6349, "num_input_tokens_seen": 14513200, "step": 25005 }, { "epoch": 3.7250521298778674, "grad_norm": 1.7069035768508911, "learning_rate": 4.8875623874533627e-05, "loss": 0.6489, "num_input_tokens_seen": 14515792, "step": 25010 }, { "epoch": 3.725796842418826, "grad_norm": 1.1497386693954468, "learning_rate": 4.8874660136739e-05, "loss": 0.7374, "num_input_tokens_seen": 14518640, "step": 25015 }, { "epoch": 3.726541554959786, "grad_norm": 0.5555846095085144, "learning_rate": 4.887369599560626e-05, "loss": 0.4947, "num_input_tokens_seen": 14521904, "step": 25020 }, { "epoch": 3.7272862675007445, "grad_norm": 0.7664979100227356, "learning_rate": 4.8872731451151684e-05, "loss": 0.7659, "num_input_tokens_seen": 14524848, "step": 25025 }, { "epoch": 3.7280309800417037, "grad_norm": 1.54281747341156, "learning_rate": 4.887176650339158e-05, "loss": 0.6159, "num_input_tokens_seen": 14527600, "step": 25030 }, { "epoch": 3.728775692582663, "grad_norm": 0.8686009645462036, "learning_rate": 4.887080115234224e-05, "loss": 0.5902, "num_input_tokens_seen": 14530352, "step": 25035 }, { "epoch": 3.729520405123622, "grad_norm": 0.6886975169181824, "learning_rate": 4.886983539801998e-05, "loss": 0.7345, "num_input_tokens_seen": 14533328, "step": 25040 }, { "epoch": 3.7302651176645814, "grad_norm": 0.8356492519378662, "learning_rate": 4.886886924044111e-05, "loss": 0.6026, "num_input_tokens_seen": 14536624, "step": 25045 }, { "epoch": 3.7310098302055406, "grad_norm": 1.0919681787490845, "learning_rate": 4.8867902679621946e-05, "loss": 0.4805, "num_input_tokens_seen": 14539280, "step": 25050 }, { "epoch": 3.7317545427464998, "grad_norm": 1.560441255569458, "learning_rate": 4.8866935715578835e-05, "loss": 0.7675, "num_input_tokens_seen": 14542000, "step": 25055 }, { "epoch": 3.732499255287459, "grad_norm": 1.608132243156433, "learning_rate": 4.88659683483281e-05, "loss": 0.5923, "num_input_tokens_seen": 14545072, "step": 25060 }, { "epoch": 3.733243967828418, "grad_norm": 1.7034094333648682, "learning_rate": 4.886500057788608e-05, "loss": 0.7875, "num_input_tokens_seen": 14547920, "step": 25065 }, { "epoch": 3.7339886803693774, "grad_norm": 1.0527369976043701, "learning_rate": 4.8864032404269126e-05, "loss": 0.5383, "num_input_tokens_seen": 14550800, "step": 25070 }, { "epoch": 3.7347333929103366, "grad_norm": 1.3221440315246582, "learning_rate": 4.886306382749361e-05, "loss": 0.6738, "num_input_tokens_seen": 14553680, "step": 25075 }, { "epoch": 3.7354781054512958, "grad_norm": 0.9688680768013, "learning_rate": 4.886209484757588e-05, "loss": 0.703, "num_input_tokens_seen": 14556624, "step": 25080 }, { "epoch": 3.736222817992255, "grad_norm": 1.3696608543395996, "learning_rate": 4.886112546453231e-05, "loss": 0.8281, "num_input_tokens_seen": 14559568, "step": 25085 }, { "epoch": 3.736967530533214, "grad_norm": 0.6302360892295837, "learning_rate": 4.886015567837927e-05, "loss": 0.7706, "num_input_tokens_seen": 14562608, "step": 25090 }, { "epoch": 3.7377122430741734, "grad_norm": 1.0214979648590088, "learning_rate": 4.885918548913316e-05, "loss": 0.7913, "num_input_tokens_seen": 14565456, "step": 25095 }, { "epoch": 3.7384569556151326, "grad_norm": 1.7194195985794067, "learning_rate": 4.885821489681036e-05, "loss": 0.6587, "num_input_tokens_seen": 14568304, "step": 25100 }, { "epoch": 3.739201668156092, "grad_norm": 1.1218372583389282, "learning_rate": 4.885724390142726e-05, "loss": 0.777, "num_input_tokens_seen": 14571248, "step": 25105 }, { "epoch": 3.739946380697051, "grad_norm": 1.2296844720840454, "learning_rate": 4.885627250300028e-05, "loss": 0.7623, "num_input_tokens_seen": 14574096, "step": 25110 }, { "epoch": 3.74069109323801, "grad_norm": 0.7827171087265015, "learning_rate": 4.885530070154582e-05, "loss": 0.6596, "num_input_tokens_seen": 14576688, "step": 25115 }, { "epoch": 3.7414358057789694, "grad_norm": 1.0645935535430908, "learning_rate": 4.88543284970803e-05, "loss": 0.6957, "num_input_tokens_seen": 14579504, "step": 25120 }, { "epoch": 3.7421805183199286, "grad_norm": 1.5041327476501465, "learning_rate": 4.8853355889620143e-05, "loss": 0.6085, "num_input_tokens_seen": 14582320, "step": 25125 }, { "epoch": 3.742925230860888, "grad_norm": 2.0293571949005127, "learning_rate": 4.885238287918178e-05, "loss": 0.5733, "num_input_tokens_seen": 14585072, "step": 25130 }, { "epoch": 3.743669943401847, "grad_norm": 1.1437443494796753, "learning_rate": 4.885140946578166e-05, "loss": 0.6503, "num_input_tokens_seen": 14588016, "step": 25135 }, { "epoch": 3.744414655942806, "grad_norm": 1.5888217687606812, "learning_rate": 4.885043564943621e-05, "loss": 0.5964, "num_input_tokens_seen": 14591088, "step": 25140 }, { "epoch": 3.7451593684837654, "grad_norm": 1.0413397550582886, "learning_rate": 4.884946143016189e-05, "loss": 0.5451, "num_input_tokens_seen": 14593712, "step": 25145 }, { "epoch": 3.7459040810247246, "grad_norm": 0.7445510625839233, "learning_rate": 4.884848680797516e-05, "loss": 0.6135, "num_input_tokens_seen": 14596528, "step": 25150 }, { "epoch": 3.746648793565684, "grad_norm": 1.4379633665084839, "learning_rate": 4.884751178289249e-05, "loss": 0.6996, "num_input_tokens_seen": 14599280, "step": 25155 }, { "epoch": 3.7473935061066426, "grad_norm": 0.7238779664039612, "learning_rate": 4.884653635493034e-05, "loss": 0.6847, "num_input_tokens_seen": 14602352, "step": 25160 }, { "epoch": 3.748138218647602, "grad_norm": 1.0301508903503418, "learning_rate": 4.8845560524105196e-05, "loss": 0.5955, "num_input_tokens_seen": 14605264, "step": 25165 }, { "epoch": 3.748882931188561, "grad_norm": 1.1488871574401855, "learning_rate": 4.8844584290433536e-05, "loss": 0.5689, "num_input_tokens_seen": 14608208, "step": 25170 }, { "epoch": 3.7496276437295206, "grad_norm": 1.3621567487716675, "learning_rate": 4.8843607653931865e-05, "loss": 0.593, "num_input_tokens_seen": 14611376, "step": 25175 }, { "epoch": 3.7503723562704794, "grad_norm": 0.6668291091918945, "learning_rate": 4.884263061461668e-05, "loss": 0.7635, "num_input_tokens_seen": 14614512, "step": 25180 }, { "epoch": 3.751117068811439, "grad_norm": 1.2362061738967896, "learning_rate": 4.884165317250448e-05, "loss": 0.6359, "num_input_tokens_seen": 14617424, "step": 25185 }, { "epoch": 3.751861781352398, "grad_norm": 1.1034914255142212, "learning_rate": 4.8840675327611785e-05, "loss": 0.7256, "num_input_tokens_seen": 14620400, "step": 25190 }, { "epoch": 3.7526064938933574, "grad_norm": 0.9136698246002197, "learning_rate": 4.8839697079955104e-05, "loss": 0.6309, "num_input_tokens_seen": 14623728, "step": 25195 }, { "epoch": 3.753351206434316, "grad_norm": 0.8318911194801331, "learning_rate": 4.883871842955097e-05, "loss": 0.6433, "num_input_tokens_seen": 14626864, "step": 25200 }, { "epoch": 3.7540959189752754, "grad_norm": 1.197152018547058, "learning_rate": 4.883773937641593e-05, "loss": 0.7267, "num_input_tokens_seen": 14629712, "step": 25205 }, { "epoch": 3.7548406315162346, "grad_norm": 1.8412528038024902, "learning_rate": 4.8836759920566494e-05, "loss": 0.8174, "num_input_tokens_seen": 14632432, "step": 25210 }, { "epoch": 3.755585344057194, "grad_norm": 1.0976884365081787, "learning_rate": 4.8835780062019234e-05, "loss": 0.6855, "num_input_tokens_seen": 14635216, "step": 25215 }, { "epoch": 3.756330056598153, "grad_norm": 1.3828773498535156, "learning_rate": 4.8834799800790694e-05, "loss": 0.7181, "num_input_tokens_seen": 14638160, "step": 25220 }, { "epoch": 3.757074769139112, "grad_norm": 1.4004625082015991, "learning_rate": 4.8833819136897436e-05, "loss": 0.6612, "num_input_tokens_seen": 14641360, "step": 25225 }, { "epoch": 3.7578194816800714, "grad_norm": 1.2142601013183594, "learning_rate": 4.883283807035602e-05, "loss": 0.7145, "num_input_tokens_seen": 14644432, "step": 25230 }, { "epoch": 3.7585641942210306, "grad_norm": 0.554198145866394, "learning_rate": 4.883185660118304e-05, "loss": 0.5709, "num_input_tokens_seen": 14647856, "step": 25235 }, { "epoch": 3.75930890676199, "grad_norm": 0.7020019292831421, "learning_rate": 4.883087472939506e-05, "loss": 0.761, "num_input_tokens_seen": 14650704, "step": 25240 }, { "epoch": 3.760053619302949, "grad_norm": 0.7047448754310608, "learning_rate": 4.882989245500867e-05, "loss": 0.6361, "num_input_tokens_seen": 14653552, "step": 25245 }, { "epoch": 3.760798331843908, "grad_norm": 2.0098981857299805, "learning_rate": 4.882890977804047e-05, "loss": 0.7999, "num_input_tokens_seen": 14656688, "step": 25250 }, { "epoch": 3.7615430443848674, "grad_norm": 0.844720721244812, "learning_rate": 4.882792669850705e-05, "loss": 0.8548, "num_input_tokens_seen": 14659664, "step": 25255 }, { "epoch": 3.7622877569258266, "grad_norm": 0.8345736265182495, "learning_rate": 4.882694321642504e-05, "loss": 0.6783, "num_input_tokens_seen": 14662608, "step": 25260 }, { "epoch": 3.763032469466786, "grad_norm": 1.7337360382080078, "learning_rate": 4.8825959331811026e-05, "loss": 0.6697, "num_input_tokens_seen": 14665584, "step": 25265 }, { "epoch": 3.763777182007745, "grad_norm": 0.8645907640457153, "learning_rate": 4.882497504468165e-05, "loss": 0.6001, "num_input_tokens_seen": 14668560, "step": 25270 }, { "epoch": 3.7645218945487042, "grad_norm": 0.8188133239746094, "learning_rate": 4.8823990355053536e-05, "loss": 0.6154, "num_input_tokens_seen": 14671184, "step": 25275 }, { "epoch": 3.7652666070896634, "grad_norm": 1.0401660203933716, "learning_rate": 4.8823005262943323e-05, "loss": 0.6601, "num_input_tokens_seen": 14674064, "step": 25280 }, { "epoch": 3.7660113196306226, "grad_norm": 0.9297122955322266, "learning_rate": 4.882201976836764e-05, "loss": 0.6051, "num_input_tokens_seen": 14676848, "step": 25285 }, { "epoch": 3.766756032171582, "grad_norm": 1.0629922151565552, "learning_rate": 4.8821033871343155e-05, "loss": 0.7954, "num_input_tokens_seen": 14679664, "step": 25290 }, { "epoch": 3.767500744712541, "grad_norm": 0.9465351104736328, "learning_rate": 4.8820047571886504e-05, "loss": 0.6932, "num_input_tokens_seen": 14682704, "step": 25295 }, { "epoch": 3.7682454572535002, "grad_norm": 0.7057069540023804, "learning_rate": 4.8819060870014366e-05, "loss": 0.6254, "num_input_tokens_seen": 14685456, "step": 25300 }, { "epoch": 3.7689901697944594, "grad_norm": 0.8069382309913635, "learning_rate": 4.88180737657434e-05, "loss": 0.6389, "num_input_tokens_seen": 14688592, "step": 25305 }, { "epoch": 3.7697348823354186, "grad_norm": 0.6675489544868469, "learning_rate": 4.881708625909028e-05, "loss": 0.5919, "num_input_tokens_seen": 14691760, "step": 25310 }, { "epoch": 3.770479594876378, "grad_norm": 0.7835070490837097, "learning_rate": 4.881609835007171e-05, "loss": 0.66, "num_input_tokens_seen": 14694768, "step": 25315 }, { "epoch": 3.771224307417337, "grad_norm": 0.7878667712211609, "learning_rate": 4.881511003870435e-05, "loss": 0.5604, "num_input_tokens_seen": 14698096, "step": 25320 }, { "epoch": 3.7719690199582963, "grad_norm": 0.6613149642944336, "learning_rate": 4.881412132500491e-05, "loss": 0.6926, "num_input_tokens_seen": 14700816, "step": 25325 }, { "epoch": 3.7727137324992555, "grad_norm": 0.7409448027610779, "learning_rate": 4.8813132208990095e-05, "loss": 0.5392, "num_input_tokens_seen": 14703664, "step": 25330 }, { "epoch": 3.773458445040214, "grad_norm": 0.8895001411437988, "learning_rate": 4.881214269067662e-05, "loss": 0.6295, "num_input_tokens_seen": 14706672, "step": 25335 }, { "epoch": 3.774203157581174, "grad_norm": 0.7245697379112244, "learning_rate": 4.881115277008119e-05, "loss": 0.6154, "num_input_tokens_seen": 14709552, "step": 25340 }, { "epoch": 3.7749478701221326, "grad_norm": 0.7523994445800781, "learning_rate": 4.881016244722054e-05, "loss": 0.6993, "num_input_tokens_seen": 14712720, "step": 25345 }, { "epoch": 3.7756925826630923, "grad_norm": 0.7471364736557007, "learning_rate": 4.880917172211139e-05, "loss": 0.5913, "num_input_tokens_seen": 14715280, "step": 25350 }, { "epoch": 3.776437295204051, "grad_norm": 1.0520211458206177, "learning_rate": 4.8808180594770486e-05, "loss": 0.5427, "num_input_tokens_seen": 14717968, "step": 25355 }, { "epoch": 3.7771820077450107, "grad_norm": 0.5729223489761353, "learning_rate": 4.880718906521456e-05, "loss": 0.61, "num_input_tokens_seen": 14720560, "step": 25360 }, { "epoch": 3.7779267202859694, "grad_norm": 0.9107989072799683, "learning_rate": 4.880619713346039e-05, "loss": 0.6096, "num_input_tokens_seen": 14723472, "step": 25365 }, { "epoch": 3.778671432826929, "grad_norm": 1.1047850847244263, "learning_rate": 4.8805204799524695e-05, "loss": 0.762, "num_input_tokens_seen": 14726256, "step": 25370 }, { "epoch": 3.779416145367888, "grad_norm": 0.5055503249168396, "learning_rate": 4.880421206342427e-05, "loss": 0.577, "num_input_tokens_seen": 14728944, "step": 25375 }, { "epoch": 3.780160857908847, "grad_norm": 2.2974324226379395, "learning_rate": 4.880321892517587e-05, "loss": 0.538, "num_input_tokens_seen": 14731664, "step": 25380 }, { "epoch": 3.7809055704498062, "grad_norm": 0.9062539935112, "learning_rate": 4.880222538479629e-05, "loss": 0.6818, "num_input_tokens_seen": 14734736, "step": 25385 }, { "epoch": 3.7816502829907654, "grad_norm": 0.8173104524612427, "learning_rate": 4.880123144230229e-05, "loss": 0.6786, "num_input_tokens_seen": 14737808, "step": 25390 }, { "epoch": 3.7823949955317246, "grad_norm": 0.91404789686203, "learning_rate": 4.880023709771068e-05, "loss": 0.718, "num_input_tokens_seen": 14740688, "step": 25395 }, { "epoch": 3.783139708072684, "grad_norm": 0.9192412495613098, "learning_rate": 4.8799242351038257e-05, "loss": 0.6745, "num_input_tokens_seen": 14743344, "step": 25400 }, { "epoch": 3.783884420613643, "grad_norm": 0.6307187676429749, "learning_rate": 4.8798247202301824e-05, "loss": 0.595, "num_input_tokens_seen": 14746096, "step": 25405 }, { "epoch": 3.7846291331546023, "grad_norm": 1.3105175495147705, "learning_rate": 4.879725165151818e-05, "loss": 0.6432, "num_input_tokens_seen": 14749328, "step": 25410 }, { "epoch": 3.7853738456955615, "grad_norm": 1.231214165687561, "learning_rate": 4.8796255698704165e-05, "loss": 0.6712, "num_input_tokens_seen": 14752144, "step": 25415 }, { "epoch": 3.7861185582365207, "grad_norm": 1.152450442314148, "learning_rate": 4.87952593438766e-05, "loss": 0.611, "num_input_tokens_seen": 14754992, "step": 25420 }, { "epoch": 3.78686327077748, "grad_norm": 0.7439554929733276, "learning_rate": 4.879426258705231e-05, "loss": 0.5215, "num_input_tokens_seen": 14757712, "step": 25425 }, { "epoch": 3.787607983318439, "grad_norm": 2.4561147689819336, "learning_rate": 4.879326542824813e-05, "loss": 0.6751, "num_input_tokens_seen": 14760720, "step": 25430 }, { "epoch": 3.7883526958593983, "grad_norm": 1.1554075479507446, "learning_rate": 4.8792267867480926e-05, "loss": 0.8012, "num_input_tokens_seen": 14763472, "step": 25435 }, { "epoch": 3.7890974084003575, "grad_norm": 0.7662646770477295, "learning_rate": 4.879126990476752e-05, "loss": 0.6324, "num_input_tokens_seen": 14766192, "step": 25440 }, { "epoch": 3.7898421209413167, "grad_norm": 0.8349674344062805, "learning_rate": 4.8790271540124796e-05, "loss": 0.554, "num_input_tokens_seen": 14769008, "step": 25445 }, { "epoch": 3.790586833482276, "grad_norm": 0.8174417018890381, "learning_rate": 4.8789272773569625e-05, "loss": 0.6314, "num_input_tokens_seen": 14771984, "step": 25450 }, { "epoch": 3.791331546023235, "grad_norm": 0.9455499053001404, "learning_rate": 4.8788273605118855e-05, "loss": 0.7221, "num_input_tokens_seen": 14775152, "step": 25455 }, { "epoch": 3.7920762585641943, "grad_norm": 0.9796226024627686, "learning_rate": 4.8787274034789386e-05, "loss": 0.6948, "num_input_tokens_seen": 14778000, "step": 25460 }, { "epoch": 3.7928209711051535, "grad_norm": 0.8893463611602783, "learning_rate": 4.87862740625981e-05, "loss": 0.664, "num_input_tokens_seen": 14780912, "step": 25465 }, { "epoch": 3.7935656836461127, "grad_norm": 1.1405442953109741, "learning_rate": 4.878527368856189e-05, "loss": 0.6425, "num_input_tokens_seen": 14783888, "step": 25470 }, { "epoch": 3.794310396187072, "grad_norm": 1.9499527215957642, "learning_rate": 4.878427291269765e-05, "loss": 0.7235, "num_input_tokens_seen": 14786896, "step": 25475 }, { "epoch": 3.795055108728031, "grad_norm": 0.7543547749519348, "learning_rate": 4.878327173502229e-05, "loss": 0.5677, "num_input_tokens_seen": 14789968, "step": 25480 }, { "epoch": 3.7957998212689903, "grad_norm": 0.8816763162612915, "learning_rate": 4.8782270155552735e-05, "loss": 0.654, "num_input_tokens_seen": 14792816, "step": 25485 }, { "epoch": 3.7965445338099495, "grad_norm": 1.2528401613235474, "learning_rate": 4.878126817430588e-05, "loss": 0.6032, "num_input_tokens_seen": 14795760, "step": 25490 }, { "epoch": 3.7972892463509087, "grad_norm": 0.8298819661140442, "learning_rate": 4.878026579129868e-05, "loss": 0.6586, "num_input_tokens_seen": 14798416, "step": 25495 }, { "epoch": 3.798033958891868, "grad_norm": 0.9447733163833618, "learning_rate": 4.877926300654807e-05, "loss": 0.6244, "num_input_tokens_seen": 14801264, "step": 25500 }, { "epoch": 3.798778671432827, "grad_norm": 1.3774720430374146, "learning_rate": 4.877825982007097e-05, "loss": 0.613, "num_input_tokens_seen": 14803888, "step": 25505 }, { "epoch": 3.799523383973786, "grad_norm": 0.7701545357704163, "learning_rate": 4.877725623188434e-05, "loss": 0.5739, "num_input_tokens_seen": 14806640, "step": 25510 }, { "epoch": 3.8002680965147455, "grad_norm": 0.8726545572280884, "learning_rate": 4.8776252242005124e-05, "loss": 0.7036, "num_input_tokens_seen": 14809552, "step": 25515 }, { "epoch": 3.8010128090557043, "grad_norm": 1.2735495567321777, "learning_rate": 4.87752478504503e-05, "loss": 0.7183, "num_input_tokens_seen": 14812592, "step": 25520 }, { "epoch": 3.801757521596664, "grad_norm": 1.1900365352630615, "learning_rate": 4.8774243057236824e-05, "loss": 0.6751, "num_input_tokens_seen": 14815632, "step": 25525 }, { "epoch": 3.8025022341376227, "grad_norm": 0.8883476853370667, "learning_rate": 4.877323786238167e-05, "loss": 0.6621, "num_input_tokens_seen": 14818544, "step": 25530 }, { "epoch": 3.8032469466785823, "grad_norm": 0.98847496509552, "learning_rate": 4.877223226590184e-05, "loss": 0.7061, "num_input_tokens_seen": 14821200, "step": 25535 }, { "epoch": 3.803991659219541, "grad_norm": 1.3543317317962646, "learning_rate": 4.877122626781429e-05, "loss": 0.7309, "num_input_tokens_seen": 14824080, "step": 25540 }, { "epoch": 3.8047363717605007, "grad_norm": 0.8382605314254761, "learning_rate": 4.8770219868136036e-05, "loss": 0.5429, "num_input_tokens_seen": 14826864, "step": 25545 }, { "epoch": 3.8054810843014595, "grad_norm": 1.0617870092391968, "learning_rate": 4.876921306688408e-05, "loss": 0.883, "num_input_tokens_seen": 14830096, "step": 25550 }, { "epoch": 3.8062257968424187, "grad_norm": 1.1281284093856812, "learning_rate": 4.8768205864075425e-05, "loss": 0.6245, "num_input_tokens_seen": 14833104, "step": 25555 }, { "epoch": 3.806970509383378, "grad_norm": 1.0556944608688354, "learning_rate": 4.876719825972709e-05, "loss": 0.5876, "num_input_tokens_seen": 14836432, "step": 25560 }, { "epoch": 3.807715221924337, "grad_norm": 1.0568407773971558, "learning_rate": 4.8766190253856106e-05, "loss": 0.6517, "num_input_tokens_seen": 14839184, "step": 25565 }, { "epoch": 3.8084599344652963, "grad_norm": 1.0262937545776367, "learning_rate": 4.876518184647948e-05, "loss": 0.7501, "num_input_tokens_seen": 14842064, "step": 25570 }, { "epoch": 3.8092046470062555, "grad_norm": 1.183476209640503, "learning_rate": 4.8764173037614256e-05, "loss": 0.6201, "num_input_tokens_seen": 14844784, "step": 25575 }, { "epoch": 3.8099493595472147, "grad_norm": 1.0481646060943604, "learning_rate": 4.876316382727749e-05, "loss": 0.6596, "num_input_tokens_seen": 14847728, "step": 25580 }, { "epoch": 3.810694072088174, "grad_norm": 0.6560296416282654, "learning_rate": 4.8762154215486225e-05, "loss": 0.7072, "num_input_tokens_seen": 14850768, "step": 25585 }, { "epoch": 3.811438784629133, "grad_norm": 0.8906235694885254, "learning_rate": 4.876114420225751e-05, "loss": 0.7981, "num_input_tokens_seen": 14853360, "step": 25590 }, { "epoch": 3.8121834971700923, "grad_norm": 0.9606361985206604, "learning_rate": 4.876013378760842e-05, "loss": 0.562, "num_input_tokens_seen": 14856400, "step": 25595 }, { "epoch": 3.8129282097110515, "grad_norm": 0.9313444495201111, "learning_rate": 4.875912297155601e-05, "loss": 0.5545, "num_input_tokens_seen": 14859344, "step": 25600 }, { "epoch": 3.8136729222520107, "grad_norm": 0.8070986270904541, "learning_rate": 4.875811175411737e-05, "loss": 0.5307, "num_input_tokens_seen": 14862288, "step": 25605 }, { "epoch": 3.81441763479297, "grad_norm": 1.115882158279419, "learning_rate": 4.875710013530958e-05, "loss": 0.6065, "num_input_tokens_seen": 14865360, "step": 25610 }, { "epoch": 3.815162347333929, "grad_norm": 0.8531510233879089, "learning_rate": 4.8756088115149724e-05, "loss": 0.6867, "num_input_tokens_seen": 14868112, "step": 25615 }, { "epoch": 3.8159070598748883, "grad_norm": 1.7709261178970337, "learning_rate": 4.8755075693654906e-05, "loss": 0.8128, "num_input_tokens_seen": 14871152, "step": 25620 }, { "epoch": 3.8166517724158475, "grad_norm": 2.2112293243408203, "learning_rate": 4.8754062870842234e-05, "loss": 0.7517, "num_input_tokens_seen": 14874096, "step": 25625 }, { "epoch": 3.8173964849568067, "grad_norm": 1.0421310663223267, "learning_rate": 4.87530496467288e-05, "loss": 0.5875, "num_input_tokens_seen": 14877136, "step": 25630 }, { "epoch": 3.818141197497766, "grad_norm": 0.915611207485199, "learning_rate": 4.875203602133174e-05, "loss": 0.5477, "num_input_tokens_seen": 14879952, "step": 25635 }, { "epoch": 3.818885910038725, "grad_norm": 0.630158543586731, "learning_rate": 4.875102199466817e-05, "loss": 0.5911, "num_input_tokens_seen": 14882736, "step": 25640 }, { "epoch": 3.8196306225796843, "grad_norm": 1.0670098066329956, "learning_rate": 4.875000756675523e-05, "loss": 0.6648, "num_input_tokens_seen": 14885968, "step": 25645 }, { "epoch": 3.8203753351206435, "grad_norm": 2.990708827972412, "learning_rate": 4.874899273761004e-05, "loss": 0.6243, "num_input_tokens_seen": 14889008, "step": 25650 }, { "epoch": 3.8211200476616027, "grad_norm": 1.138558030128479, "learning_rate": 4.8747977507249765e-05, "loss": 0.6036, "num_input_tokens_seen": 14891920, "step": 25655 }, { "epoch": 3.821864760202562, "grad_norm": 0.6953200697898865, "learning_rate": 4.874696187569154e-05, "loss": 0.4044, "num_input_tokens_seen": 14894448, "step": 25660 }, { "epoch": 3.822609472743521, "grad_norm": 1.2622164487838745, "learning_rate": 4.874594584295253e-05, "loss": 0.5612, "num_input_tokens_seen": 14897424, "step": 25665 }, { "epoch": 3.8233541852844803, "grad_norm": 1.2009556293487549, "learning_rate": 4.87449294090499e-05, "loss": 0.6663, "num_input_tokens_seen": 14900240, "step": 25670 }, { "epoch": 3.824098897825439, "grad_norm": 0.9872735142707825, "learning_rate": 4.874391257400083e-05, "loss": 0.806, "num_input_tokens_seen": 14903248, "step": 25675 }, { "epoch": 3.8248436103663987, "grad_norm": 0.7995824217796326, "learning_rate": 4.874289533782247e-05, "loss": 0.6356, "num_input_tokens_seen": 14906256, "step": 25680 }, { "epoch": 3.8255883229073575, "grad_norm": 0.668465256690979, "learning_rate": 4.874187770053204e-05, "loss": 0.7154, "num_input_tokens_seen": 14909264, "step": 25685 }, { "epoch": 3.826333035448317, "grad_norm": 1.3467073440551758, "learning_rate": 4.874085966214671e-05, "loss": 0.5163, "num_input_tokens_seen": 14911888, "step": 25690 }, { "epoch": 3.827077747989276, "grad_norm": 0.8188043236732483, "learning_rate": 4.873984122268369e-05, "loss": 0.5504, "num_input_tokens_seen": 14914672, "step": 25695 }, { "epoch": 3.8278224605302356, "grad_norm": 2.1615793704986572, "learning_rate": 4.873882238216017e-05, "loss": 0.7204, "num_input_tokens_seen": 14917808, "step": 25700 }, { "epoch": 3.8285671730711943, "grad_norm": 0.7764835357666016, "learning_rate": 4.873780314059338e-05, "loss": 0.6866, "num_input_tokens_seen": 14920848, "step": 25705 }, { "epoch": 3.829311885612154, "grad_norm": 0.7276891469955444, "learning_rate": 4.873678349800054e-05, "loss": 0.7057, "num_input_tokens_seen": 14923664, "step": 25710 }, { "epoch": 3.8300565981531127, "grad_norm": 2.5964276790618896, "learning_rate": 4.873576345439886e-05, "loss": 0.6668, "num_input_tokens_seen": 14926320, "step": 25715 }, { "epoch": 3.830801310694072, "grad_norm": 1.4490981101989746, "learning_rate": 4.873474300980558e-05, "loss": 0.7841, "num_input_tokens_seen": 14929328, "step": 25720 }, { "epoch": 3.831546023235031, "grad_norm": 0.8911262154579163, "learning_rate": 4.873372216423794e-05, "loss": 0.6536, "num_input_tokens_seen": 14932304, "step": 25725 }, { "epoch": 3.8322907357759903, "grad_norm": 0.552753746509552, "learning_rate": 4.8732700917713186e-05, "loss": 0.7507, "num_input_tokens_seen": 14935216, "step": 25730 }, { "epoch": 3.8330354483169495, "grad_norm": 1.088772177696228, "learning_rate": 4.8731679270248575e-05, "loss": 0.5907, "num_input_tokens_seen": 14938256, "step": 25735 }, { "epoch": 3.8337801608579087, "grad_norm": 1.484948992729187, "learning_rate": 4.8730657221861354e-05, "loss": 0.6936, "num_input_tokens_seen": 14941296, "step": 25740 }, { "epoch": 3.834524873398868, "grad_norm": 1.0205223560333252, "learning_rate": 4.8729634772568805e-05, "loss": 0.6674, "num_input_tokens_seen": 14944080, "step": 25745 }, { "epoch": 3.835269585939827, "grad_norm": 0.8737632632255554, "learning_rate": 4.872861192238819e-05, "loss": 0.813, "num_input_tokens_seen": 14947056, "step": 25750 }, { "epoch": 3.8360142984807863, "grad_norm": 0.8784990310668945, "learning_rate": 4.87275886713368e-05, "loss": 0.6302, "num_input_tokens_seen": 14949968, "step": 25755 }, { "epoch": 3.8367590110217455, "grad_norm": 1.1434122323989868, "learning_rate": 4.872656501943191e-05, "loss": 0.6676, "num_input_tokens_seen": 14952752, "step": 25760 }, { "epoch": 3.8375037235627047, "grad_norm": 1.0056360960006714, "learning_rate": 4.872554096669082e-05, "loss": 0.8655, "num_input_tokens_seen": 14955920, "step": 25765 }, { "epoch": 3.838248436103664, "grad_norm": 0.9934924244880676, "learning_rate": 4.8724516513130826e-05, "loss": 0.6923, "num_input_tokens_seen": 14958832, "step": 25770 }, { "epoch": 3.838993148644623, "grad_norm": 1.1484981775283813, "learning_rate": 4.872349165876924e-05, "loss": 0.6555, "num_input_tokens_seen": 14961968, "step": 25775 }, { "epoch": 3.8397378611855824, "grad_norm": 0.7789892554283142, "learning_rate": 4.872246640362337e-05, "loss": 0.7708, "num_input_tokens_seen": 14964592, "step": 25780 }, { "epoch": 3.8404825737265416, "grad_norm": 0.9697123169898987, "learning_rate": 4.872144074771054e-05, "loss": 0.7018, "num_input_tokens_seen": 14967376, "step": 25785 }, { "epoch": 3.8412272862675008, "grad_norm": 0.7289960384368896, "learning_rate": 4.872041469104809e-05, "loss": 0.6276, "num_input_tokens_seen": 14970352, "step": 25790 }, { "epoch": 3.84197199880846, "grad_norm": 1.3071585893630981, "learning_rate": 4.871938823365333e-05, "loss": 0.6558, "num_input_tokens_seen": 14972880, "step": 25795 }, { "epoch": 3.842716711349419, "grad_norm": 1.2934463024139404, "learning_rate": 4.871836137554362e-05, "loss": 0.7071, "num_input_tokens_seen": 14975632, "step": 25800 }, { "epoch": 3.8434614238903784, "grad_norm": 1.1477903127670288, "learning_rate": 4.8717334116736293e-05, "loss": 0.614, "num_input_tokens_seen": 14978864, "step": 25805 }, { "epoch": 3.8442061364313376, "grad_norm": 1.0310068130493164, "learning_rate": 4.8716306457248717e-05, "loss": 0.6089, "num_input_tokens_seen": 14981584, "step": 25810 }, { "epoch": 3.8449508489722968, "grad_norm": 1.7665411233901978, "learning_rate": 4.871527839709825e-05, "loss": 0.8603, "num_input_tokens_seen": 14984240, "step": 25815 }, { "epoch": 3.845695561513256, "grad_norm": 1.3688442707061768, "learning_rate": 4.871424993630226e-05, "loss": 0.7126, "num_input_tokens_seen": 14987248, "step": 25820 }, { "epoch": 3.846440274054215, "grad_norm": 0.8890618085861206, "learning_rate": 4.871322107487811e-05, "loss": 0.4315, "num_input_tokens_seen": 14990256, "step": 25825 }, { "epoch": 3.8471849865951744, "grad_norm": 0.7853980660438538, "learning_rate": 4.8712191812843194e-05, "loss": 0.682, "num_input_tokens_seen": 14993264, "step": 25830 }, { "epoch": 3.8479296991361336, "grad_norm": 0.9139350056648254, "learning_rate": 4.87111621502149e-05, "loss": 0.4898, "num_input_tokens_seen": 14995952, "step": 25835 }, { "epoch": 3.848674411677093, "grad_norm": 0.7383255958557129, "learning_rate": 4.871013208701062e-05, "loss": 0.5951, "num_input_tokens_seen": 14999216, "step": 25840 }, { "epoch": 3.849419124218052, "grad_norm": 0.9293065071105957, "learning_rate": 4.870910162324776e-05, "loss": 0.5516, "num_input_tokens_seen": 15002032, "step": 25845 }, { "epoch": 3.8501638367590107, "grad_norm": 1.0126999616622925, "learning_rate": 4.8708070758943716e-05, "loss": 0.7155, "num_input_tokens_seen": 15004816, "step": 25850 }, { "epoch": 3.8509085492999704, "grad_norm": 1.2841262817382812, "learning_rate": 4.870703949411591e-05, "loss": 0.8571, "num_input_tokens_seen": 15007856, "step": 25855 }, { "epoch": 3.851653261840929, "grad_norm": 0.9934714436531067, "learning_rate": 4.8706007828781776e-05, "loss": 0.5567, "num_input_tokens_seen": 15010736, "step": 25860 }, { "epoch": 3.852397974381889, "grad_norm": 0.7875568270683289, "learning_rate": 4.8704975762958734e-05, "loss": 0.6567, "num_input_tokens_seen": 15013680, "step": 25865 }, { "epoch": 3.8531426869228476, "grad_norm": 0.9832250475883484, "learning_rate": 4.8703943296664214e-05, "loss": 0.7234, "num_input_tokens_seen": 15016528, "step": 25870 }, { "epoch": 3.853887399463807, "grad_norm": 1.137824296951294, "learning_rate": 4.8702910429915663e-05, "loss": 0.7458, "num_input_tokens_seen": 15019728, "step": 25875 }, { "epoch": 3.854632112004766, "grad_norm": 1.4447041749954224, "learning_rate": 4.870187716273054e-05, "loss": 0.7082, "num_input_tokens_seen": 15022608, "step": 25880 }, { "epoch": 3.8553768245457256, "grad_norm": 0.7408069968223572, "learning_rate": 4.870084349512628e-05, "loss": 0.9163, "num_input_tokens_seen": 15025584, "step": 25885 }, { "epoch": 3.8561215370866844, "grad_norm": 0.6330254077911377, "learning_rate": 4.8699809427120364e-05, "loss": 0.5829, "num_input_tokens_seen": 15028560, "step": 25890 }, { "epoch": 3.8568662496276436, "grad_norm": 1.3144254684448242, "learning_rate": 4.869877495873025e-05, "loss": 0.5639, "num_input_tokens_seen": 15031376, "step": 25895 }, { "epoch": 3.8576109621686028, "grad_norm": 0.7276001572608948, "learning_rate": 4.869774008997343e-05, "loss": 0.622, "num_input_tokens_seen": 15034448, "step": 25900 }, { "epoch": 3.858355674709562, "grad_norm": 1.2555782794952393, "learning_rate": 4.869670482086737e-05, "loss": 0.4846, "num_input_tokens_seen": 15037424, "step": 25905 }, { "epoch": 3.859100387250521, "grad_norm": 0.8753920197486877, "learning_rate": 4.869566915142956e-05, "loss": 0.6392, "num_input_tokens_seen": 15040368, "step": 25910 }, { "epoch": 3.8598450997914804, "grad_norm": 1.89002525806427, "learning_rate": 4.8694633081677507e-05, "loss": 0.652, "num_input_tokens_seen": 15042864, "step": 25915 }, { "epoch": 3.8605898123324396, "grad_norm": 1.2481355667114258, "learning_rate": 4.869359661162871e-05, "loss": 0.618, "num_input_tokens_seen": 15045584, "step": 25920 }, { "epoch": 3.861334524873399, "grad_norm": 1.2994897365570068, "learning_rate": 4.869255974130068e-05, "loss": 0.5949, "num_input_tokens_seen": 15048304, "step": 25925 }, { "epoch": 3.862079237414358, "grad_norm": 1.1987457275390625, "learning_rate": 4.869152247071094e-05, "loss": 0.7399, "num_input_tokens_seen": 15051504, "step": 25930 }, { "epoch": 3.862823949955317, "grad_norm": 0.7969673275947571, "learning_rate": 4.8690484799877004e-05, "loss": 0.6379, "num_input_tokens_seen": 15054384, "step": 25935 }, { "epoch": 3.8635686624962764, "grad_norm": 1.4675123691558838, "learning_rate": 4.86894467288164e-05, "loss": 0.7642, "num_input_tokens_seen": 15057488, "step": 25940 }, { "epoch": 3.8643133750372356, "grad_norm": 1.785499095916748, "learning_rate": 4.868840825754667e-05, "loss": 0.7789, "num_input_tokens_seen": 15060240, "step": 25945 }, { "epoch": 3.865058087578195, "grad_norm": 2.017488956451416, "learning_rate": 4.868736938608536e-05, "loss": 0.7637, "num_input_tokens_seen": 15062992, "step": 25950 }, { "epoch": 3.865802800119154, "grad_norm": 0.7677405476570129, "learning_rate": 4.8686330114450025e-05, "loss": 0.8405, "num_input_tokens_seen": 15065808, "step": 25955 }, { "epoch": 3.866547512660113, "grad_norm": 0.40367016196250916, "learning_rate": 4.868529044265821e-05, "loss": 0.6099, "num_input_tokens_seen": 15068816, "step": 25960 }, { "epoch": 3.8672922252010724, "grad_norm": 1.2522507905960083, "learning_rate": 4.868425037072749e-05, "loss": 0.6051, "num_input_tokens_seen": 15071696, "step": 25965 }, { "epoch": 3.8680369377420316, "grad_norm": 0.6639984250068665, "learning_rate": 4.868320989867543e-05, "loss": 0.6125, "num_input_tokens_seen": 15074512, "step": 25970 }, { "epoch": 3.868781650282991, "grad_norm": 0.9630029201507568, "learning_rate": 4.868216902651961e-05, "loss": 0.6265, "num_input_tokens_seen": 15077680, "step": 25975 }, { "epoch": 3.86952636282395, "grad_norm": 0.9913626909255981, "learning_rate": 4.8681127754277606e-05, "loss": 0.5945, "num_input_tokens_seen": 15080464, "step": 25980 }, { "epoch": 3.870271075364909, "grad_norm": 1.1359819173812866, "learning_rate": 4.868008608196702e-05, "loss": 0.6649, "num_input_tokens_seen": 15083248, "step": 25985 }, { "epoch": 3.8710157879058684, "grad_norm": 0.6226510405540466, "learning_rate": 4.8679044009605455e-05, "loss": 0.7743, "num_input_tokens_seen": 15086192, "step": 25990 }, { "epoch": 3.8717605004468276, "grad_norm": 0.9685414433479309, "learning_rate": 4.867800153721051e-05, "loss": 0.6791, "num_input_tokens_seen": 15088944, "step": 25995 }, { "epoch": 3.872505212987787, "grad_norm": 0.8670291900634766, "learning_rate": 4.867695866479978e-05, "loss": 0.6865, "num_input_tokens_seen": 15091824, "step": 26000 }, { "epoch": 3.873249925528746, "grad_norm": 1.2999125719070435, "learning_rate": 4.86759153923909e-05, "loss": 0.8165, "num_input_tokens_seen": 15094992, "step": 26005 }, { "epoch": 3.8739946380697052, "grad_norm": 1.1615688800811768, "learning_rate": 4.86748717200015e-05, "loss": 0.8045, "num_input_tokens_seen": 15097904, "step": 26010 }, { "epoch": 3.8747393506106644, "grad_norm": 3.4180996417999268, "learning_rate": 4.8673827647649206e-05, "loss": 0.6666, "num_input_tokens_seen": 15100848, "step": 26015 }, { "epoch": 3.8754840631516236, "grad_norm": 1.068251609802246, "learning_rate": 4.867278317535164e-05, "loss": 0.653, "num_input_tokens_seen": 15103760, "step": 26020 }, { "epoch": 3.8762287756925824, "grad_norm": 1.52351713180542, "learning_rate": 4.867173830312648e-05, "loss": 0.6575, "num_input_tokens_seen": 15106448, "step": 26025 }, { "epoch": 3.876973488233542, "grad_norm": 0.8804477453231812, "learning_rate": 4.867069303099135e-05, "loss": 0.5869, "num_input_tokens_seen": 15109360, "step": 26030 }, { "epoch": 3.877718200774501, "grad_norm": 1.2626911401748657, "learning_rate": 4.8669647358963924e-05, "loss": 0.7479, "num_input_tokens_seen": 15112624, "step": 26035 }, { "epoch": 3.8784629133154604, "grad_norm": 0.7737899422645569, "learning_rate": 4.866860128706186e-05, "loss": 0.5178, "num_input_tokens_seen": 15115280, "step": 26040 }, { "epoch": 3.879207625856419, "grad_norm": 0.9587374925613403, "learning_rate": 4.866755481530284e-05, "loss": 0.6202, "num_input_tokens_seen": 15118000, "step": 26045 }, { "epoch": 3.879952338397379, "grad_norm": 2.1143558025360107, "learning_rate": 4.866650794370452e-05, "loss": 0.783, "num_input_tokens_seen": 15120656, "step": 26050 }, { "epoch": 3.8806970509383376, "grad_norm": 1.0770020484924316, "learning_rate": 4.866546067228461e-05, "loss": 0.7692, "num_input_tokens_seen": 15123728, "step": 26055 }, { "epoch": 3.8814417634792973, "grad_norm": 0.9982559680938721, "learning_rate": 4.866441300106081e-05, "loss": 0.6141, "num_input_tokens_seen": 15126480, "step": 26060 }, { "epoch": 3.882186476020256, "grad_norm": 0.7761786580085754, "learning_rate": 4.866336493005078e-05, "loss": 0.6278, "num_input_tokens_seen": 15129136, "step": 26065 }, { "epoch": 3.882931188561215, "grad_norm": 1.152209997177124, "learning_rate": 4.866231645927226e-05, "loss": 0.7095, "num_input_tokens_seen": 15132208, "step": 26070 }, { "epoch": 3.8836759011021744, "grad_norm": 1.0383977890014648, "learning_rate": 4.866126758874295e-05, "loss": 0.552, "num_input_tokens_seen": 15135056, "step": 26075 }, { "epoch": 3.8844206136431336, "grad_norm": 0.9580537676811218, "learning_rate": 4.8660218318480574e-05, "loss": 0.6432, "num_input_tokens_seen": 15137904, "step": 26080 }, { "epoch": 3.885165326184093, "grad_norm": 0.7364570498466492, "learning_rate": 4.865916864850286e-05, "loss": 0.6236, "num_input_tokens_seen": 15140656, "step": 26085 }, { "epoch": 3.885910038725052, "grad_norm": 0.8851413726806641, "learning_rate": 4.865811857882754e-05, "loss": 0.5333, "num_input_tokens_seen": 15143600, "step": 26090 }, { "epoch": 3.8866547512660112, "grad_norm": 2.6517348289489746, "learning_rate": 4.8657068109472345e-05, "loss": 0.8608, "num_input_tokens_seen": 15146768, "step": 26095 }, { "epoch": 3.8873994638069704, "grad_norm": 1.3757407665252686, "learning_rate": 4.8656017240455025e-05, "loss": 0.7087, "num_input_tokens_seen": 15149776, "step": 26100 }, { "epoch": 3.8881441763479296, "grad_norm": 0.9839833378791809, "learning_rate": 4.865496597179334e-05, "loss": 0.6289, "num_input_tokens_seen": 15152592, "step": 26105 }, { "epoch": 3.888888888888889, "grad_norm": 0.8514261245727539, "learning_rate": 4.8653914303505054e-05, "loss": 0.5308, "num_input_tokens_seen": 15155216, "step": 26110 }, { "epoch": 3.889633601429848, "grad_norm": 1.3826457262039185, "learning_rate": 4.865286223560792e-05, "loss": 0.6155, "num_input_tokens_seen": 15158352, "step": 26115 }, { "epoch": 3.8903783139708072, "grad_norm": 0.9534000158309937, "learning_rate": 4.865180976811972e-05, "loss": 0.7768, "num_input_tokens_seen": 15161328, "step": 26120 }, { "epoch": 3.8911230265117664, "grad_norm": 0.7134515047073364, "learning_rate": 4.8650756901058225e-05, "loss": 0.6682, "num_input_tokens_seen": 15164208, "step": 26125 }, { "epoch": 3.8918677390527256, "grad_norm": 1.0286879539489746, "learning_rate": 4.864970363444124e-05, "loss": 0.8213, "num_input_tokens_seen": 15166992, "step": 26130 }, { "epoch": 3.892612451593685, "grad_norm": 1.3260667324066162, "learning_rate": 4.864864996828654e-05, "loss": 0.7404, "num_input_tokens_seen": 15171056, "step": 26135 }, { "epoch": 3.893357164134644, "grad_norm": 1.6854124069213867, "learning_rate": 4.864759590261194e-05, "loss": 0.6049, "num_input_tokens_seen": 15173840, "step": 26140 }, { "epoch": 3.8941018766756033, "grad_norm": 2.2508018016815186, "learning_rate": 4.8646541437435246e-05, "loss": 0.7829, "num_input_tokens_seen": 15176752, "step": 26145 }, { "epoch": 3.8948465892165625, "grad_norm": 1.0670433044433594, "learning_rate": 4.8645486572774266e-05, "loss": 0.7746, "num_input_tokens_seen": 15179440, "step": 26150 }, { "epoch": 3.8955913017575217, "grad_norm": 0.699644148349762, "learning_rate": 4.8644431308646815e-05, "loss": 0.655, "num_input_tokens_seen": 15182384, "step": 26155 }, { "epoch": 3.896336014298481, "grad_norm": 0.746639609336853, "learning_rate": 4.8643375645070735e-05, "loss": 0.6061, "num_input_tokens_seen": 15185200, "step": 26160 }, { "epoch": 3.89708072683944, "grad_norm": 0.9584206342697144, "learning_rate": 4.864231958206384e-05, "loss": 0.6385, "num_input_tokens_seen": 15188368, "step": 26165 }, { "epoch": 3.8978254393803993, "grad_norm": 0.7210211157798767, "learning_rate": 4.8641263119644004e-05, "loss": 0.5936, "num_input_tokens_seen": 15191152, "step": 26170 }, { "epoch": 3.8985701519213585, "grad_norm": 1.2965197563171387, "learning_rate": 4.864020625782905e-05, "loss": 0.6347, "num_input_tokens_seen": 15194096, "step": 26175 }, { "epoch": 3.8993148644623177, "grad_norm": 1.2559678554534912, "learning_rate": 4.863914899663683e-05, "loss": 0.7864, "num_input_tokens_seen": 15196976, "step": 26180 }, { "epoch": 3.900059577003277, "grad_norm": 0.8168959617614746, "learning_rate": 4.8638091336085224e-05, "loss": 0.6504, "num_input_tokens_seen": 15199536, "step": 26185 }, { "epoch": 3.900804289544236, "grad_norm": 0.6630781888961792, "learning_rate": 4.863703327619208e-05, "loss": 0.6636, "num_input_tokens_seen": 15202288, "step": 26190 }, { "epoch": 3.9015490020851953, "grad_norm": 0.9397679567337036, "learning_rate": 4.863597481697528e-05, "loss": 0.5558, "num_input_tokens_seen": 15205424, "step": 26195 }, { "epoch": 3.902293714626154, "grad_norm": 1.000444769859314, "learning_rate": 4.8634915958452724e-05, "loss": 0.6096, "num_input_tokens_seen": 15208272, "step": 26200 }, { "epoch": 3.9030384271671137, "grad_norm": 0.6726030111312866, "learning_rate": 4.863385670064227e-05, "loss": 0.7128, "num_input_tokens_seen": 15211120, "step": 26205 }, { "epoch": 3.9037831397080724, "grad_norm": 1.4896752834320068, "learning_rate": 4.863279704356183e-05, "loss": 0.6889, "num_input_tokens_seen": 15213936, "step": 26210 }, { "epoch": 3.904527852249032, "grad_norm": 0.5285993218421936, "learning_rate": 4.863173698722931e-05, "loss": 0.5632, "num_input_tokens_seen": 15216560, "step": 26215 }, { "epoch": 3.905272564789991, "grad_norm": 1.4052382707595825, "learning_rate": 4.863067653166261e-05, "loss": 0.6616, "num_input_tokens_seen": 15219472, "step": 26220 }, { "epoch": 3.9060172773309505, "grad_norm": 0.8616143465042114, "learning_rate": 4.8629615676879634e-05, "loss": 0.575, "num_input_tokens_seen": 15222832, "step": 26225 }, { "epoch": 3.9067619898719093, "grad_norm": 0.826878011226654, "learning_rate": 4.8628554422898334e-05, "loss": 0.563, "num_input_tokens_seen": 15225808, "step": 26230 }, { "epoch": 3.907506702412869, "grad_norm": 0.6242923736572266, "learning_rate": 4.8627492769736616e-05, "loss": 0.5942, "num_input_tokens_seen": 15228688, "step": 26235 }, { "epoch": 3.9082514149538277, "grad_norm": 0.9383857250213623, "learning_rate": 4.862643071741242e-05, "loss": 0.6509, "num_input_tokens_seen": 15231888, "step": 26240 }, { "epoch": 3.908996127494787, "grad_norm": 0.8798577189445496, "learning_rate": 4.8625368265943696e-05, "loss": 0.6204, "num_input_tokens_seen": 15234960, "step": 26245 }, { "epoch": 3.909740840035746, "grad_norm": 1.484930396080017, "learning_rate": 4.8624305415348374e-05, "loss": 0.5768, "num_input_tokens_seen": 15237744, "step": 26250 }, { "epoch": 3.9104855525767053, "grad_norm": 0.7769575715065002, "learning_rate": 4.8623242165644436e-05, "loss": 0.7799, "num_input_tokens_seen": 15240624, "step": 26255 }, { "epoch": 3.9112302651176645, "grad_norm": 1.4825189113616943, "learning_rate": 4.8622178516849824e-05, "loss": 0.7149, "num_input_tokens_seen": 15243536, "step": 26260 }, { "epoch": 3.9119749776586237, "grad_norm": 0.7372304201126099, "learning_rate": 4.862111446898252e-05, "loss": 0.7086, "num_input_tokens_seen": 15246640, "step": 26265 }, { "epoch": 3.912719690199583, "grad_norm": 0.577366292476654, "learning_rate": 4.862005002206049e-05, "loss": 0.7006, "num_input_tokens_seen": 15249456, "step": 26270 }, { "epoch": 3.913464402740542, "grad_norm": 1.4120677709579468, "learning_rate": 4.8618985176101716e-05, "loss": 0.7095, "num_input_tokens_seen": 15252592, "step": 26275 }, { "epoch": 3.9142091152815013, "grad_norm": 0.8862597346305847, "learning_rate": 4.86179199311242e-05, "loss": 0.6561, "num_input_tokens_seen": 15255728, "step": 26280 }, { "epoch": 3.9149538278224605, "grad_norm": 0.9358404874801636, "learning_rate": 4.861685428714593e-05, "loss": 0.7054, "num_input_tokens_seen": 15258384, "step": 26285 }, { "epoch": 3.9156985403634197, "grad_norm": 1.2764896154403687, "learning_rate": 4.861578824418491e-05, "loss": 0.6634, "num_input_tokens_seen": 15261104, "step": 26290 }, { "epoch": 3.916443252904379, "grad_norm": 1.0642592906951904, "learning_rate": 4.861472180225915e-05, "loss": 0.7144, "num_input_tokens_seen": 15263824, "step": 26295 }, { "epoch": 3.917187965445338, "grad_norm": 1.0047862529754639, "learning_rate": 4.861365496138667e-05, "loss": 0.6303, "num_input_tokens_seen": 15266640, "step": 26300 }, { "epoch": 3.9179326779862973, "grad_norm": 1.1816933155059814, "learning_rate": 4.861258772158548e-05, "loss": 0.7074, "num_input_tokens_seen": 15269552, "step": 26305 }, { "epoch": 3.9186773905272565, "grad_norm": 0.7779901623725891, "learning_rate": 4.861152008287362e-05, "loss": 0.641, "num_input_tokens_seen": 15272560, "step": 26310 }, { "epoch": 3.9194221030682157, "grad_norm": 1.0756562948226929, "learning_rate": 4.861045204526913e-05, "loss": 0.5892, "num_input_tokens_seen": 15275312, "step": 26315 }, { "epoch": 3.920166815609175, "grad_norm": 1.2747008800506592, "learning_rate": 4.8609383608790046e-05, "loss": 0.6877, "num_input_tokens_seen": 15278256, "step": 26320 }, { "epoch": 3.920911528150134, "grad_norm": 0.6570281982421875, "learning_rate": 4.860831477345443e-05, "loss": 0.6681, "num_input_tokens_seen": 15280944, "step": 26325 }, { "epoch": 3.9216562406910933, "grad_norm": 0.8123558759689331, "learning_rate": 4.860724553928032e-05, "loss": 0.6264, "num_input_tokens_seen": 15283568, "step": 26330 }, { "epoch": 3.9224009532320525, "grad_norm": 2.469904899597168, "learning_rate": 4.86061759062858e-05, "loss": 0.6984, "num_input_tokens_seen": 15286320, "step": 26335 }, { "epoch": 3.9231456657730117, "grad_norm": 0.8362818360328674, "learning_rate": 4.8605105874488924e-05, "loss": 0.6932, "num_input_tokens_seen": 15288976, "step": 26340 }, { "epoch": 3.923890378313971, "grad_norm": 0.7959244251251221, "learning_rate": 4.8604035443907775e-05, "loss": 0.5987, "num_input_tokens_seen": 15291728, "step": 26345 }, { "epoch": 3.92463509085493, "grad_norm": 1.827812910079956, "learning_rate": 4.860296461456044e-05, "loss": 0.6937, "num_input_tokens_seen": 15294416, "step": 26350 }, { "epoch": 3.9253798033958893, "grad_norm": 0.9040415287017822, "learning_rate": 4.8601893386465e-05, "loss": 0.5811, "num_input_tokens_seen": 15297264, "step": 26355 }, { "epoch": 3.9261245159368485, "grad_norm": 1.6696906089782715, "learning_rate": 4.860082175963957e-05, "loss": 0.6321, "num_input_tokens_seen": 15300368, "step": 26360 }, { "epoch": 3.9268692284778077, "grad_norm": 0.9935216903686523, "learning_rate": 4.859974973410224e-05, "loss": 0.7533, "num_input_tokens_seen": 15302960, "step": 26365 }, { "epoch": 3.927613941018767, "grad_norm": 1.6185449361801147, "learning_rate": 4.8598677309871123e-05, "loss": 0.7512, "num_input_tokens_seen": 15305648, "step": 26370 }, { "epoch": 3.9283586535597257, "grad_norm": 2.0993504524230957, "learning_rate": 4.859760448696433e-05, "loss": 0.8155, "num_input_tokens_seen": 15308624, "step": 26375 }, { "epoch": 3.9291033661006853, "grad_norm": 0.9080309271812439, "learning_rate": 4.85965312654e-05, "loss": 0.7108, "num_input_tokens_seen": 15311440, "step": 26380 }, { "epoch": 3.929848078641644, "grad_norm": 0.7621368765830994, "learning_rate": 4.859545764519625e-05, "loss": 0.5779, "num_input_tokens_seen": 15314224, "step": 26385 }, { "epoch": 3.9305927911826037, "grad_norm": 1.6094390153884888, "learning_rate": 4.859438362637123e-05, "loss": 0.8063, "num_input_tokens_seen": 15317296, "step": 26390 }, { "epoch": 3.9313375037235625, "grad_norm": 0.851847767829895, "learning_rate": 4.8593309208943085e-05, "loss": 0.6138, "num_input_tokens_seen": 15320080, "step": 26395 }, { "epoch": 3.932082216264522, "grad_norm": 1.6840879917144775, "learning_rate": 4.859223439292995e-05, "loss": 0.5678, "num_input_tokens_seen": 15322768, "step": 26400 }, { "epoch": 3.932826928805481, "grad_norm": 1.091382384300232, "learning_rate": 4.859115917835e-05, "loss": 0.6774, "num_input_tokens_seen": 15325776, "step": 26405 }, { "epoch": 3.9335716413464406, "grad_norm": 0.8254765868186951, "learning_rate": 4.859008356522139e-05, "loss": 0.6422, "num_input_tokens_seen": 15328720, "step": 26410 }, { "epoch": 3.9343163538873993, "grad_norm": 0.9805375933647156, "learning_rate": 4.8589007553562293e-05, "loss": 0.6173, "num_input_tokens_seen": 15331664, "step": 26415 }, { "epoch": 3.9350610664283585, "grad_norm": 1.3502905368804932, "learning_rate": 4.858793114339089e-05, "loss": 0.5792, "num_input_tokens_seen": 15334928, "step": 26420 }, { "epoch": 3.9358057789693177, "grad_norm": 0.803860604763031, "learning_rate": 4.8586854334725365e-05, "loss": 0.6384, "num_input_tokens_seen": 15337680, "step": 26425 }, { "epoch": 3.936550491510277, "grad_norm": 2.3145570755004883, "learning_rate": 4.8585777127583906e-05, "loss": 0.7447, "num_input_tokens_seen": 15340592, "step": 26430 }, { "epoch": 3.937295204051236, "grad_norm": 0.6502106785774231, "learning_rate": 4.858469952198471e-05, "loss": 0.6971, "num_input_tokens_seen": 15343920, "step": 26435 }, { "epoch": 3.9380399165921953, "grad_norm": 0.776257336139679, "learning_rate": 4.8583621517945995e-05, "loss": 0.6525, "num_input_tokens_seen": 15346544, "step": 26440 }, { "epoch": 3.9387846291331545, "grad_norm": 0.67263263463974, "learning_rate": 4.858254311548596e-05, "loss": 0.7411, "num_input_tokens_seen": 15349520, "step": 26445 }, { "epoch": 3.9395293416741137, "grad_norm": 0.6777185201644897, "learning_rate": 4.858146431462283e-05, "loss": 0.7022, "num_input_tokens_seen": 15352496, "step": 26450 }, { "epoch": 3.940274054215073, "grad_norm": 1.1554031372070312, "learning_rate": 4.858038511537482e-05, "loss": 0.682, "num_input_tokens_seen": 15355088, "step": 26455 }, { "epoch": 3.941018766756032, "grad_norm": 0.8502023816108704, "learning_rate": 4.857930551776017e-05, "loss": 0.7042, "num_input_tokens_seen": 15357936, "step": 26460 }, { "epoch": 3.9417634792969913, "grad_norm": 1.0762159824371338, "learning_rate": 4.857822552179713e-05, "loss": 0.6848, "num_input_tokens_seen": 15360592, "step": 26465 }, { "epoch": 3.9425081918379505, "grad_norm": 2.197274923324585, "learning_rate": 4.857714512750392e-05, "loss": 0.768, "num_input_tokens_seen": 15363664, "step": 26470 }, { "epoch": 3.9432529043789097, "grad_norm": 2.129564046859741, "learning_rate": 4.857606433489881e-05, "loss": 0.7192, "num_input_tokens_seen": 15366608, "step": 26475 }, { "epoch": 3.943997616919869, "grad_norm": 0.8809202313423157, "learning_rate": 4.8574983144000055e-05, "loss": 0.5622, "num_input_tokens_seen": 15369744, "step": 26480 }, { "epoch": 3.944742329460828, "grad_norm": 1.36704683303833, "learning_rate": 4.8573901554825915e-05, "loss": 0.7847, "num_input_tokens_seen": 15372624, "step": 26485 }, { "epoch": 3.9454870420017873, "grad_norm": 1.5550464391708374, "learning_rate": 4.857281956739468e-05, "loss": 0.5978, "num_input_tokens_seen": 15375440, "step": 26490 }, { "epoch": 3.9462317545427466, "grad_norm": 1.6656073331832886, "learning_rate": 4.8571737181724606e-05, "loss": 0.5955, "num_input_tokens_seen": 15378128, "step": 26495 }, { "epoch": 3.9469764670837058, "grad_norm": 1.128106951713562, "learning_rate": 4.8570654397834e-05, "loss": 0.7143, "num_input_tokens_seen": 15380848, "step": 26500 }, { "epoch": 3.947721179624665, "grad_norm": 2.6377298831939697, "learning_rate": 4.856957121574114e-05, "loss": 0.7235, "num_input_tokens_seen": 15383600, "step": 26505 }, { "epoch": 3.948465892165624, "grad_norm": 0.8436094522476196, "learning_rate": 4.856848763546433e-05, "loss": 0.8053, "num_input_tokens_seen": 15386288, "step": 26510 }, { "epoch": 3.9492106047065834, "grad_norm": 0.5476636290550232, "learning_rate": 4.856740365702187e-05, "loss": 0.5737, "num_input_tokens_seen": 15389040, "step": 26515 }, { "epoch": 3.9499553172475426, "grad_norm": 0.7313412427902222, "learning_rate": 4.8566319280432085e-05, "loss": 0.5923, "num_input_tokens_seen": 15392080, "step": 26520 }, { "epoch": 3.9507000297885018, "grad_norm": 1.0691039562225342, "learning_rate": 4.8565234505713276e-05, "loss": 0.6541, "num_input_tokens_seen": 15394768, "step": 26525 }, { "epoch": 3.951444742329461, "grad_norm": 0.7221081256866455, "learning_rate": 4.856414933288379e-05, "loss": 0.5751, "num_input_tokens_seen": 15397648, "step": 26530 }, { "epoch": 3.95218945487042, "grad_norm": 0.8913072943687439, "learning_rate": 4.856306376196195e-05, "loss": 0.6766, "num_input_tokens_seen": 15400400, "step": 26535 }, { "epoch": 3.9529341674113794, "grad_norm": 0.7039285898208618, "learning_rate": 4.856197779296609e-05, "loss": 0.477, "num_input_tokens_seen": 15403152, "step": 26540 }, { "epoch": 3.9536788799523386, "grad_norm": 1.34983229637146, "learning_rate": 4.856089142591457e-05, "loss": 0.6825, "num_input_tokens_seen": 15405936, "step": 26545 }, { "epoch": 3.9544235924932973, "grad_norm": 1.788457989692688, "learning_rate": 4.855980466082574e-05, "loss": 0.6723, "num_input_tokens_seen": 15408976, "step": 26550 }, { "epoch": 3.955168305034257, "grad_norm": 1.1846686601638794, "learning_rate": 4.855871749771794e-05, "loss": 0.9638, "num_input_tokens_seen": 15411824, "step": 26555 }, { "epoch": 3.9559130175752157, "grad_norm": 0.9633419513702393, "learning_rate": 4.855762993660956e-05, "loss": 0.7927, "num_input_tokens_seen": 15414576, "step": 26560 }, { "epoch": 3.9566577301161754, "grad_norm": 0.8193262815475464, "learning_rate": 4.855654197751896e-05, "loss": 0.7758, "num_input_tokens_seen": 15417456, "step": 26565 }, { "epoch": 3.957402442657134, "grad_norm": 1.3777625560760498, "learning_rate": 4.855545362046454e-05, "loss": 0.8115, "num_input_tokens_seen": 15420528, "step": 26570 }, { "epoch": 3.958147155198094, "grad_norm": 0.6277323961257935, "learning_rate": 4.855436486546466e-05, "loss": 0.8157, "num_input_tokens_seen": 15423536, "step": 26575 }, { "epoch": 3.9588918677390526, "grad_norm": 1.1176059246063232, "learning_rate": 4.855327571253773e-05, "loss": 0.7618, "num_input_tokens_seen": 15426160, "step": 26580 }, { "epoch": 3.9596365802800118, "grad_norm": 1.4113640785217285, "learning_rate": 4.855218616170214e-05, "loss": 0.5523, "num_input_tokens_seen": 15429296, "step": 26585 }, { "epoch": 3.960381292820971, "grad_norm": 1.0066375732421875, "learning_rate": 4.855109621297631e-05, "loss": 0.5944, "num_input_tokens_seen": 15432112, "step": 26590 }, { "epoch": 3.96112600536193, "grad_norm": 0.6187604665756226, "learning_rate": 4.855000586637864e-05, "loss": 0.6357, "num_input_tokens_seen": 15435088, "step": 26595 }, { "epoch": 3.9618707179028894, "grad_norm": 0.93521648645401, "learning_rate": 4.854891512192755e-05, "loss": 0.6806, "num_input_tokens_seen": 15437840, "step": 26600 }, { "epoch": 3.9626154304438486, "grad_norm": 0.7943716645240784, "learning_rate": 4.8547823979641484e-05, "loss": 0.7788, "num_input_tokens_seen": 15440720, "step": 26605 }, { "epoch": 3.9633601429848078, "grad_norm": 0.986230731010437, "learning_rate": 4.854673243953886e-05, "loss": 0.6078, "num_input_tokens_seen": 15443600, "step": 26610 }, { "epoch": 3.964104855525767, "grad_norm": 0.9784659743309021, "learning_rate": 4.854564050163812e-05, "loss": 0.5801, "num_input_tokens_seen": 15446448, "step": 26615 }, { "epoch": 3.964849568066726, "grad_norm": 1.4570844173431396, "learning_rate": 4.854454816595773e-05, "loss": 0.6537, "num_input_tokens_seen": 15449072, "step": 26620 }, { "epoch": 3.9655942806076854, "grad_norm": 0.7307347655296326, "learning_rate": 4.854345543251611e-05, "loss": 0.6831, "num_input_tokens_seen": 15451856, "step": 26625 }, { "epoch": 3.9663389931486446, "grad_norm": 1.141116976737976, "learning_rate": 4.854236230133175e-05, "loss": 0.8198, "num_input_tokens_seen": 15454896, "step": 26630 }, { "epoch": 3.967083705689604, "grad_norm": 1.6091253757476807, "learning_rate": 4.85412687724231e-05, "loss": 0.8605, "num_input_tokens_seen": 15457808, "step": 26635 }, { "epoch": 3.967828418230563, "grad_norm": 1.3041651248931885, "learning_rate": 4.854017484580864e-05, "loss": 0.6388, "num_input_tokens_seen": 15460624, "step": 26640 }, { "epoch": 3.968573130771522, "grad_norm": 0.9527943134307861, "learning_rate": 4.853908052150685e-05, "loss": 0.7198, "num_input_tokens_seen": 15463472, "step": 26645 }, { "epoch": 3.9693178433124814, "grad_norm": 0.7210283875465393, "learning_rate": 4.8537985799536226e-05, "loss": 0.6645, "num_input_tokens_seen": 15466320, "step": 26650 }, { "epoch": 3.9700625558534406, "grad_norm": 0.8502152562141418, "learning_rate": 4.853689067991525e-05, "loss": 0.6062, "num_input_tokens_seen": 15469168, "step": 26655 }, { "epoch": 3.9708072683944, "grad_norm": 1.6836601495742798, "learning_rate": 4.853579516266243e-05, "loss": 0.7696, "num_input_tokens_seen": 15471888, "step": 26660 }, { "epoch": 3.971551980935359, "grad_norm": 1.3454784154891968, "learning_rate": 4.853469924779627e-05, "loss": 0.8292, "num_input_tokens_seen": 15475088, "step": 26665 }, { "epoch": 3.972296693476318, "grad_norm": 0.9594107270240784, "learning_rate": 4.853360293533529e-05, "loss": 0.578, "num_input_tokens_seen": 15478064, "step": 26670 }, { "epoch": 3.9730414060172774, "grad_norm": 2.1225619316101074, "learning_rate": 4.8532506225298004e-05, "loss": 0.6002, "num_input_tokens_seen": 15480912, "step": 26675 }, { "epoch": 3.9737861185582366, "grad_norm": 1.0373047590255737, "learning_rate": 4.853140911770294e-05, "loss": 0.6932, "num_input_tokens_seen": 15483952, "step": 26680 }, { "epoch": 3.974530831099196, "grad_norm": 0.7588688731193542, "learning_rate": 4.853031161256863e-05, "loss": 0.6071, "num_input_tokens_seen": 15487088, "step": 26685 }, { "epoch": 3.975275543640155, "grad_norm": 0.9764150381088257, "learning_rate": 4.8529213709913626e-05, "loss": 0.7427, "num_input_tokens_seen": 15489872, "step": 26690 }, { "epoch": 3.976020256181114, "grad_norm": 1.5465682744979858, "learning_rate": 4.852811540975647e-05, "loss": 0.6388, "num_input_tokens_seen": 15492784, "step": 26695 }, { "epoch": 3.9767649687220734, "grad_norm": 1.235634207725525, "learning_rate": 4.8527016712115725e-05, "loss": 0.768, "num_input_tokens_seen": 15495440, "step": 26700 }, { "epoch": 3.9775096812630326, "grad_norm": 1.140788197517395, "learning_rate": 4.8525917617009945e-05, "loss": 0.8428, "num_input_tokens_seen": 15498480, "step": 26705 }, { "epoch": 3.978254393803992, "grad_norm": 1.3356873989105225, "learning_rate": 4.8524818124457684e-05, "loss": 0.6519, "num_input_tokens_seen": 15501424, "step": 26710 }, { "epoch": 3.9789991063449506, "grad_norm": 0.8938317894935608, "learning_rate": 4.852371823447753e-05, "loss": 0.5015, "num_input_tokens_seen": 15504304, "step": 26715 }, { "epoch": 3.9797438188859102, "grad_norm": 0.6829257607460022, "learning_rate": 4.852261794708808e-05, "loss": 0.7291, "num_input_tokens_seen": 15507440, "step": 26720 }, { "epoch": 3.980488531426869, "grad_norm": 1.1290736198425293, "learning_rate": 4.8521517262307895e-05, "loss": 0.6471, "num_input_tokens_seen": 15510256, "step": 26725 }, { "epoch": 3.9812332439678286, "grad_norm": 1.0569655895233154, "learning_rate": 4.8520416180155594e-05, "loss": 0.6802, "num_input_tokens_seen": 15513168, "step": 26730 }, { "epoch": 3.9819779565087874, "grad_norm": 0.978878915309906, "learning_rate": 4.8519314700649757e-05, "loss": 0.5894, "num_input_tokens_seen": 15516368, "step": 26735 }, { "epoch": 3.982722669049747, "grad_norm": 1.0624253749847412, "learning_rate": 4.8518212823809e-05, "loss": 0.6752, "num_input_tokens_seen": 15518928, "step": 26740 }, { "epoch": 3.983467381590706, "grad_norm": 0.7212386727333069, "learning_rate": 4.851711054965194e-05, "loss": 0.5821, "num_input_tokens_seen": 15521808, "step": 26745 }, { "epoch": 3.9842120941316654, "grad_norm": 1.4969557523727417, "learning_rate": 4.851600787819721e-05, "loss": 0.7917, "num_input_tokens_seen": 15524624, "step": 26750 }, { "epoch": 3.984956806672624, "grad_norm": 1.1415687799453735, "learning_rate": 4.851490480946342e-05, "loss": 0.6024, "num_input_tokens_seen": 15527184, "step": 26755 }, { "epoch": 3.9857015192135834, "grad_norm": 0.9387511610984802, "learning_rate": 4.851380134346921e-05, "loss": 0.7485, "num_input_tokens_seen": 15530064, "step": 26760 }, { "epoch": 3.9864462317545426, "grad_norm": 0.7245672941207886, "learning_rate": 4.851269748023323e-05, "loss": 0.6146, "num_input_tokens_seen": 15533168, "step": 26765 }, { "epoch": 3.987190944295502, "grad_norm": 0.8471117615699768, "learning_rate": 4.851159321977412e-05, "loss": 0.7737, "num_input_tokens_seen": 15536240, "step": 26770 }, { "epoch": 3.987935656836461, "grad_norm": 1.1751195192337036, "learning_rate": 4.851048856211054e-05, "loss": 0.5969, "num_input_tokens_seen": 15539120, "step": 26775 }, { "epoch": 3.98868036937742, "grad_norm": 0.9804458618164062, "learning_rate": 4.850938350726115e-05, "loss": 0.6348, "num_input_tokens_seen": 15542096, "step": 26780 }, { "epoch": 3.9894250819183794, "grad_norm": 0.9948327541351318, "learning_rate": 4.8508278055244625e-05, "loss": 0.5881, "num_input_tokens_seen": 15544976, "step": 26785 }, { "epoch": 3.9901697944593386, "grad_norm": 1.007909893989563, "learning_rate": 4.8507172206079625e-05, "loss": 0.5129, "num_input_tokens_seen": 15547888, "step": 26790 }, { "epoch": 3.990914507000298, "grad_norm": 0.7968320846557617, "learning_rate": 4.8506065959784854e-05, "loss": 0.634, "num_input_tokens_seen": 15550960, "step": 26795 }, { "epoch": 3.991659219541257, "grad_norm": 0.7995680570602417, "learning_rate": 4.8504959316378974e-05, "loss": 0.6756, "num_input_tokens_seen": 15553552, "step": 26800 }, { "epoch": 3.9924039320822162, "grad_norm": 0.97385573387146, "learning_rate": 4.850385227588071e-05, "loss": 0.5531, "num_input_tokens_seen": 15556272, "step": 26805 }, { "epoch": 3.9931486446231754, "grad_norm": 1.0413224697113037, "learning_rate": 4.8502744838308744e-05, "loss": 0.6914, "num_input_tokens_seen": 15559408, "step": 26810 }, { "epoch": 3.9938933571641346, "grad_norm": 1.1386529207229614, "learning_rate": 4.8501637003681786e-05, "loss": 0.7209, "num_input_tokens_seen": 15562352, "step": 26815 }, { "epoch": 3.994638069705094, "grad_norm": 0.7797432541847229, "learning_rate": 4.850052877201857e-05, "loss": 0.7646, "num_input_tokens_seen": 15565360, "step": 26820 }, { "epoch": 3.995382782246053, "grad_norm": 1.7743433713912964, "learning_rate": 4.8499420143337795e-05, "loss": 0.7615, "num_input_tokens_seen": 15568176, "step": 26825 }, { "epoch": 3.9961274947870122, "grad_norm": 0.9558546543121338, "learning_rate": 4.84983111176582e-05, "loss": 0.6893, "num_input_tokens_seen": 15571152, "step": 26830 }, { "epoch": 3.9968722073279714, "grad_norm": 1.0500881671905518, "learning_rate": 4.8497201694998526e-05, "loss": 0.6753, "num_input_tokens_seen": 15574032, "step": 26835 }, { "epoch": 3.9976169198689306, "grad_norm": 0.9367046356201172, "learning_rate": 4.849609187537751e-05, "loss": 0.643, "num_input_tokens_seen": 15577072, "step": 26840 }, { "epoch": 3.99836163240989, "grad_norm": 0.9297598004341125, "learning_rate": 4.8494981658813895e-05, "loss": 0.7106, "num_input_tokens_seen": 15579824, "step": 26845 }, { "epoch": 3.999106344950849, "grad_norm": 0.8695504069328308, "learning_rate": 4.8493871045326455e-05, "loss": 0.6202, "num_input_tokens_seen": 15582480, "step": 26850 }, { "epoch": 3.9998510574918082, "grad_norm": 0.8493677377700806, "learning_rate": 4.849276003493394e-05, "loss": 0.6451, "num_input_tokens_seen": 15585744, "step": 26855 }, { "epoch": 4.0, "eval_loss": 0.6666773557662964, "eval_runtime": 46.9881, "eval_samples_per_second": 63.505, "eval_steps_per_second": 15.876, "num_input_tokens_seen": 15585872, "step": 26856 }, { "epoch": 4.000595770032767, "grad_norm": 0.9371023774147034, "learning_rate": 4.849164862765512e-05, "loss": 0.8104, "num_input_tokens_seen": 15588400, "step": 26860 }, { "epoch": 4.001340482573727, "grad_norm": 0.9367229342460632, "learning_rate": 4.8490536823508767e-05, "loss": 0.6407, "num_input_tokens_seen": 15591440, "step": 26865 }, { "epoch": 4.002085195114685, "grad_norm": 0.808464527130127, "learning_rate": 4.848942462251367e-05, "loss": 0.5742, "num_input_tokens_seen": 15594608, "step": 26870 }, { "epoch": 4.002829907655645, "grad_norm": 1.0182735919952393, "learning_rate": 4.848831202468862e-05, "loss": 0.6469, "num_input_tokens_seen": 15597488, "step": 26875 }, { "epoch": 4.003574620196604, "grad_norm": 1.3907482624053955, "learning_rate": 4.848719903005241e-05, "loss": 0.7309, "num_input_tokens_seen": 15600784, "step": 26880 }, { "epoch": 4.0043193327375635, "grad_norm": 1.0623252391815186, "learning_rate": 4.848608563862385e-05, "loss": 0.666, "num_input_tokens_seen": 15603664, "step": 26885 }, { "epoch": 4.005064045278522, "grad_norm": 0.8194507360458374, "learning_rate": 4.848497185042173e-05, "loss": 0.6856, "num_input_tokens_seen": 15606448, "step": 26890 }, { "epoch": 4.005808757819482, "grad_norm": 1.6505275964736938, "learning_rate": 4.84838576654649e-05, "loss": 0.7157, "num_input_tokens_seen": 15609232, "step": 26895 }, { "epoch": 4.006553470360441, "grad_norm": 0.8619809746742249, "learning_rate": 4.848274308377214e-05, "loss": 0.6448, "num_input_tokens_seen": 15612080, "step": 26900 }, { "epoch": 4.0072981829014, "grad_norm": 1.9927719831466675, "learning_rate": 4.8481628105362317e-05, "loss": 0.9693, "num_input_tokens_seen": 15614896, "step": 26905 }, { "epoch": 4.008042895442359, "grad_norm": 0.7409844994544983, "learning_rate": 4.848051273025425e-05, "loss": 0.7342, "num_input_tokens_seen": 15617584, "step": 26910 }, { "epoch": 4.008787607983319, "grad_norm": 1.296008825302124, "learning_rate": 4.8479396958466783e-05, "loss": 0.6126, "num_input_tokens_seen": 15620592, "step": 26915 }, { "epoch": 4.009532320524277, "grad_norm": 1.037335753440857, "learning_rate": 4.8478280790018765e-05, "loss": 0.6577, "num_input_tokens_seen": 15623344, "step": 26920 }, { "epoch": 4.010277033065237, "grad_norm": 1.0174481868743896, "learning_rate": 4.847716422492906e-05, "loss": 0.7391, "num_input_tokens_seen": 15626320, "step": 26925 }, { "epoch": 4.011021745606196, "grad_norm": 1.1573317050933838, "learning_rate": 4.847604726321652e-05, "loss": 0.5538, "num_input_tokens_seen": 15629136, "step": 26930 }, { "epoch": 4.0117664581471555, "grad_norm": 0.6563936471939087, "learning_rate": 4.847492990490003e-05, "loss": 0.5859, "num_input_tokens_seen": 15631952, "step": 26935 }, { "epoch": 4.012511170688114, "grad_norm": 1.018635869026184, "learning_rate": 4.847381214999845e-05, "loss": 0.6372, "num_input_tokens_seen": 15634896, "step": 26940 }, { "epoch": 4.013255883229074, "grad_norm": 1.143210530281067, "learning_rate": 4.847269399853068e-05, "loss": 0.7082, "num_input_tokens_seen": 15637712, "step": 26945 }, { "epoch": 4.014000595770033, "grad_norm": 0.7497193217277527, "learning_rate": 4.84715754505156e-05, "loss": 0.681, "num_input_tokens_seen": 15640720, "step": 26950 }, { "epoch": 4.014745308310992, "grad_norm": 0.7666639685630798, "learning_rate": 4.8470456505972105e-05, "loss": 0.6548, "num_input_tokens_seen": 15643696, "step": 26955 }, { "epoch": 4.015490020851951, "grad_norm": 0.5422437787055969, "learning_rate": 4.8469337164919105e-05, "loss": 0.5562, "num_input_tokens_seen": 15646896, "step": 26960 }, { "epoch": 4.016234733392911, "grad_norm": 1.023024320602417, "learning_rate": 4.84682174273755e-05, "loss": 0.7263, "num_input_tokens_seen": 15650064, "step": 26965 }, { "epoch": 4.0169794459338695, "grad_norm": 0.7439996004104614, "learning_rate": 4.846709729336022e-05, "loss": 0.4404, "num_input_tokens_seen": 15652880, "step": 26970 }, { "epoch": 4.017724158474829, "grad_norm": 1.2647364139556885, "learning_rate": 4.846597676289218e-05, "loss": 0.6349, "num_input_tokens_seen": 15655824, "step": 26975 }, { "epoch": 4.018468871015788, "grad_norm": 0.8134424090385437, "learning_rate": 4.846485583599031e-05, "loss": 0.6718, "num_input_tokens_seen": 15658384, "step": 26980 }, { "epoch": 4.0192135835567475, "grad_norm": 0.890448808670044, "learning_rate": 4.846373451267355e-05, "loss": 0.7026, "num_input_tokens_seen": 15661232, "step": 26985 }, { "epoch": 4.019958296097706, "grad_norm": 1.0906391143798828, "learning_rate": 4.846261279296085e-05, "loss": 0.634, "num_input_tokens_seen": 15664304, "step": 26990 }, { "epoch": 4.020703008638666, "grad_norm": 0.9524484276771545, "learning_rate": 4.8461490676871146e-05, "loss": 0.6013, "num_input_tokens_seen": 15667344, "step": 26995 }, { "epoch": 4.021447721179625, "grad_norm": 0.733365535736084, "learning_rate": 4.84603681644234e-05, "loss": 0.6057, "num_input_tokens_seen": 15670128, "step": 27000 }, { "epoch": 4.022192433720583, "grad_norm": 0.803011953830719, "learning_rate": 4.8459245255636585e-05, "loss": 0.7182, "num_input_tokens_seen": 15673328, "step": 27005 }, { "epoch": 4.022937146261543, "grad_norm": 0.9043377637863159, "learning_rate": 4.8458121950529654e-05, "loss": 0.6773, "num_input_tokens_seen": 15676304, "step": 27010 }, { "epoch": 4.023681858802502, "grad_norm": 0.7850614786148071, "learning_rate": 4.845699824912161e-05, "loss": 0.7965, "num_input_tokens_seen": 15679312, "step": 27015 }, { "epoch": 4.0244265713434615, "grad_norm": 1.0574181079864502, "learning_rate": 4.845587415143141e-05, "loss": 0.7032, "num_input_tokens_seen": 15682320, "step": 27020 }, { "epoch": 4.02517128388442, "grad_norm": 1.4130185842514038, "learning_rate": 4.845474965747806e-05, "loss": 0.5907, "num_input_tokens_seen": 15685328, "step": 27025 }, { "epoch": 4.02591599642538, "grad_norm": 1.022057294845581, "learning_rate": 4.8453624767280545e-05, "loss": 0.6697, "num_input_tokens_seen": 15688144, "step": 27030 }, { "epoch": 4.026660708966339, "grad_norm": 0.7605648636817932, "learning_rate": 4.845249948085789e-05, "loss": 0.5567, "num_input_tokens_seen": 15690928, "step": 27035 }, { "epoch": 4.027405421507298, "grad_norm": 3.4661550521850586, "learning_rate": 4.8451373798229085e-05, "loss": 0.7804, "num_input_tokens_seen": 15694032, "step": 27040 }, { "epoch": 4.028150134048257, "grad_norm": 0.861024022102356, "learning_rate": 4.845024771941316e-05, "loss": 0.486, "num_input_tokens_seen": 15697264, "step": 27045 }, { "epoch": 4.028894846589217, "grad_norm": 0.8091147541999817, "learning_rate": 4.844912124442912e-05, "loss": 0.5048, "num_input_tokens_seen": 15700368, "step": 27050 }, { "epoch": 4.0296395591301755, "grad_norm": 0.6987623572349548, "learning_rate": 4.844799437329602e-05, "loss": 0.6693, "num_input_tokens_seen": 15703536, "step": 27055 }, { "epoch": 4.030384271671135, "grad_norm": 2.4188904762268066, "learning_rate": 4.844686710603289e-05, "loss": 0.6461, "num_input_tokens_seen": 15706512, "step": 27060 }, { "epoch": 4.031128984212094, "grad_norm": 0.7954792380332947, "learning_rate": 4.844573944265876e-05, "loss": 0.6621, "num_input_tokens_seen": 15709232, "step": 27065 }, { "epoch": 4.0318736967530535, "grad_norm": 1.036854863166809, "learning_rate": 4.8444611383192695e-05, "loss": 0.7151, "num_input_tokens_seen": 15712400, "step": 27070 }, { "epoch": 4.032618409294012, "grad_norm": 0.9312264919281006, "learning_rate": 4.844348292765375e-05, "loss": 0.6978, "num_input_tokens_seen": 15715024, "step": 27075 }, { "epoch": 4.033363121834972, "grad_norm": 0.8979710340499878, "learning_rate": 4.844235407606099e-05, "loss": 0.5438, "num_input_tokens_seen": 15717584, "step": 27080 }, { "epoch": 4.034107834375931, "grad_norm": 1.4942171573638916, "learning_rate": 4.844122482843347e-05, "loss": 0.5834, "num_input_tokens_seen": 15720432, "step": 27085 }, { "epoch": 4.03485254691689, "grad_norm": 1.0436044931411743, "learning_rate": 4.8440095184790304e-05, "loss": 0.6454, "num_input_tokens_seen": 15723888, "step": 27090 }, { "epoch": 4.035597259457849, "grad_norm": 1.1319411993026733, "learning_rate": 4.843896514515054e-05, "loss": 0.6944, "num_input_tokens_seen": 15726800, "step": 27095 }, { "epoch": 4.036341971998809, "grad_norm": 1.087349772453308, "learning_rate": 4.843783470953328e-05, "loss": 0.5496, "num_input_tokens_seen": 15729744, "step": 27100 }, { "epoch": 4.0370866845397675, "grad_norm": 0.9506438970565796, "learning_rate": 4.843670387795763e-05, "loss": 0.5448, "num_input_tokens_seen": 15732944, "step": 27105 }, { "epoch": 4.037831397080727, "grad_norm": 0.8197279572486877, "learning_rate": 4.843557265044268e-05, "loss": 0.6977, "num_input_tokens_seen": 15736016, "step": 27110 }, { "epoch": 4.038576109621686, "grad_norm": 1.0712751150131226, "learning_rate": 4.843444102700756e-05, "loss": 0.7211, "num_input_tokens_seen": 15738928, "step": 27115 }, { "epoch": 4.0393208221626455, "grad_norm": 1.3652222156524658, "learning_rate": 4.843330900767137e-05, "loss": 0.7465, "num_input_tokens_seen": 15742032, "step": 27120 }, { "epoch": 4.040065534703604, "grad_norm": 0.863523542881012, "learning_rate": 4.843217659245324e-05, "loss": 0.5773, "num_input_tokens_seen": 15744880, "step": 27125 }, { "epoch": 4.040810247244564, "grad_norm": 0.7167543768882751, "learning_rate": 4.843104378137231e-05, "loss": 0.6552, "num_input_tokens_seen": 15747856, "step": 27130 }, { "epoch": 4.041554959785523, "grad_norm": 1.3598235845565796, "learning_rate": 4.84299105744477e-05, "loss": 0.7629, "num_input_tokens_seen": 15750832, "step": 27135 }, { "epoch": 4.042299672326482, "grad_norm": 0.7200037240982056, "learning_rate": 4.8428776971698566e-05, "loss": 0.521, "num_input_tokens_seen": 15753552, "step": 27140 }, { "epoch": 4.043044384867441, "grad_norm": 1.2798410654067993, "learning_rate": 4.842764297314406e-05, "loss": 0.8183, "num_input_tokens_seen": 15756496, "step": 27145 }, { "epoch": 4.043789097408401, "grad_norm": 1.286539912223816, "learning_rate": 4.842650857880333e-05, "loss": 0.8968, "num_input_tokens_seen": 15759472, "step": 27150 }, { "epoch": 4.0445338099493595, "grad_norm": 0.9990066885948181, "learning_rate": 4.842537378869556e-05, "loss": 0.8662, "num_input_tokens_seen": 15762480, "step": 27155 }, { "epoch": 4.045278522490319, "grad_norm": 0.8071956634521484, "learning_rate": 4.84242386028399e-05, "loss": 0.5828, "num_input_tokens_seen": 15765232, "step": 27160 }, { "epoch": 4.046023235031278, "grad_norm": 1.2761963605880737, "learning_rate": 4.8423103021255535e-05, "loss": 0.7695, "num_input_tokens_seen": 15768176, "step": 27165 }, { "epoch": 4.046767947572237, "grad_norm": 0.7526030540466309, "learning_rate": 4.842196704396165e-05, "loss": 0.592, "num_input_tokens_seen": 15770768, "step": 27170 }, { "epoch": 4.047512660113196, "grad_norm": 1.9576547145843506, "learning_rate": 4.842083067097744e-05, "loss": 0.8117, "num_input_tokens_seen": 15773424, "step": 27175 }, { "epoch": 4.048257372654155, "grad_norm": 1.2116402387619019, "learning_rate": 4.84196939023221e-05, "loss": 0.7191, "num_input_tokens_seen": 15776016, "step": 27180 }, { "epoch": 4.049002085195115, "grad_norm": 1.1611733436584473, "learning_rate": 4.841855673801483e-05, "loss": 0.7162, "num_input_tokens_seen": 15779088, "step": 27185 }, { "epoch": 4.0497467977360735, "grad_norm": 1.1780844926834106, "learning_rate": 4.8417419178074854e-05, "loss": 0.6523, "num_input_tokens_seen": 15781936, "step": 27190 }, { "epoch": 4.050491510277033, "grad_norm": 0.8201855421066284, "learning_rate": 4.841628122252138e-05, "loss": 0.5761, "num_input_tokens_seen": 15785392, "step": 27195 }, { "epoch": 4.051236222817992, "grad_norm": 1.2026792764663696, "learning_rate": 4.841514287137362e-05, "loss": 0.6669, "num_input_tokens_seen": 15788144, "step": 27200 }, { "epoch": 4.0519809353589515, "grad_norm": 0.6180292963981628, "learning_rate": 4.841400412465083e-05, "loss": 0.7223, "num_input_tokens_seen": 15790864, "step": 27205 }, { "epoch": 4.05272564789991, "grad_norm": 0.8593971729278564, "learning_rate": 4.8412864982372244e-05, "loss": 0.5859, "num_input_tokens_seen": 15794256, "step": 27210 }, { "epoch": 4.05347036044087, "grad_norm": 1.0728741884231567, "learning_rate": 4.841172544455709e-05, "loss": 0.6642, "num_input_tokens_seen": 15797424, "step": 27215 }, { "epoch": 4.054215072981829, "grad_norm": 2.5907819271087646, "learning_rate": 4.841058551122463e-05, "loss": 0.6408, "num_input_tokens_seen": 15800112, "step": 27220 }, { "epoch": 4.054959785522788, "grad_norm": 0.8067765831947327, "learning_rate": 4.840944518239412e-05, "loss": 0.7805, "num_input_tokens_seen": 15803248, "step": 27225 }, { "epoch": 4.055704498063747, "grad_norm": 1.7075632810592651, "learning_rate": 4.840830445808483e-05, "loss": 0.6686, "num_input_tokens_seen": 15806224, "step": 27230 }, { "epoch": 4.056449210604707, "grad_norm": 1.1526844501495361, "learning_rate": 4.840716333831602e-05, "loss": 0.692, "num_input_tokens_seen": 15809072, "step": 27235 }, { "epoch": 4.0571939231456655, "grad_norm": 1.046203851699829, "learning_rate": 4.8406021823106985e-05, "loss": 0.738, "num_input_tokens_seen": 15811920, "step": 27240 }, { "epoch": 4.057938635686625, "grad_norm": 0.9247602224349976, "learning_rate": 4.8404879912477e-05, "loss": 0.7838, "num_input_tokens_seen": 15814608, "step": 27245 }, { "epoch": 4.058683348227584, "grad_norm": 0.5286707878112793, "learning_rate": 4.8403737606445355e-05, "loss": 0.5631, "num_input_tokens_seen": 15817392, "step": 27250 }, { "epoch": 4.059428060768544, "grad_norm": 0.490643173456192, "learning_rate": 4.8402594905031346e-05, "loss": 0.6099, "num_input_tokens_seen": 15820240, "step": 27255 }, { "epoch": 4.060172773309502, "grad_norm": 0.7935872077941895, "learning_rate": 4.840145180825428e-05, "loss": 0.7262, "num_input_tokens_seen": 15823024, "step": 27260 }, { "epoch": 4.060917485850462, "grad_norm": 1.1469217538833618, "learning_rate": 4.840030831613347e-05, "loss": 0.8105, "num_input_tokens_seen": 15825936, "step": 27265 }, { "epoch": 4.061662198391421, "grad_norm": 1.4982482194900513, "learning_rate": 4.8399164428688244e-05, "loss": 0.7032, "num_input_tokens_seen": 15828752, "step": 27270 }, { "epoch": 4.06240691093238, "grad_norm": 0.9451113939285278, "learning_rate": 4.83980201459379e-05, "loss": 0.5407, "num_input_tokens_seen": 15831568, "step": 27275 }, { "epoch": 4.063151623473339, "grad_norm": 0.8135693073272705, "learning_rate": 4.83968754679018e-05, "loss": 0.6631, "num_input_tokens_seen": 15834480, "step": 27280 }, { "epoch": 4.063896336014299, "grad_norm": 0.836391806602478, "learning_rate": 4.839573039459927e-05, "loss": 0.6226, "num_input_tokens_seen": 15837360, "step": 27285 }, { "epoch": 4.0646410485552575, "grad_norm": 0.8296234607696533, "learning_rate": 4.8394584926049644e-05, "loss": 0.5924, "num_input_tokens_seen": 15840720, "step": 27290 }, { "epoch": 4.065385761096217, "grad_norm": 0.8508095741271973, "learning_rate": 4.839343906227229e-05, "loss": 0.5741, "num_input_tokens_seen": 15843568, "step": 27295 }, { "epoch": 4.066130473637176, "grad_norm": 0.8620295524597168, "learning_rate": 4.8392292803286554e-05, "loss": 0.6804, "num_input_tokens_seen": 15846352, "step": 27300 }, { "epoch": 4.066875186178136, "grad_norm": 0.6451012492179871, "learning_rate": 4.839114614911181e-05, "loss": 0.7048, "num_input_tokens_seen": 15849200, "step": 27305 }, { "epoch": 4.067619898719094, "grad_norm": 0.9833365678787231, "learning_rate": 4.838999909976742e-05, "loss": 0.6059, "num_input_tokens_seen": 15852304, "step": 27310 }, { "epoch": 4.068364611260054, "grad_norm": 0.8523443341255188, "learning_rate": 4.838885165527277e-05, "loss": 0.6134, "num_input_tokens_seen": 15855088, "step": 27315 }, { "epoch": 4.069109323801013, "grad_norm": 0.7970603704452515, "learning_rate": 4.8387703815647245e-05, "loss": 0.7876, "num_input_tokens_seen": 15858288, "step": 27320 }, { "epoch": 4.069854036341972, "grad_norm": 0.7341392040252686, "learning_rate": 4.838655558091024e-05, "loss": 0.7173, "num_input_tokens_seen": 15861264, "step": 27325 }, { "epoch": 4.070598748882931, "grad_norm": 0.9382545351982117, "learning_rate": 4.8385406951081135e-05, "loss": 0.6482, "num_input_tokens_seen": 15863888, "step": 27330 }, { "epoch": 4.071343461423891, "grad_norm": 0.9247066378593445, "learning_rate": 4.838425792617935e-05, "loss": 0.6119, "num_input_tokens_seen": 15866576, "step": 27335 }, { "epoch": 4.07208817396485, "grad_norm": 1.965491771697998, "learning_rate": 4.8383108506224304e-05, "loss": 0.6849, "num_input_tokens_seen": 15869680, "step": 27340 }, { "epoch": 4.072832886505808, "grad_norm": 1.2569605112075806, "learning_rate": 4.8381958691235396e-05, "loss": 0.6487, "num_input_tokens_seen": 15872656, "step": 27345 }, { "epoch": 4.073577599046768, "grad_norm": 0.9390179514884949, "learning_rate": 4.838080848123206e-05, "loss": 0.6704, "num_input_tokens_seen": 15875600, "step": 27350 }, { "epoch": 4.074322311587727, "grad_norm": 1.1461271047592163, "learning_rate": 4.837965787623373e-05, "loss": 0.6261, "num_input_tokens_seen": 15878416, "step": 27355 }, { "epoch": 4.075067024128686, "grad_norm": 0.8763047456741333, "learning_rate": 4.837850687625985e-05, "loss": 0.5961, "num_input_tokens_seen": 15881328, "step": 27360 }, { "epoch": 4.075811736669645, "grad_norm": 0.7859175801277161, "learning_rate": 4.8377355481329846e-05, "loss": 0.5413, "num_input_tokens_seen": 15884368, "step": 27365 }, { "epoch": 4.076556449210605, "grad_norm": 0.878592312335968, "learning_rate": 4.8376203691463184e-05, "loss": 0.6303, "num_input_tokens_seen": 15887344, "step": 27370 }, { "epoch": 4.0773011617515635, "grad_norm": 0.8985881805419922, "learning_rate": 4.837505150667932e-05, "loss": 0.5611, "num_input_tokens_seen": 15890096, "step": 27375 }, { "epoch": 4.078045874292523, "grad_norm": 1.1400120258331299, "learning_rate": 4.837389892699772e-05, "loss": 0.5783, "num_input_tokens_seen": 15892912, "step": 27380 }, { "epoch": 4.078790586833482, "grad_norm": 0.8860772252082825, "learning_rate": 4.837274595243785e-05, "loss": 0.5857, "num_input_tokens_seen": 15895568, "step": 27385 }, { "epoch": 4.079535299374442, "grad_norm": 0.8138963580131531, "learning_rate": 4.8371592583019196e-05, "loss": 0.7173, "num_input_tokens_seen": 15898160, "step": 27390 }, { "epoch": 4.0802800119154, "grad_norm": 3.3792970180511475, "learning_rate": 4.8370438818761235e-05, "loss": 0.6331, "num_input_tokens_seen": 15901296, "step": 27395 }, { "epoch": 4.08102472445636, "grad_norm": 0.6646422743797302, "learning_rate": 4.836928465968347e-05, "loss": 0.5346, "num_input_tokens_seen": 15903952, "step": 27400 }, { "epoch": 4.081769436997319, "grad_norm": 0.6969845294952393, "learning_rate": 4.836813010580538e-05, "loss": 0.5467, "num_input_tokens_seen": 15907024, "step": 27405 }, { "epoch": 4.082514149538278, "grad_norm": 0.5085529088973999, "learning_rate": 4.836697515714649e-05, "loss": 0.6167, "num_input_tokens_seen": 15909744, "step": 27410 }, { "epoch": 4.083258862079237, "grad_norm": 1.2827610969543457, "learning_rate": 4.8365819813726306e-05, "loss": 0.6369, "num_input_tokens_seen": 15912688, "step": 27415 }, { "epoch": 4.084003574620197, "grad_norm": 1.3843644857406616, "learning_rate": 4.8364664075564334e-05, "loss": 0.5872, "num_input_tokens_seen": 15915856, "step": 27420 }, { "epoch": 4.084748287161156, "grad_norm": 0.5134528279304504, "learning_rate": 4.836350794268012e-05, "loss": 0.7303, "num_input_tokens_seen": 15918864, "step": 27425 }, { "epoch": 4.085492999702115, "grad_norm": 0.9186603426933289, "learning_rate": 4.836235141509318e-05, "loss": 0.4936, "num_input_tokens_seen": 15921552, "step": 27430 }, { "epoch": 4.086237712243074, "grad_norm": 1.6424505710601807, "learning_rate": 4.836119449282306e-05, "loss": 0.8323, "num_input_tokens_seen": 15925008, "step": 27435 }, { "epoch": 4.086982424784034, "grad_norm": 0.8816686868667603, "learning_rate": 4.8360037175889304e-05, "loss": 0.6778, "num_input_tokens_seen": 15927696, "step": 27440 }, { "epoch": 4.087727137324992, "grad_norm": 1.594543695449829, "learning_rate": 4.8358879464311455e-05, "loss": 0.7068, "num_input_tokens_seen": 15930576, "step": 27445 }, { "epoch": 4.088471849865952, "grad_norm": 1.0705597400665283, "learning_rate": 4.835772135810909e-05, "loss": 0.6676, "num_input_tokens_seen": 15933808, "step": 27450 }, { "epoch": 4.089216562406911, "grad_norm": 1.098403811454773, "learning_rate": 4.8356562857301744e-05, "loss": 0.6715, "num_input_tokens_seen": 15936624, "step": 27455 }, { "epoch": 4.08996127494787, "grad_norm": 0.7472844123840332, "learning_rate": 4.835540396190902e-05, "loss": 0.8207, "num_input_tokens_seen": 15939344, "step": 27460 }, { "epoch": 4.090705987488829, "grad_norm": 0.7685825228691101, "learning_rate": 4.835424467195049e-05, "loss": 0.5994, "num_input_tokens_seen": 15941968, "step": 27465 }, { "epoch": 4.091450700029789, "grad_norm": 0.956460177898407, "learning_rate": 4.835308498744572e-05, "loss": 0.7277, "num_input_tokens_seen": 15944720, "step": 27470 }, { "epoch": 4.092195412570748, "grad_norm": 1.8917701244354248, "learning_rate": 4.8351924908414314e-05, "loss": 0.7479, "num_input_tokens_seen": 15947600, "step": 27475 }, { "epoch": 4.092940125111707, "grad_norm": 0.9161091446876526, "learning_rate": 4.835076443487587e-05, "loss": 0.6353, "num_input_tokens_seen": 15950416, "step": 27480 }, { "epoch": 4.093684837652666, "grad_norm": 1.0644599199295044, "learning_rate": 4.8349603566850003e-05, "loss": 0.5585, "num_input_tokens_seen": 15953168, "step": 27485 }, { "epoch": 4.094429550193626, "grad_norm": 1.154706597328186, "learning_rate": 4.834844230435631e-05, "loss": 0.6587, "num_input_tokens_seen": 15956112, "step": 27490 }, { "epoch": 4.095174262734584, "grad_norm": 1.0254768133163452, "learning_rate": 4.8347280647414416e-05, "loss": 0.5974, "num_input_tokens_seen": 15958928, "step": 27495 }, { "epoch": 4.095918975275544, "grad_norm": 1.2323508262634277, "learning_rate": 4.834611859604394e-05, "loss": 0.5823, "num_input_tokens_seen": 15961808, "step": 27500 }, { "epoch": 4.096663687816503, "grad_norm": 1.564630150794983, "learning_rate": 4.8344956150264524e-05, "loss": 0.8048, "num_input_tokens_seen": 15964624, "step": 27505 }, { "epoch": 4.0974084003574625, "grad_norm": 1.0282189846038818, "learning_rate": 4.83437933100958e-05, "loss": 0.756, "num_input_tokens_seen": 15967632, "step": 27510 }, { "epoch": 4.098153112898421, "grad_norm": 1.2868778705596924, "learning_rate": 4.834263007555741e-05, "loss": 0.8314, "num_input_tokens_seen": 15970352, "step": 27515 }, { "epoch": 4.09889782543938, "grad_norm": 1.0742807388305664, "learning_rate": 4.834146644666901e-05, "loss": 0.6247, "num_input_tokens_seen": 15973552, "step": 27520 }, { "epoch": 4.09964253798034, "grad_norm": 0.6050429344177246, "learning_rate": 4.834030242345026e-05, "loss": 0.599, "num_input_tokens_seen": 15976560, "step": 27525 }, { "epoch": 4.100387250521298, "grad_norm": 0.7088290452957153, "learning_rate": 4.8339138005920825e-05, "loss": 0.5043, "num_input_tokens_seen": 15979568, "step": 27530 }, { "epoch": 4.101131963062258, "grad_norm": 1.1665058135986328, "learning_rate": 4.833797319410037e-05, "loss": 0.5843, "num_input_tokens_seen": 15982512, "step": 27535 }, { "epoch": 4.101876675603217, "grad_norm": 1.0300328731536865, "learning_rate": 4.833680798800858e-05, "loss": 0.5428, "num_input_tokens_seen": 15985520, "step": 27540 }, { "epoch": 4.102621388144176, "grad_norm": 0.7588224411010742, "learning_rate": 4.833564238766513e-05, "loss": 0.6563, "num_input_tokens_seen": 15988112, "step": 27545 }, { "epoch": 4.103366100685135, "grad_norm": 1.3291460275650024, "learning_rate": 4.8334476393089726e-05, "loss": 0.832, "num_input_tokens_seen": 15990928, "step": 27550 }, { "epoch": 4.104110813226095, "grad_norm": 0.7791149616241455, "learning_rate": 4.8333310004302054e-05, "loss": 0.5594, "num_input_tokens_seen": 15993520, "step": 27555 }, { "epoch": 4.104855525767054, "grad_norm": 1.7657183408737183, "learning_rate": 4.833214322132183e-05, "loss": 0.955, "num_input_tokens_seen": 15996336, "step": 27560 }, { "epoch": 4.105600238308013, "grad_norm": 0.6254163980484009, "learning_rate": 4.8330976044168766e-05, "loss": 0.6093, "num_input_tokens_seen": 15999376, "step": 27565 }, { "epoch": 4.106344950848972, "grad_norm": 0.9873319268226624, "learning_rate": 4.832980847286256e-05, "loss": 0.6849, "num_input_tokens_seen": 16002736, "step": 27570 }, { "epoch": 4.107089663389932, "grad_norm": 0.86082923412323, "learning_rate": 4.832864050742296e-05, "loss": 0.5702, "num_input_tokens_seen": 16005584, "step": 27575 }, { "epoch": 4.10783437593089, "grad_norm": 0.6320050358772278, "learning_rate": 4.8327472147869684e-05, "loss": 0.6772, "num_input_tokens_seen": 16008176, "step": 27580 }, { "epoch": 4.10857908847185, "grad_norm": 0.8096228837966919, "learning_rate": 4.8326303394222476e-05, "loss": 0.617, "num_input_tokens_seen": 16011216, "step": 27585 }, { "epoch": 4.109323801012809, "grad_norm": 0.9399362802505493, "learning_rate": 4.832513424650108e-05, "loss": 0.7202, "num_input_tokens_seen": 16014160, "step": 27590 }, { "epoch": 4.1100685135537685, "grad_norm": 1.223035216331482, "learning_rate": 4.8323964704725254e-05, "loss": 0.5682, "num_input_tokens_seen": 16016944, "step": 27595 }, { "epoch": 4.110813226094727, "grad_norm": 0.8794853687286377, "learning_rate": 4.8322794768914745e-05, "loss": 0.5202, "num_input_tokens_seen": 16019824, "step": 27600 }, { "epoch": 4.111557938635687, "grad_norm": 1.1228973865509033, "learning_rate": 4.832162443908932e-05, "loss": 0.708, "num_input_tokens_seen": 16022736, "step": 27605 }, { "epoch": 4.112302651176646, "grad_norm": 1.130431890487671, "learning_rate": 4.832045371526876e-05, "loss": 0.7604, "num_input_tokens_seen": 16025904, "step": 27610 }, { "epoch": 4.113047363717605, "grad_norm": 1.0101312398910522, "learning_rate": 4.8319282597472823e-05, "loss": 0.6346, "num_input_tokens_seen": 16029168, "step": 27615 }, { "epoch": 4.113792076258564, "grad_norm": 0.891486644744873, "learning_rate": 4.8318111085721324e-05, "loss": 0.6843, "num_input_tokens_seen": 16031920, "step": 27620 }, { "epoch": 4.114536788799524, "grad_norm": 0.9866892695426941, "learning_rate": 4.8316939180034025e-05, "loss": 0.5848, "num_input_tokens_seen": 16034864, "step": 27625 }, { "epoch": 4.115281501340482, "grad_norm": 1.3185337781906128, "learning_rate": 4.831576688043075e-05, "loss": 0.6544, "num_input_tokens_seen": 16037584, "step": 27630 }, { "epoch": 4.116026213881442, "grad_norm": 0.9813870191574097, "learning_rate": 4.831459418693128e-05, "loss": 0.7903, "num_input_tokens_seen": 16040496, "step": 27635 }, { "epoch": 4.116770926422401, "grad_norm": 1.6590765714645386, "learning_rate": 4.8313421099555436e-05, "loss": 0.6252, "num_input_tokens_seen": 16043280, "step": 27640 }, { "epoch": 4.1175156389633605, "grad_norm": 1.103158712387085, "learning_rate": 4.831224761832304e-05, "loss": 0.5797, "num_input_tokens_seen": 16046160, "step": 27645 }, { "epoch": 4.118260351504319, "grad_norm": 0.8410094380378723, "learning_rate": 4.831107374325391e-05, "loss": 0.5962, "num_input_tokens_seen": 16049040, "step": 27650 }, { "epoch": 4.119005064045279, "grad_norm": 0.8526987433433533, "learning_rate": 4.8309899474367894e-05, "loss": 0.5959, "num_input_tokens_seen": 16051696, "step": 27655 }, { "epoch": 4.119749776586238, "grad_norm": 0.8750815987586975, "learning_rate": 4.8308724811684805e-05, "loss": 0.5767, "num_input_tokens_seen": 16054832, "step": 27660 }, { "epoch": 4.120494489127197, "grad_norm": 0.8035849332809448, "learning_rate": 4.830754975522451e-05, "loss": 0.6273, "num_input_tokens_seen": 16057808, "step": 27665 }, { "epoch": 4.121239201668156, "grad_norm": 1.6077712774276733, "learning_rate": 4.830637430500684e-05, "loss": 0.746, "num_input_tokens_seen": 16060656, "step": 27670 }, { "epoch": 4.121983914209116, "grad_norm": 0.9055505394935608, "learning_rate": 4.830519846105167e-05, "loss": 0.6787, "num_input_tokens_seen": 16063408, "step": 27675 }, { "epoch": 4.1227286267500745, "grad_norm": 1.2490763664245605, "learning_rate": 4.830402222337886e-05, "loss": 0.4689, "num_input_tokens_seen": 16066288, "step": 27680 }, { "epoch": 4.123473339291033, "grad_norm": 1.198034644126892, "learning_rate": 4.830284559200828e-05, "loss": 0.66, "num_input_tokens_seen": 16068880, "step": 27685 }, { "epoch": 4.124218051831993, "grad_norm": 0.7517163157463074, "learning_rate": 4.83016685669598e-05, "loss": 0.7295, "num_input_tokens_seen": 16071824, "step": 27690 }, { "epoch": 4.124962764372952, "grad_norm": 0.7994322180747986, "learning_rate": 4.8300491148253315e-05, "loss": 0.5593, "num_input_tokens_seen": 16074576, "step": 27695 }, { "epoch": 4.125707476913911, "grad_norm": 1.624310851097107, "learning_rate": 4.829931333590872e-05, "loss": 0.6938, "num_input_tokens_seen": 16077264, "step": 27700 }, { "epoch": 4.12645218945487, "grad_norm": 1.181935429573059, "learning_rate": 4.82981351299459e-05, "loss": 0.6686, "num_input_tokens_seen": 16079984, "step": 27705 }, { "epoch": 4.12719690199583, "grad_norm": 1.0351067781448364, "learning_rate": 4.829695653038477e-05, "loss": 0.5616, "num_input_tokens_seen": 16082704, "step": 27710 }, { "epoch": 4.127941614536788, "grad_norm": 1.1542280912399292, "learning_rate": 4.829577753724523e-05, "loss": 0.7219, "num_input_tokens_seen": 16085296, "step": 27715 }, { "epoch": 4.128686327077748, "grad_norm": 1.0438785552978516, "learning_rate": 4.829459815054722e-05, "loss": 0.6173, "num_input_tokens_seen": 16088080, "step": 27720 }, { "epoch": 4.129431039618707, "grad_norm": 0.924808144569397, "learning_rate": 4.829341837031064e-05, "loss": 0.5708, "num_input_tokens_seen": 16090960, "step": 27725 }, { "epoch": 4.1301757521596665, "grad_norm": 0.9535031318664551, "learning_rate": 4.829223819655543e-05, "loss": 0.8448, "num_input_tokens_seen": 16093648, "step": 27730 }, { "epoch": 4.130920464700625, "grad_norm": 1.161912202835083, "learning_rate": 4.829105762930153e-05, "loss": 0.6869, "num_input_tokens_seen": 16096528, "step": 27735 }, { "epoch": 4.131665177241585, "grad_norm": 1.1240835189819336, "learning_rate": 4.8289876668568886e-05, "loss": 0.6571, "num_input_tokens_seen": 16099536, "step": 27740 }, { "epoch": 4.132409889782544, "grad_norm": 0.9458321928977966, "learning_rate": 4.828869531437744e-05, "loss": 0.6688, "num_input_tokens_seen": 16102480, "step": 27745 }, { "epoch": 4.133154602323503, "grad_norm": 1.3078207969665527, "learning_rate": 4.828751356674717e-05, "loss": 0.6045, "num_input_tokens_seen": 16105360, "step": 27750 }, { "epoch": 4.133899314864462, "grad_norm": 1.5580943822860718, "learning_rate": 4.8286331425698014e-05, "loss": 0.5832, "num_input_tokens_seen": 16108336, "step": 27755 }, { "epoch": 4.134644027405422, "grad_norm": 0.7188640832901001, "learning_rate": 4.828514889124995e-05, "loss": 0.5032, "num_input_tokens_seen": 16111056, "step": 27760 }, { "epoch": 4.1353887399463805, "grad_norm": 2.5828938484191895, "learning_rate": 4.828396596342298e-05, "loss": 0.5257, "num_input_tokens_seen": 16113776, "step": 27765 }, { "epoch": 4.13613345248734, "grad_norm": 1.152489185333252, "learning_rate": 4.828278264223706e-05, "loss": 0.6111, "num_input_tokens_seen": 16116464, "step": 27770 }, { "epoch": 4.136878165028299, "grad_norm": 1.3879121541976929, "learning_rate": 4.828159892771219e-05, "loss": 0.4654, "num_input_tokens_seen": 16119472, "step": 27775 }, { "epoch": 4.1376228775692585, "grad_norm": 0.9896901845932007, "learning_rate": 4.828041481986837e-05, "loss": 0.6447, "num_input_tokens_seen": 16122480, "step": 27780 }, { "epoch": 4.138367590110217, "grad_norm": 1.535257339477539, "learning_rate": 4.82792303187256e-05, "loss": 0.6659, "num_input_tokens_seen": 16125296, "step": 27785 }, { "epoch": 4.139112302651177, "grad_norm": 0.6648650765419006, "learning_rate": 4.82780454243039e-05, "loss": 0.7109, "num_input_tokens_seen": 16128240, "step": 27790 }, { "epoch": 4.139857015192136, "grad_norm": 0.7822220325469971, "learning_rate": 4.827686013662327e-05, "loss": 0.7545, "num_input_tokens_seen": 16131440, "step": 27795 }, { "epoch": 4.140601727733095, "grad_norm": 1.5826865434646606, "learning_rate": 4.827567445570376e-05, "loss": 0.6495, "num_input_tokens_seen": 16134224, "step": 27800 }, { "epoch": 4.141346440274054, "grad_norm": 1.7346042394638062, "learning_rate": 4.827448838156537e-05, "loss": 0.6919, "num_input_tokens_seen": 16136944, "step": 27805 }, { "epoch": 4.142091152815014, "grad_norm": 1.8263580799102783, "learning_rate": 4.827330191422817e-05, "loss": 0.7809, "num_input_tokens_seen": 16139792, "step": 27810 }, { "epoch": 4.1428358653559725, "grad_norm": 0.8087747693061829, "learning_rate": 4.8272115053712185e-05, "loss": 0.6047, "num_input_tokens_seen": 16142608, "step": 27815 }, { "epoch": 4.143580577896932, "grad_norm": 2.5529937744140625, "learning_rate": 4.8270927800037465e-05, "loss": 0.8329, "num_input_tokens_seen": 16145520, "step": 27820 }, { "epoch": 4.144325290437891, "grad_norm": 0.8586840629577637, "learning_rate": 4.826974015322407e-05, "loss": 0.6947, "num_input_tokens_seen": 16148304, "step": 27825 }, { "epoch": 4.1450700029788505, "grad_norm": 1.1608248949050903, "learning_rate": 4.826855211329206e-05, "loss": 0.7241, "num_input_tokens_seen": 16151184, "step": 27830 }, { "epoch": 4.145814715519809, "grad_norm": 1.1040514707565308, "learning_rate": 4.826736368026152e-05, "loss": 0.4612, "num_input_tokens_seen": 16154224, "step": 27835 }, { "epoch": 4.146559428060769, "grad_norm": 1.057079553604126, "learning_rate": 4.826617485415252e-05, "loss": 0.844, "num_input_tokens_seen": 16156848, "step": 27840 }, { "epoch": 4.147304140601728, "grad_norm": 1.1207704544067383, "learning_rate": 4.826498563498514e-05, "loss": 0.7735, "num_input_tokens_seen": 16159760, "step": 27845 }, { "epoch": 4.148048853142687, "grad_norm": 0.9393161535263062, "learning_rate": 4.826379602277947e-05, "loss": 0.6386, "num_input_tokens_seen": 16162672, "step": 27850 }, { "epoch": 4.148793565683646, "grad_norm": 0.7619211673736572, "learning_rate": 4.8262606017555616e-05, "loss": 0.6104, "num_input_tokens_seen": 16165584, "step": 27855 }, { "epoch": 4.149538278224606, "grad_norm": 1.4644923210144043, "learning_rate": 4.826141561933367e-05, "loss": 0.8017, "num_input_tokens_seen": 16168624, "step": 27860 }, { "epoch": 4.1502829907655645, "grad_norm": 1.234791874885559, "learning_rate": 4.826022482813376e-05, "loss": 0.564, "num_input_tokens_seen": 16171440, "step": 27865 }, { "epoch": 4.151027703306523, "grad_norm": 0.8771460056304932, "learning_rate": 4.825903364397598e-05, "loss": 0.6065, "num_input_tokens_seen": 16174416, "step": 27870 }, { "epoch": 4.151772415847483, "grad_norm": 1.161909818649292, "learning_rate": 4.8257842066880474e-05, "loss": 0.5858, "num_input_tokens_seen": 16177328, "step": 27875 }, { "epoch": 4.152517128388442, "grad_norm": 0.7246087193489075, "learning_rate": 4.8256650096867364e-05, "loss": 0.6438, "num_input_tokens_seen": 16180208, "step": 27880 }, { "epoch": 4.153261840929401, "grad_norm": 0.6550514698028564, "learning_rate": 4.8255457733956774e-05, "loss": 0.7151, "num_input_tokens_seen": 16183440, "step": 27885 }, { "epoch": 4.15400655347036, "grad_norm": 0.6444531679153442, "learning_rate": 4.825426497816888e-05, "loss": 0.8565, "num_input_tokens_seen": 16186448, "step": 27890 }, { "epoch": 4.15475126601132, "grad_norm": 0.864232063293457, "learning_rate": 4.82530718295238e-05, "loss": 0.5773, "num_input_tokens_seen": 16189456, "step": 27895 }, { "epoch": 4.1554959785522785, "grad_norm": 0.7934386730194092, "learning_rate": 4.825187828804171e-05, "loss": 0.8463, "num_input_tokens_seen": 16192496, "step": 27900 }, { "epoch": 4.156240691093238, "grad_norm": 1.0994397401809692, "learning_rate": 4.825068435374277e-05, "loss": 0.6259, "num_input_tokens_seen": 16195568, "step": 27905 }, { "epoch": 4.156985403634197, "grad_norm": 1.05289888381958, "learning_rate": 4.824949002664715e-05, "loss": 0.7174, "num_input_tokens_seen": 16198384, "step": 27910 }, { "epoch": 4.1577301161751565, "grad_norm": 0.8535751700401306, "learning_rate": 4.824829530677503e-05, "loss": 0.6523, "num_input_tokens_seen": 16201072, "step": 27915 }, { "epoch": 4.158474828716115, "grad_norm": 1.8778290748596191, "learning_rate": 4.824710019414658e-05, "loss": 0.7945, "num_input_tokens_seen": 16203920, "step": 27920 }, { "epoch": 4.159219541257075, "grad_norm": 1.1222161054611206, "learning_rate": 4.8245904688781994e-05, "loss": 0.7435, "num_input_tokens_seen": 16206640, "step": 27925 }, { "epoch": 4.159964253798034, "grad_norm": 1.0215427875518799, "learning_rate": 4.8244708790701486e-05, "loss": 0.3991, "num_input_tokens_seen": 16209328, "step": 27930 }, { "epoch": 4.160708966338993, "grad_norm": 0.9090081453323364, "learning_rate": 4.824351249992525e-05, "loss": 0.8366, "num_input_tokens_seen": 16211984, "step": 27935 }, { "epoch": 4.161453678879952, "grad_norm": 1.6215258836746216, "learning_rate": 4.824231581647348e-05, "loss": 0.731, "num_input_tokens_seen": 16214960, "step": 27940 }, { "epoch": 4.162198391420912, "grad_norm": 0.9042147994041443, "learning_rate": 4.824111874036642e-05, "loss": 0.8063, "num_input_tokens_seen": 16217904, "step": 27945 }, { "epoch": 4.1629431039618705, "grad_norm": 0.7339760661125183, "learning_rate": 4.823992127162428e-05, "loss": 0.6255, "num_input_tokens_seen": 16220976, "step": 27950 }, { "epoch": 4.16368781650283, "grad_norm": 1.005082368850708, "learning_rate": 4.8238723410267285e-05, "loss": 0.5495, "num_input_tokens_seen": 16223760, "step": 27955 }, { "epoch": 4.164432529043789, "grad_norm": 1.048520803451538, "learning_rate": 4.823752515631568e-05, "loss": 0.5666, "num_input_tokens_seen": 16226672, "step": 27960 }, { "epoch": 4.165177241584749, "grad_norm": 0.9849333763122559, "learning_rate": 4.8236326509789695e-05, "loss": 0.7009, "num_input_tokens_seen": 16229776, "step": 27965 }, { "epoch": 4.165921954125707, "grad_norm": 1.2840139865875244, "learning_rate": 4.8235127470709594e-05, "loss": 0.6521, "num_input_tokens_seen": 16232336, "step": 27970 }, { "epoch": 4.166666666666667, "grad_norm": 1.043357014656067, "learning_rate": 4.8233928039095635e-05, "loss": 0.5605, "num_input_tokens_seen": 16235472, "step": 27975 }, { "epoch": 4.167411379207626, "grad_norm": 1.2342511415481567, "learning_rate": 4.823272821496808e-05, "loss": 0.5835, "num_input_tokens_seen": 16238448, "step": 27980 }, { "epoch": 4.168156091748585, "grad_norm": 2.09539794921875, "learning_rate": 4.823152799834718e-05, "loss": 0.6472, "num_input_tokens_seen": 16241296, "step": 27985 }, { "epoch": 4.168900804289544, "grad_norm": 0.9936110973358154, "learning_rate": 4.823032738925324e-05, "loss": 0.715, "num_input_tokens_seen": 16244592, "step": 27990 }, { "epoch": 4.169645516830504, "grad_norm": 0.5747356414794922, "learning_rate": 4.8229126387706516e-05, "loss": 0.672, "num_input_tokens_seen": 16247408, "step": 27995 }, { "epoch": 4.1703902293714625, "grad_norm": 0.7265253663063049, "learning_rate": 4.822792499372732e-05, "loss": 0.8405, "num_input_tokens_seen": 16250544, "step": 28000 }, { "epoch": 4.171134941912422, "grad_norm": 2.033522844314575, "learning_rate": 4.822672320733594e-05, "loss": 0.7509, "num_input_tokens_seen": 16253232, "step": 28005 }, { "epoch": 4.171879654453381, "grad_norm": 0.6534501910209656, "learning_rate": 4.822552102855267e-05, "loss": 0.5284, "num_input_tokens_seen": 16256368, "step": 28010 }, { "epoch": 4.172624366994341, "grad_norm": 0.8771116137504578, "learning_rate": 4.822431845739783e-05, "loss": 0.7707, "num_input_tokens_seen": 16259312, "step": 28015 }, { "epoch": 4.173369079535299, "grad_norm": 0.9292386174201965, "learning_rate": 4.822311549389174e-05, "loss": 0.766, "num_input_tokens_seen": 16262128, "step": 28020 }, { "epoch": 4.174113792076259, "grad_norm": 0.763704240322113, "learning_rate": 4.8221912138054715e-05, "loss": 0.5658, "num_input_tokens_seen": 16265232, "step": 28025 }, { "epoch": 4.174858504617218, "grad_norm": 0.9909020662307739, "learning_rate": 4.822070838990708e-05, "loss": 0.6967, "num_input_tokens_seen": 16268240, "step": 28030 }, { "epoch": 4.1756032171581765, "grad_norm": 1.2035796642303467, "learning_rate": 4.8219504249469186e-05, "loss": 0.7195, "num_input_tokens_seen": 16270896, "step": 28035 }, { "epoch": 4.176347929699136, "grad_norm": 1.2590230703353882, "learning_rate": 4.821829971676136e-05, "loss": 0.8105, "num_input_tokens_seen": 16273488, "step": 28040 }, { "epoch": 4.177092642240095, "grad_norm": 1.322840929031372, "learning_rate": 4.8217094791803966e-05, "loss": 0.7259, "num_input_tokens_seen": 16276048, "step": 28045 }, { "epoch": 4.177837354781055, "grad_norm": 1.0690218210220337, "learning_rate": 4.821588947461734e-05, "loss": 0.76, "num_input_tokens_seen": 16279120, "step": 28050 }, { "epoch": 4.178582067322013, "grad_norm": 1.8048923015594482, "learning_rate": 4.821468376522186e-05, "loss": 0.6521, "num_input_tokens_seen": 16282352, "step": 28055 }, { "epoch": 4.179326779862973, "grad_norm": 1.6260713338851929, "learning_rate": 4.82134776636379e-05, "loss": 0.8062, "num_input_tokens_seen": 16285008, "step": 28060 }, { "epoch": 4.180071492403932, "grad_norm": 1.0482498407363892, "learning_rate": 4.821227116988583e-05, "loss": 0.6495, "num_input_tokens_seen": 16288048, "step": 28065 }, { "epoch": 4.180816204944891, "grad_norm": 0.8147642612457275, "learning_rate": 4.8211064283986015e-05, "loss": 0.6158, "num_input_tokens_seen": 16291184, "step": 28070 }, { "epoch": 4.18156091748585, "grad_norm": 1.0632317066192627, "learning_rate": 4.8209857005958866e-05, "loss": 0.7707, "num_input_tokens_seen": 16293936, "step": 28075 }, { "epoch": 4.18230563002681, "grad_norm": 0.9181168675422668, "learning_rate": 4.820864933582478e-05, "loss": 0.6651, "num_input_tokens_seen": 16297040, "step": 28080 }, { "epoch": 4.1830503425677685, "grad_norm": 0.9117407202720642, "learning_rate": 4.8207441273604145e-05, "loss": 0.7492, "num_input_tokens_seen": 16300272, "step": 28085 }, { "epoch": 4.183795055108728, "grad_norm": 0.9077396392822266, "learning_rate": 4.820623281931738e-05, "loss": 0.6506, "num_input_tokens_seen": 16303312, "step": 28090 }, { "epoch": 4.184539767649687, "grad_norm": 0.8355264663696289, "learning_rate": 4.8205023972984896e-05, "loss": 0.6433, "num_input_tokens_seen": 16306512, "step": 28095 }, { "epoch": 4.185284480190647, "grad_norm": 1.5535508394241333, "learning_rate": 4.820381473462712e-05, "loss": 0.6567, "num_input_tokens_seen": 16309104, "step": 28100 }, { "epoch": 4.186029192731605, "grad_norm": 1.0864320993423462, "learning_rate": 4.820260510426447e-05, "loss": 0.6491, "num_input_tokens_seen": 16311920, "step": 28105 }, { "epoch": 4.186773905272565, "grad_norm": 0.9377768039703369, "learning_rate": 4.820139508191739e-05, "loss": 0.8212, "num_input_tokens_seen": 16314768, "step": 28110 }, { "epoch": 4.187518617813524, "grad_norm": 0.8944258093833923, "learning_rate": 4.820018466760633e-05, "loss": 0.7186, "num_input_tokens_seen": 16317392, "step": 28115 }, { "epoch": 4.188263330354483, "grad_norm": 0.9145146608352661, "learning_rate": 4.819897386135172e-05, "loss": 0.6667, "num_input_tokens_seen": 16320432, "step": 28120 }, { "epoch": 4.189008042895442, "grad_norm": 0.9659302234649658, "learning_rate": 4.819776266317403e-05, "loss": 0.6227, "num_input_tokens_seen": 16323120, "step": 28125 }, { "epoch": 4.189752755436402, "grad_norm": 0.8384186625480652, "learning_rate": 4.819655107309371e-05, "loss": 0.5215, "num_input_tokens_seen": 16325904, "step": 28130 }, { "epoch": 4.190497467977361, "grad_norm": 0.9851367473602295, "learning_rate": 4.819533909113124e-05, "loss": 0.6218, "num_input_tokens_seen": 16328848, "step": 28135 }, { "epoch": 4.19124218051832, "grad_norm": 1.0964914560317993, "learning_rate": 4.819412671730709e-05, "loss": 0.689, "num_input_tokens_seen": 16331632, "step": 28140 }, { "epoch": 4.191986893059279, "grad_norm": 0.9903561472892761, "learning_rate": 4.8192913951641746e-05, "loss": 0.5951, "num_input_tokens_seen": 16334160, "step": 28145 }, { "epoch": 4.192731605600239, "grad_norm": 1.947788953781128, "learning_rate": 4.819170079415569e-05, "loss": 0.6384, "num_input_tokens_seen": 16336816, "step": 28150 }, { "epoch": 4.193476318141197, "grad_norm": 1.0075604915618896, "learning_rate": 4.819048724486942e-05, "loss": 0.6052, "num_input_tokens_seen": 16339600, "step": 28155 }, { "epoch": 4.194221030682157, "grad_norm": 1.7323390245437622, "learning_rate": 4.818927330380344e-05, "loss": 0.6526, "num_input_tokens_seen": 16342480, "step": 28160 }, { "epoch": 4.194965743223116, "grad_norm": 1.819281816482544, "learning_rate": 4.8188058970978254e-05, "loss": 0.7859, "num_input_tokens_seen": 16345616, "step": 28165 }, { "epoch": 4.195710455764075, "grad_norm": 0.6521884202957153, "learning_rate": 4.818684424641438e-05, "loss": 0.5914, "num_input_tokens_seen": 16348272, "step": 28170 }, { "epoch": 4.196455168305034, "grad_norm": 0.7080375552177429, "learning_rate": 4.8185629130132336e-05, "loss": 0.715, "num_input_tokens_seen": 16351184, "step": 28175 }, { "epoch": 4.197199880845994, "grad_norm": 0.8523656129837036, "learning_rate": 4.818441362215266e-05, "loss": 0.729, "num_input_tokens_seen": 16354000, "step": 28180 }, { "epoch": 4.197944593386953, "grad_norm": 1.658069372177124, "learning_rate": 4.8183197722495877e-05, "loss": 0.761, "num_input_tokens_seen": 16356688, "step": 28185 }, { "epoch": 4.198689305927912, "grad_norm": 1.1509013175964355, "learning_rate": 4.8181981431182523e-05, "loss": 0.6484, "num_input_tokens_seen": 16359728, "step": 28190 }, { "epoch": 4.199434018468871, "grad_norm": 0.8483501076698303, "learning_rate": 4.818076474823316e-05, "loss": 0.7074, "num_input_tokens_seen": 16362736, "step": 28195 }, { "epoch": 4.200178731009831, "grad_norm": 1.072043538093567, "learning_rate": 4.817954767366833e-05, "loss": 0.6314, "num_input_tokens_seen": 16365456, "step": 28200 }, { "epoch": 4.200923443550789, "grad_norm": 0.9613244533538818, "learning_rate": 4.817833020750861e-05, "loss": 0.6323, "num_input_tokens_seen": 16368080, "step": 28205 }, { "epoch": 4.201668156091749, "grad_norm": 1.1383445262908936, "learning_rate": 4.8177112349774554e-05, "loss": 0.5506, "num_input_tokens_seen": 16371088, "step": 28210 }, { "epoch": 4.202412868632708, "grad_norm": 1.020517110824585, "learning_rate": 4.817589410048674e-05, "loss": 0.6982, "num_input_tokens_seen": 16374224, "step": 28215 }, { "epoch": 4.203157581173667, "grad_norm": 1.2719708681106567, "learning_rate": 4.817467545966575e-05, "loss": 0.5725, "num_input_tokens_seen": 16376848, "step": 28220 }, { "epoch": 4.203902293714626, "grad_norm": 1.3693522214889526, "learning_rate": 4.8173456427332176e-05, "loss": 0.6789, "num_input_tokens_seen": 16380112, "step": 28225 }, { "epoch": 4.204647006255585, "grad_norm": 0.9443467855453491, "learning_rate": 4.817223700350661e-05, "loss": 0.5886, "num_input_tokens_seen": 16382960, "step": 28230 }, { "epoch": 4.205391718796545, "grad_norm": 1.0704731941223145, "learning_rate": 4.817101718820965e-05, "loss": 0.5775, "num_input_tokens_seen": 16385776, "step": 28235 }, { "epoch": 4.206136431337503, "grad_norm": 0.7243302464485168, "learning_rate": 4.8169796981461904e-05, "loss": 0.5057, "num_input_tokens_seen": 16388592, "step": 28240 }, { "epoch": 4.206881143878463, "grad_norm": 1.1887307167053223, "learning_rate": 4.816857638328398e-05, "loss": 0.7344, "num_input_tokens_seen": 16391504, "step": 28245 }, { "epoch": 4.207625856419422, "grad_norm": 0.7302590012550354, "learning_rate": 4.816735539369651e-05, "loss": 0.6393, "num_input_tokens_seen": 16394672, "step": 28250 }, { "epoch": 4.208370568960381, "grad_norm": 0.9174990653991699, "learning_rate": 4.816613401272011e-05, "loss": 0.7294, "num_input_tokens_seen": 16397616, "step": 28255 }, { "epoch": 4.20911528150134, "grad_norm": 0.9259841442108154, "learning_rate": 4.816491224037543e-05, "loss": 0.5204, "num_input_tokens_seen": 16400400, "step": 28260 }, { "epoch": 4.2098599940423, "grad_norm": 0.8113053441047668, "learning_rate": 4.81636900766831e-05, "loss": 0.4177, "num_input_tokens_seen": 16403312, "step": 28265 }, { "epoch": 4.210604706583259, "grad_norm": 0.8489245772361755, "learning_rate": 4.816246752166377e-05, "loss": 0.5218, "num_input_tokens_seen": 16406160, "step": 28270 }, { "epoch": 4.211349419124218, "grad_norm": 0.49574458599090576, "learning_rate": 4.8161244575338086e-05, "loss": 0.495, "num_input_tokens_seen": 16409296, "step": 28275 }, { "epoch": 4.212094131665177, "grad_norm": 1.7732661962509155, "learning_rate": 4.816002123772672e-05, "loss": 0.6278, "num_input_tokens_seen": 16412144, "step": 28280 }, { "epoch": 4.212838844206137, "grad_norm": 1.0822434425354004, "learning_rate": 4.815879750885033e-05, "loss": 0.8183, "num_input_tokens_seen": 16415056, "step": 28285 }, { "epoch": 4.213583556747095, "grad_norm": 0.8115336298942566, "learning_rate": 4.81575733887296e-05, "loss": 0.7039, "num_input_tokens_seen": 16417904, "step": 28290 }, { "epoch": 4.214328269288055, "grad_norm": 1.2946721315383911, "learning_rate": 4.81563488773852e-05, "loss": 0.6325, "num_input_tokens_seen": 16420432, "step": 28295 }, { "epoch": 4.215072981829014, "grad_norm": 0.9421321749687195, "learning_rate": 4.8155123974837824e-05, "loss": 0.6731, "num_input_tokens_seen": 16423408, "step": 28300 }, { "epoch": 4.2158176943699734, "grad_norm": 1.784285306930542, "learning_rate": 4.815389868110816e-05, "loss": 0.9243, "num_input_tokens_seen": 16426576, "step": 28305 }, { "epoch": 4.216562406910932, "grad_norm": 0.9326374530792236, "learning_rate": 4.815267299621691e-05, "loss": 0.6954, "num_input_tokens_seen": 16429424, "step": 28310 }, { "epoch": 4.217307119451892, "grad_norm": 1.0074635744094849, "learning_rate": 4.815144692018477e-05, "loss": 0.6003, "num_input_tokens_seen": 16432368, "step": 28315 }, { "epoch": 4.218051831992851, "grad_norm": 0.9687261581420898, "learning_rate": 4.815022045303248e-05, "loss": 0.6607, "num_input_tokens_seen": 16435216, "step": 28320 }, { "epoch": 4.21879654453381, "grad_norm": 0.9029532074928284, "learning_rate": 4.814899359478074e-05, "loss": 0.7854, "num_input_tokens_seen": 16438576, "step": 28325 }, { "epoch": 4.219541257074769, "grad_norm": 1.1951676607131958, "learning_rate": 4.814776634545028e-05, "loss": 0.8595, "num_input_tokens_seen": 16441392, "step": 28330 }, { "epoch": 4.220285969615729, "grad_norm": 2.3748087882995605, "learning_rate": 4.814653870506183e-05, "loss": 0.7814, "num_input_tokens_seen": 16444208, "step": 28335 }, { "epoch": 4.221030682156687, "grad_norm": 0.6594576835632324, "learning_rate": 4.8145310673636143e-05, "loss": 0.717, "num_input_tokens_seen": 16446896, "step": 28340 }, { "epoch": 4.221775394697647, "grad_norm": 1.1625269651412964, "learning_rate": 4.814408225119395e-05, "loss": 0.7322, "num_input_tokens_seen": 16449680, "step": 28345 }, { "epoch": 4.222520107238606, "grad_norm": 1.3998161554336548, "learning_rate": 4.8142853437756006e-05, "loss": 0.5624, "num_input_tokens_seen": 16452528, "step": 28350 }, { "epoch": 4.2232648197795655, "grad_norm": 1.0735334157943726, "learning_rate": 4.814162423334309e-05, "loss": 0.767, "num_input_tokens_seen": 16455824, "step": 28355 }, { "epoch": 4.224009532320524, "grad_norm": 1.2443417310714722, "learning_rate": 4.814039463797594e-05, "loss": 0.625, "num_input_tokens_seen": 16458672, "step": 28360 }, { "epoch": 4.224754244861484, "grad_norm": 1.8922494649887085, "learning_rate": 4.813916465167534e-05, "loss": 0.6475, "num_input_tokens_seen": 16461584, "step": 28365 }, { "epoch": 4.225498957402443, "grad_norm": 0.7263465523719788, "learning_rate": 4.813793427446207e-05, "loss": 0.7037, "num_input_tokens_seen": 16464368, "step": 28370 }, { "epoch": 4.226243669943402, "grad_norm": 1.1221121549606323, "learning_rate": 4.813670350635693e-05, "loss": 0.6833, "num_input_tokens_seen": 16467440, "step": 28375 }, { "epoch": 4.226988382484361, "grad_norm": 1.0656611919403076, "learning_rate": 4.8135472347380684e-05, "loss": 0.5885, "num_input_tokens_seen": 16470192, "step": 28380 }, { "epoch": 4.22773309502532, "grad_norm": 0.7329288721084595, "learning_rate": 4.8134240797554155e-05, "loss": 0.786, "num_input_tokens_seen": 16473552, "step": 28385 }, { "epoch": 4.2284778075662794, "grad_norm": 1.55256986618042, "learning_rate": 4.813300885689814e-05, "loss": 0.8258, "num_input_tokens_seen": 16476368, "step": 28390 }, { "epoch": 4.229222520107238, "grad_norm": 1.027690052986145, "learning_rate": 4.813177652543345e-05, "loss": 0.6429, "num_input_tokens_seen": 16479120, "step": 28395 }, { "epoch": 4.229967232648198, "grad_norm": 1.0918169021606445, "learning_rate": 4.813054380318091e-05, "loss": 0.7095, "num_input_tokens_seen": 16482064, "step": 28400 }, { "epoch": 4.230711945189157, "grad_norm": 0.9725301861763, "learning_rate": 4.8129310690161335e-05, "loss": 0.5608, "num_input_tokens_seen": 16484688, "step": 28405 }, { "epoch": 4.231456657730116, "grad_norm": 0.7883470058441162, "learning_rate": 4.812807718639556e-05, "loss": 0.6102, "num_input_tokens_seen": 16487536, "step": 28410 }, { "epoch": 4.232201370271075, "grad_norm": 1.1136201620101929, "learning_rate": 4.812684329190443e-05, "loss": 0.7626, "num_input_tokens_seen": 16490000, "step": 28415 }, { "epoch": 4.232946082812035, "grad_norm": 1.2472721338272095, "learning_rate": 4.8125609006708796e-05, "loss": 0.5612, "num_input_tokens_seen": 16492880, "step": 28420 }, { "epoch": 4.233690795352993, "grad_norm": 0.8739681243896484, "learning_rate": 4.812437433082949e-05, "loss": 0.639, "num_input_tokens_seen": 16495600, "step": 28425 }, { "epoch": 4.234435507893953, "grad_norm": 1.0601130723953247, "learning_rate": 4.812313926428739e-05, "loss": 0.6807, "num_input_tokens_seen": 16498480, "step": 28430 }, { "epoch": 4.235180220434912, "grad_norm": 0.9736555814743042, "learning_rate": 4.812190380710335e-05, "loss": 0.7283, "num_input_tokens_seen": 16501424, "step": 28435 }, { "epoch": 4.2359249329758715, "grad_norm": 0.908449649810791, "learning_rate": 4.812066795929825e-05, "loss": 0.5636, "num_input_tokens_seen": 16504400, "step": 28440 }, { "epoch": 4.23666964551683, "grad_norm": 1.0498895645141602, "learning_rate": 4.811943172089296e-05, "loss": 0.6232, "num_input_tokens_seen": 16507248, "step": 28445 }, { "epoch": 4.23741435805779, "grad_norm": 1.1357529163360596, "learning_rate": 4.811819509190837e-05, "loss": 0.6878, "num_input_tokens_seen": 16510032, "step": 28450 }, { "epoch": 4.238159070598749, "grad_norm": 1.0243600606918335, "learning_rate": 4.811695807236537e-05, "loss": 0.9294, "num_input_tokens_seen": 16512816, "step": 28455 }, { "epoch": 4.238903783139708, "grad_norm": 1.038613200187683, "learning_rate": 4.8115720662284855e-05, "loss": 0.5263, "num_input_tokens_seen": 16515504, "step": 28460 }, { "epoch": 4.239648495680667, "grad_norm": 1.479133129119873, "learning_rate": 4.8114482861687734e-05, "loss": 0.6276, "num_input_tokens_seen": 16518128, "step": 28465 }, { "epoch": 4.240393208221627, "grad_norm": 0.68373703956604, "learning_rate": 4.8113244670594926e-05, "loss": 0.4951, "num_input_tokens_seen": 16521040, "step": 28470 }, { "epoch": 4.2411379207625854, "grad_norm": 0.9785385131835938, "learning_rate": 4.811200608902733e-05, "loss": 0.7402, "num_input_tokens_seen": 16524112, "step": 28475 }, { "epoch": 4.241882633303545, "grad_norm": 0.9776009321212769, "learning_rate": 4.811076711700588e-05, "loss": 0.7005, "num_input_tokens_seen": 16526800, "step": 28480 }, { "epoch": 4.242627345844504, "grad_norm": 0.8198614716529846, "learning_rate": 4.810952775455152e-05, "loss": 0.7243, "num_input_tokens_seen": 16529488, "step": 28485 }, { "epoch": 4.2433720583854635, "grad_norm": 0.8345264196395874, "learning_rate": 4.810828800168517e-05, "loss": 0.5124, "num_input_tokens_seen": 16532336, "step": 28490 }, { "epoch": 4.244116770926422, "grad_norm": 0.7563048005104065, "learning_rate": 4.810704785842778e-05, "loss": 0.6501, "num_input_tokens_seen": 16535024, "step": 28495 }, { "epoch": 4.244861483467382, "grad_norm": 0.9830135703086853, "learning_rate": 4.81058073248003e-05, "loss": 0.6576, "num_input_tokens_seen": 16537872, "step": 28500 }, { "epoch": 4.245606196008341, "grad_norm": 1.5031452178955078, "learning_rate": 4.810456640082369e-05, "loss": 0.6848, "num_input_tokens_seen": 16540624, "step": 28505 }, { "epoch": 4.2463509085493, "grad_norm": 1.0626343488693237, "learning_rate": 4.810332508651891e-05, "loss": 0.6642, "num_input_tokens_seen": 16543632, "step": 28510 }, { "epoch": 4.247095621090259, "grad_norm": 0.645421028137207, "learning_rate": 4.810208338190694e-05, "loss": 0.511, "num_input_tokens_seen": 16546480, "step": 28515 }, { "epoch": 4.247840333631219, "grad_norm": 1.5709528923034668, "learning_rate": 4.810084128700875e-05, "loss": 0.575, "num_input_tokens_seen": 16549328, "step": 28520 }, { "epoch": 4.2485850461721775, "grad_norm": 0.8528809547424316, "learning_rate": 4.809959880184532e-05, "loss": 0.6333, "num_input_tokens_seen": 16552144, "step": 28525 }, { "epoch": 4.249329758713137, "grad_norm": 1.3118376731872559, "learning_rate": 4.8098355926437655e-05, "loss": 0.635, "num_input_tokens_seen": 16555120, "step": 28530 }, { "epoch": 4.250074471254096, "grad_norm": 1.2738614082336426, "learning_rate": 4.809711266080673e-05, "loss": 0.6283, "num_input_tokens_seen": 16558032, "step": 28535 }, { "epoch": 4.2508191837950555, "grad_norm": 0.9182602167129517, "learning_rate": 4.809586900497357e-05, "loss": 0.6201, "num_input_tokens_seen": 16561040, "step": 28540 }, { "epoch": 4.251563896336014, "grad_norm": 1.3462141752243042, "learning_rate": 4.809462495895918e-05, "loss": 0.6796, "num_input_tokens_seen": 16563792, "step": 28545 }, { "epoch": 4.252308608876973, "grad_norm": 1.407271146774292, "learning_rate": 4.809338052278456e-05, "loss": 0.8042, "num_input_tokens_seen": 16566672, "step": 28550 }, { "epoch": 4.253053321417933, "grad_norm": 0.7904491424560547, "learning_rate": 4.809213569647076e-05, "loss": 0.735, "num_input_tokens_seen": 16569648, "step": 28555 }, { "epoch": 4.253798033958892, "grad_norm": 0.8792999386787415, "learning_rate": 4.8090890480038796e-05, "loss": 0.5572, "num_input_tokens_seen": 16572432, "step": 28560 }, { "epoch": 4.254542746499851, "grad_norm": 0.9316447377204895, "learning_rate": 4.80896448735097e-05, "loss": 0.4275, "num_input_tokens_seen": 16575088, "step": 28565 }, { "epoch": 4.25528745904081, "grad_norm": 0.7251430153846741, "learning_rate": 4.8088398876904526e-05, "loss": 0.5839, "num_input_tokens_seen": 16578096, "step": 28570 }, { "epoch": 4.2560321715817695, "grad_norm": 1.7297828197479248, "learning_rate": 4.808715249024431e-05, "loss": 0.6837, "num_input_tokens_seen": 16580944, "step": 28575 }, { "epoch": 4.256776884122728, "grad_norm": 0.6818189024925232, "learning_rate": 4.808590571355013e-05, "loss": 0.7907, "num_input_tokens_seen": 16583664, "step": 28580 }, { "epoch": 4.257521596663688, "grad_norm": 1.0134421586990356, "learning_rate": 4.808465854684303e-05, "loss": 0.7083, "num_input_tokens_seen": 16586640, "step": 28585 }, { "epoch": 4.258266309204647, "grad_norm": 1.2017902135849, "learning_rate": 4.8083410990144085e-05, "loss": 0.7547, "num_input_tokens_seen": 16589424, "step": 28590 }, { "epoch": 4.259011021745606, "grad_norm": 1.093208909034729, "learning_rate": 4.808216304347438e-05, "loss": 0.4882, "num_input_tokens_seen": 16592112, "step": 28595 }, { "epoch": 4.259755734286565, "grad_norm": 1.1059494018554688, "learning_rate": 4.8080914706854985e-05, "loss": 0.5418, "num_input_tokens_seen": 16595120, "step": 28600 }, { "epoch": 4.260500446827525, "grad_norm": 0.8494677543640137, "learning_rate": 4.8079665980306986e-05, "loss": 0.6608, "num_input_tokens_seen": 16597808, "step": 28605 }, { "epoch": 4.2612451593684835, "grad_norm": 0.9876880049705505, "learning_rate": 4.80784168638515e-05, "loss": 0.6058, "num_input_tokens_seen": 16600656, "step": 28610 }, { "epoch": 4.261989871909443, "grad_norm": 0.9363489151000977, "learning_rate": 4.807716735750961e-05, "loss": 0.6879, "num_input_tokens_seen": 16603408, "step": 28615 }, { "epoch": 4.262734584450402, "grad_norm": 1.838894009590149, "learning_rate": 4.8075917461302435e-05, "loss": 0.603, "num_input_tokens_seen": 16606640, "step": 28620 }, { "epoch": 4.2634792969913615, "grad_norm": 1.3176721334457397, "learning_rate": 4.807466717525109e-05, "loss": 0.7214, "num_input_tokens_seen": 16609360, "step": 28625 }, { "epoch": 4.26422400953232, "grad_norm": 0.6362738013267517, "learning_rate": 4.807341649937669e-05, "loss": 0.6104, "num_input_tokens_seen": 16612112, "step": 28630 }, { "epoch": 4.26496872207328, "grad_norm": 1.3586561679840088, "learning_rate": 4.8072165433700366e-05, "loss": 0.6399, "num_input_tokens_seen": 16615248, "step": 28635 }, { "epoch": 4.265713434614239, "grad_norm": 0.8601013422012329, "learning_rate": 4.807091397824327e-05, "loss": 0.5859, "num_input_tokens_seen": 16618256, "step": 28640 }, { "epoch": 4.266458147155198, "grad_norm": 0.8813484907150269, "learning_rate": 4.806966213302652e-05, "loss": 0.5885, "num_input_tokens_seen": 16621616, "step": 28645 }, { "epoch": 4.267202859696157, "grad_norm": 0.8559757471084595, "learning_rate": 4.806840989807128e-05, "loss": 0.6213, "num_input_tokens_seen": 16624336, "step": 28650 }, { "epoch": 4.267947572237117, "grad_norm": 1.3289493322372437, "learning_rate": 4.806715727339869e-05, "loss": 0.6512, "num_input_tokens_seen": 16627376, "step": 28655 }, { "epoch": 4.2686922847780755, "grad_norm": 0.5213431119918823, "learning_rate": 4.8065904259029934e-05, "loss": 0.6452, "num_input_tokens_seen": 16630512, "step": 28660 }, { "epoch": 4.269436997319035, "grad_norm": 2.025944232940674, "learning_rate": 4.806465085498616e-05, "loss": 0.7227, "num_input_tokens_seen": 16633424, "step": 28665 }, { "epoch": 4.270181709859994, "grad_norm": 0.9123669862747192, "learning_rate": 4.806339706128856e-05, "loss": 0.452, "num_input_tokens_seen": 16636336, "step": 28670 }, { "epoch": 4.2709264224009535, "grad_norm": 1.00993013381958, "learning_rate": 4.8062142877958307e-05, "loss": 0.7218, "num_input_tokens_seen": 16638928, "step": 28675 }, { "epoch": 4.271671134941912, "grad_norm": 1.6554192304611206, "learning_rate": 4.8060888305016584e-05, "loss": 0.7713, "num_input_tokens_seen": 16641872, "step": 28680 }, { "epoch": 4.272415847482872, "grad_norm": 1.7660291194915771, "learning_rate": 4.8059633342484586e-05, "loss": 0.8268, "num_input_tokens_seen": 16644560, "step": 28685 }, { "epoch": 4.273160560023831, "grad_norm": 1.4130152463912964, "learning_rate": 4.805837799038353e-05, "loss": 0.7674, "num_input_tokens_seen": 16647344, "step": 28690 }, { "epoch": 4.27390527256479, "grad_norm": 1.0980292558670044, "learning_rate": 4.805712224873461e-05, "loss": 0.4938, "num_input_tokens_seen": 16650224, "step": 28695 }, { "epoch": 4.274649985105749, "grad_norm": 1.2713834047317505, "learning_rate": 4.805586611755905e-05, "loss": 0.7024, "num_input_tokens_seen": 16653424, "step": 28700 }, { "epoch": 4.275394697646709, "grad_norm": 0.7844905257225037, "learning_rate": 4.805460959687805e-05, "loss": 0.5884, "num_input_tokens_seen": 16656112, "step": 28705 }, { "epoch": 4.2761394101876675, "grad_norm": 0.74845951795578, "learning_rate": 4.805335268671286e-05, "loss": 0.5781, "num_input_tokens_seen": 16658832, "step": 28710 }, { "epoch": 4.276884122728626, "grad_norm": 0.8691748976707458, "learning_rate": 4.805209538708471e-05, "loss": 0.5826, "num_input_tokens_seen": 16661840, "step": 28715 }, { "epoch": 4.277628835269586, "grad_norm": 0.8273022770881653, "learning_rate": 4.805083769801484e-05, "loss": 0.694, "num_input_tokens_seen": 16665008, "step": 28720 }, { "epoch": 4.278373547810546, "grad_norm": 0.960016131401062, "learning_rate": 4.804957961952449e-05, "loss": 0.6793, "num_input_tokens_seen": 16668080, "step": 28725 }, { "epoch": 4.279118260351504, "grad_norm": 0.5787007808685303, "learning_rate": 4.804832115163491e-05, "loss": 0.6209, "num_input_tokens_seen": 16670928, "step": 28730 }, { "epoch": 4.279862972892463, "grad_norm": 1.2172818183898926, "learning_rate": 4.804706229436739e-05, "loss": 0.7177, "num_input_tokens_seen": 16673296, "step": 28735 }, { "epoch": 4.280607685433423, "grad_norm": 0.7229565382003784, "learning_rate": 4.804580304774316e-05, "loss": 0.7002, "num_input_tokens_seen": 16676208, "step": 28740 }, { "epoch": 4.2813523979743815, "grad_norm": 0.8752335906028748, "learning_rate": 4.804454341178352e-05, "loss": 0.7298, "num_input_tokens_seen": 16679248, "step": 28745 }, { "epoch": 4.282097110515341, "grad_norm": 0.8047708868980408, "learning_rate": 4.804328338650973e-05, "loss": 0.5163, "num_input_tokens_seen": 16682128, "step": 28750 }, { "epoch": 4.2828418230563, "grad_norm": 1.2383564710617065, "learning_rate": 4.804202297194309e-05, "loss": 0.7648, "num_input_tokens_seen": 16685040, "step": 28755 }, { "epoch": 4.2835865355972595, "grad_norm": 1.77450430393219, "learning_rate": 4.8040762168104895e-05, "loss": 0.595, "num_input_tokens_seen": 16687696, "step": 28760 }, { "epoch": 4.284331248138218, "grad_norm": 1.0869462490081787, "learning_rate": 4.803950097501644e-05, "loss": 0.614, "num_input_tokens_seen": 16690448, "step": 28765 }, { "epoch": 4.285075960679178, "grad_norm": 1.1030616760253906, "learning_rate": 4.8038239392699033e-05, "loss": 0.7252, "num_input_tokens_seen": 16693264, "step": 28770 }, { "epoch": 4.285820673220137, "grad_norm": 1.1371498107910156, "learning_rate": 4.803697742117399e-05, "loss": 0.7797, "num_input_tokens_seen": 16695984, "step": 28775 }, { "epoch": 4.286565385761096, "grad_norm": 1.3143621683120728, "learning_rate": 4.8035715060462614e-05, "loss": 0.6737, "num_input_tokens_seen": 16698864, "step": 28780 }, { "epoch": 4.287310098302055, "grad_norm": 0.6366613507270813, "learning_rate": 4.803445231058625e-05, "loss": 0.645, "num_input_tokens_seen": 16701648, "step": 28785 }, { "epoch": 4.288054810843015, "grad_norm": 1.2111029624938965, "learning_rate": 4.803318917156624e-05, "loss": 0.7388, "num_input_tokens_seen": 16704528, "step": 28790 }, { "epoch": 4.2887995233839735, "grad_norm": 1.3821707963943481, "learning_rate": 4.803192564342389e-05, "loss": 0.6905, "num_input_tokens_seen": 16707344, "step": 28795 }, { "epoch": 4.289544235924933, "grad_norm": 0.9675589799880981, "learning_rate": 4.803066172618058e-05, "loss": 0.6729, "num_input_tokens_seen": 16710320, "step": 28800 }, { "epoch": 4.290288948465892, "grad_norm": 1.090083360671997, "learning_rate": 4.802939741985763e-05, "loss": 0.8272, "num_input_tokens_seen": 16713520, "step": 28805 }, { "epoch": 4.291033661006852, "grad_norm": 1.0434248447418213, "learning_rate": 4.802813272447643e-05, "loss": 0.6883, "num_input_tokens_seen": 16716304, "step": 28810 }, { "epoch": 4.29177837354781, "grad_norm": 2.0873780250549316, "learning_rate": 4.8026867640058335e-05, "loss": 0.8153, "num_input_tokens_seen": 16719152, "step": 28815 }, { "epoch": 4.29252308608877, "grad_norm": 0.9762817025184631, "learning_rate": 4.8025602166624705e-05, "loss": 0.539, "num_input_tokens_seen": 16722096, "step": 28820 }, { "epoch": 4.293267798629729, "grad_norm": 1.267244815826416, "learning_rate": 4.8024336304196927e-05, "loss": 0.6925, "num_input_tokens_seen": 16724880, "step": 28825 }, { "epoch": 4.294012511170688, "grad_norm": 0.7729890942573547, "learning_rate": 4.802307005279639e-05, "loss": 0.6837, "num_input_tokens_seen": 16727792, "step": 28830 }, { "epoch": 4.294757223711647, "grad_norm": 0.7390091419219971, "learning_rate": 4.8021803412444496e-05, "loss": 0.6142, "num_input_tokens_seen": 16730896, "step": 28835 }, { "epoch": 4.295501936252607, "grad_norm": 1.0043607950210571, "learning_rate": 4.8020536383162615e-05, "loss": 0.6668, "num_input_tokens_seen": 16733872, "step": 28840 }, { "epoch": 4.2962466487935655, "grad_norm": 0.9936238527297974, "learning_rate": 4.8019268964972184e-05, "loss": 0.557, "num_input_tokens_seen": 16736528, "step": 28845 }, { "epoch": 4.296991361334525, "grad_norm": 1.1277745962142944, "learning_rate": 4.801800115789459e-05, "loss": 0.7029, "num_input_tokens_seen": 16739344, "step": 28850 }, { "epoch": 4.297736073875484, "grad_norm": 0.8812772631645203, "learning_rate": 4.801673296195126e-05, "loss": 0.6991, "num_input_tokens_seen": 16742416, "step": 28855 }, { "epoch": 4.298480786416444, "grad_norm": 1.060455322265625, "learning_rate": 4.801546437716362e-05, "loss": 0.5966, "num_input_tokens_seen": 16745456, "step": 28860 }, { "epoch": 4.299225498957402, "grad_norm": 0.7599489688873291, "learning_rate": 4.801419540355311e-05, "loss": 0.7969, "num_input_tokens_seen": 16748432, "step": 28865 }, { "epoch": 4.299970211498362, "grad_norm": 0.9190466403961182, "learning_rate": 4.801292604114115e-05, "loss": 0.7173, "num_input_tokens_seen": 16751408, "step": 28870 }, { "epoch": 4.300714924039321, "grad_norm": 1.1061848402023315, "learning_rate": 4.80116562899492e-05, "loss": 0.7301, "num_input_tokens_seen": 16754384, "step": 28875 }, { "epoch": 4.30145963658028, "grad_norm": 1.0979279279708862, "learning_rate": 4.80103861499987e-05, "loss": 0.83, "num_input_tokens_seen": 16757104, "step": 28880 }, { "epoch": 4.302204349121239, "grad_norm": 1.13933265209198, "learning_rate": 4.800911562131112e-05, "loss": 0.578, "num_input_tokens_seen": 16759824, "step": 28885 }, { "epoch": 4.302949061662199, "grad_norm": 1.377500295639038, "learning_rate": 4.800784470390791e-05, "loss": 0.5747, "num_input_tokens_seen": 16762800, "step": 28890 }, { "epoch": 4.303693774203158, "grad_norm": 0.7288435697555542, "learning_rate": 4.800657339781055e-05, "loss": 0.5684, "num_input_tokens_seen": 16765616, "step": 28895 }, { "epoch": 4.304438486744116, "grad_norm": 0.7074532508850098, "learning_rate": 4.800530170304051e-05, "loss": 0.5768, "num_input_tokens_seen": 16768272, "step": 28900 }, { "epoch": 4.305183199285076, "grad_norm": 1.44046950340271, "learning_rate": 4.800402961961928e-05, "loss": 0.6699, "num_input_tokens_seen": 16771088, "step": 28905 }, { "epoch": 4.305927911826035, "grad_norm": 1.018841028213501, "learning_rate": 4.800275714756836e-05, "loss": 0.7424, "num_input_tokens_seen": 16774352, "step": 28910 }, { "epoch": 4.306672624366994, "grad_norm": 0.9118160605430603, "learning_rate": 4.800148428690923e-05, "loss": 0.6171, "num_input_tokens_seen": 16777296, "step": 28915 }, { "epoch": 4.307417336907953, "grad_norm": 0.983877956867218, "learning_rate": 4.80002110376634e-05, "loss": 0.6177, "num_input_tokens_seen": 16780144, "step": 28920 }, { "epoch": 4.308162049448913, "grad_norm": 0.7656976580619812, "learning_rate": 4.7998937399852386e-05, "loss": 0.8175, "num_input_tokens_seen": 16782992, "step": 28925 }, { "epoch": 4.3089067619898715, "grad_norm": 0.6929761171340942, "learning_rate": 4.799766337349769e-05, "loss": 0.798, "num_input_tokens_seen": 16786192, "step": 28930 }, { "epoch": 4.309651474530831, "grad_norm": 1.3193551301956177, "learning_rate": 4.799638895862085e-05, "loss": 0.6596, "num_input_tokens_seen": 16789200, "step": 28935 }, { "epoch": 4.31039618707179, "grad_norm": 1.2370811700820923, "learning_rate": 4.79951141552434e-05, "loss": 0.6876, "num_input_tokens_seen": 16792080, "step": 28940 }, { "epoch": 4.31114089961275, "grad_norm": 1.3242120742797852, "learning_rate": 4.799383896338686e-05, "loss": 0.6532, "num_input_tokens_seen": 16794928, "step": 28945 }, { "epoch": 4.311885612153708, "grad_norm": 0.8261741399765015, "learning_rate": 4.7992563383072775e-05, "loss": 0.6237, "num_input_tokens_seen": 16797776, "step": 28950 }, { "epoch": 4.312630324694668, "grad_norm": 0.8500916361808777, "learning_rate": 4.799128741432271e-05, "loss": 0.6629, "num_input_tokens_seen": 16800848, "step": 28955 }, { "epoch": 4.313375037235627, "grad_norm": 1.257280707359314, "learning_rate": 4.7990011057158207e-05, "loss": 0.7048, "num_input_tokens_seen": 16803632, "step": 28960 }, { "epoch": 4.314119749776586, "grad_norm": 1.082586407661438, "learning_rate": 4.798873431160084e-05, "loss": 0.7078, "num_input_tokens_seen": 16806224, "step": 28965 }, { "epoch": 4.314864462317545, "grad_norm": 2.1953375339508057, "learning_rate": 4.798745717767216e-05, "loss": 0.7531, "num_input_tokens_seen": 16808912, "step": 28970 }, { "epoch": 4.315609174858505, "grad_norm": 1.0113604068756104, "learning_rate": 4.7986179655393756e-05, "loss": 0.6533, "num_input_tokens_seen": 16811728, "step": 28975 }, { "epoch": 4.316353887399464, "grad_norm": 0.8060128092765808, "learning_rate": 4.798490174478721e-05, "loss": 0.6587, "num_input_tokens_seen": 16815024, "step": 28980 }, { "epoch": 4.317098599940423, "grad_norm": 0.9148125648498535, "learning_rate": 4.7983623445874114e-05, "loss": 0.5534, "num_input_tokens_seen": 16817712, "step": 28985 }, { "epoch": 4.317843312481382, "grad_norm": 0.6433026194572449, "learning_rate": 4.798234475867606e-05, "loss": 0.616, "num_input_tokens_seen": 16820240, "step": 28990 }, { "epoch": 4.318588025022342, "grad_norm": 1.1212965250015259, "learning_rate": 4.7981065683214645e-05, "loss": 0.6696, "num_input_tokens_seen": 16822992, "step": 28995 }, { "epoch": 4.3193327375633, "grad_norm": 0.9148591756820679, "learning_rate": 4.797978621951148e-05, "loss": 0.6426, "num_input_tokens_seen": 16825872, "step": 29000 }, { "epoch": 4.32007745010426, "grad_norm": 1.009052038192749, "learning_rate": 4.797850636758819e-05, "loss": 0.5543, "num_input_tokens_seen": 16828688, "step": 29005 }, { "epoch": 4.320822162645219, "grad_norm": 1.0774621963500977, "learning_rate": 4.7977226127466386e-05, "loss": 0.6293, "num_input_tokens_seen": 16831632, "step": 29010 }, { "epoch": 4.321566875186178, "grad_norm": 1.678845763206482, "learning_rate": 4.7975945499167696e-05, "loss": 0.6278, "num_input_tokens_seen": 16834384, "step": 29015 }, { "epoch": 4.322311587727137, "grad_norm": 0.9489478468894958, "learning_rate": 4.797466448271376e-05, "loss": 0.7336, "num_input_tokens_seen": 16837168, "step": 29020 }, { "epoch": 4.323056300268097, "grad_norm": 1.1135244369506836, "learning_rate": 4.7973383078126223e-05, "loss": 0.5213, "num_input_tokens_seen": 16840432, "step": 29025 }, { "epoch": 4.323801012809056, "grad_norm": 1.2791954278945923, "learning_rate": 4.797210128542673e-05, "loss": 0.6713, "num_input_tokens_seen": 16843088, "step": 29030 }, { "epoch": 4.324545725350015, "grad_norm": 1.2569332122802734, "learning_rate": 4.7970819104636924e-05, "loss": 0.7568, "num_input_tokens_seen": 16846000, "step": 29035 }, { "epoch": 4.325290437890974, "grad_norm": 0.794788122177124, "learning_rate": 4.796953653577848e-05, "loss": 0.6069, "num_input_tokens_seen": 16848752, "step": 29040 }, { "epoch": 4.326035150431934, "grad_norm": 1.1326894760131836, "learning_rate": 4.7968253578873054e-05, "loss": 0.7783, "num_input_tokens_seen": 16851728, "step": 29045 }, { "epoch": 4.326779862972892, "grad_norm": 1.0220359563827515, "learning_rate": 4.796697023394234e-05, "loss": 0.6519, "num_input_tokens_seen": 16854832, "step": 29050 }, { "epoch": 4.327524575513852, "grad_norm": 1.4052603244781494, "learning_rate": 4.7965686501008e-05, "loss": 0.6273, "num_input_tokens_seen": 16857680, "step": 29055 }, { "epoch": 4.328269288054811, "grad_norm": 1.3342355489730835, "learning_rate": 4.7964402380091734e-05, "loss": 0.7903, "num_input_tokens_seen": 16860784, "step": 29060 }, { "epoch": 4.32901400059577, "grad_norm": 0.9203695058822632, "learning_rate": 4.7963117871215224e-05, "loss": 0.6412, "num_input_tokens_seen": 16864080, "step": 29065 }, { "epoch": 4.329758713136729, "grad_norm": 0.7837125658988953, "learning_rate": 4.796183297440018e-05, "loss": 0.5886, "num_input_tokens_seen": 16866960, "step": 29070 }, { "epoch": 4.330503425677689, "grad_norm": 0.6855179071426392, "learning_rate": 4.79605476896683e-05, "loss": 0.6918, "num_input_tokens_seen": 16870128, "step": 29075 }, { "epoch": 4.331248138218648, "grad_norm": 1.2998135089874268, "learning_rate": 4.795926201704131e-05, "loss": 0.7281, "num_input_tokens_seen": 16873072, "step": 29080 }, { "epoch": 4.331992850759606, "grad_norm": 0.7920904159545898, "learning_rate": 4.795797595654091e-05, "loss": 0.6816, "num_input_tokens_seen": 16875984, "step": 29085 }, { "epoch": 4.332737563300566, "grad_norm": 0.8828892707824707, "learning_rate": 4.795668950818885e-05, "loss": 0.6809, "num_input_tokens_seen": 16879056, "step": 29090 }, { "epoch": 4.333482275841525, "grad_norm": 1.134704828262329, "learning_rate": 4.7955402672006854e-05, "loss": 0.6093, "num_input_tokens_seen": 16881776, "step": 29095 }, { "epoch": 4.334226988382484, "grad_norm": 0.9429495930671692, "learning_rate": 4.7954115448016654e-05, "loss": 0.5805, "num_input_tokens_seen": 16884592, "step": 29100 }, { "epoch": 4.334971700923443, "grad_norm": 1.015913963317871, "learning_rate": 4.795282783624001e-05, "loss": 0.6629, "num_input_tokens_seen": 16887344, "step": 29105 }, { "epoch": 4.335716413464403, "grad_norm": 1.7979809045791626, "learning_rate": 4.795153983669867e-05, "loss": 0.7994, "num_input_tokens_seen": 16890576, "step": 29110 }, { "epoch": 4.336461126005362, "grad_norm": 1.00113844871521, "learning_rate": 4.795025144941438e-05, "loss": 0.7145, "num_input_tokens_seen": 16893488, "step": 29115 }, { "epoch": 4.337205838546321, "grad_norm": 0.8252432346343994, "learning_rate": 4.794896267440893e-05, "loss": 0.5679, "num_input_tokens_seen": 16896528, "step": 29120 }, { "epoch": 4.33795055108728, "grad_norm": 0.9623602628707886, "learning_rate": 4.794767351170406e-05, "loss": 0.6404, "num_input_tokens_seen": 16899568, "step": 29125 }, { "epoch": 4.33869526362824, "grad_norm": 1.6126362085342407, "learning_rate": 4.794638396132159e-05, "loss": 0.5467, "num_input_tokens_seen": 16902256, "step": 29130 }, { "epoch": 4.339439976169198, "grad_norm": 0.8821008205413818, "learning_rate": 4.7945094023283275e-05, "loss": 0.7539, "num_input_tokens_seen": 16904976, "step": 29135 }, { "epoch": 4.340184688710158, "grad_norm": 0.812647819519043, "learning_rate": 4.794380369761092e-05, "loss": 0.5856, "num_input_tokens_seen": 16907760, "step": 29140 }, { "epoch": 4.340929401251117, "grad_norm": 1.0602596998214722, "learning_rate": 4.794251298432632e-05, "loss": 0.4711, "num_input_tokens_seen": 16910640, "step": 29145 }, { "epoch": 4.3416741137920765, "grad_norm": 1.5158848762512207, "learning_rate": 4.794122188345128e-05, "loss": 0.6533, "num_input_tokens_seen": 16913296, "step": 29150 }, { "epoch": 4.342418826333035, "grad_norm": 0.8956998586654663, "learning_rate": 4.7939930395007615e-05, "loss": 0.6151, "num_input_tokens_seen": 16916112, "step": 29155 }, { "epoch": 4.343163538873995, "grad_norm": 0.8442599177360535, "learning_rate": 4.7938638519017134e-05, "loss": 0.7952, "num_input_tokens_seen": 16918960, "step": 29160 }, { "epoch": 4.343908251414954, "grad_norm": 0.840085506439209, "learning_rate": 4.793734625550167e-05, "loss": 0.8115, "num_input_tokens_seen": 16922064, "step": 29165 }, { "epoch": 4.344652963955913, "grad_norm": 0.8626365661621094, "learning_rate": 4.7936053604483065e-05, "loss": 0.6727, "num_input_tokens_seen": 16924912, "step": 29170 }, { "epoch": 4.345397676496872, "grad_norm": 1.4391200542449951, "learning_rate": 4.793476056598314e-05, "loss": 0.5942, "num_input_tokens_seen": 16927760, "step": 29175 }, { "epoch": 4.346142389037832, "grad_norm": 0.9884179830551147, "learning_rate": 4.7933467140023736e-05, "loss": 0.5661, "num_input_tokens_seen": 16930352, "step": 29180 }, { "epoch": 4.34688710157879, "grad_norm": 0.6883770823478699, "learning_rate": 4.793217332662672e-05, "loss": 0.6348, "num_input_tokens_seen": 16933040, "step": 29185 }, { "epoch": 4.34763181411975, "grad_norm": 0.6305306553840637, "learning_rate": 4.7930879125813945e-05, "loss": 0.5133, "num_input_tokens_seen": 16935632, "step": 29190 }, { "epoch": 4.348376526660709, "grad_norm": 1.1442409753799438, "learning_rate": 4.792958453760728e-05, "loss": 0.6131, "num_input_tokens_seen": 16938768, "step": 29195 }, { "epoch": 4.3491212392016685, "grad_norm": 1.0817338228225708, "learning_rate": 4.792828956202857e-05, "loss": 0.6611, "num_input_tokens_seen": 16941936, "step": 29200 }, { "epoch": 4.349865951742627, "grad_norm": 0.9345781207084656, "learning_rate": 4.792699419909972e-05, "loss": 0.5245, "num_input_tokens_seen": 16944816, "step": 29205 }, { "epoch": 4.350610664283587, "grad_norm": 1.0734859704971313, "learning_rate": 4.792569844884261e-05, "loss": 0.6241, "num_input_tokens_seen": 16947760, "step": 29210 }, { "epoch": 4.351355376824546, "grad_norm": 0.9895613789558411, "learning_rate": 4.792440231127912e-05, "loss": 0.5639, "num_input_tokens_seen": 16950480, "step": 29215 }, { "epoch": 4.352100089365505, "grad_norm": 2.2069785594940186, "learning_rate": 4.792310578643116e-05, "loss": 0.6574, "num_input_tokens_seen": 16953296, "step": 29220 }, { "epoch": 4.352844801906464, "grad_norm": 1.089903712272644, "learning_rate": 4.7921808874320616e-05, "loss": 0.648, "num_input_tokens_seen": 16956112, "step": 29225 }, { "epoch": 4.353589514447424, "grad_norm": 1.072149634361267, "learning_rate": 4.792051157496941e-05, "loss": 0.7495, "num_input_tokens_seen": 16958992, "step": 29230 }, { "epoch": 4.3543342269883825, "grad_norm": 1.5563985109329224, "learning_rate": 4.791921388839946e-05, "loss": 0.6878, "num_input_tokens_seen": 16961904, "step": 29235 }, { "epoch": 4.355078939529342, "grad_norm": 2.2198245525360107, "learning_rate": 4.791791581463268e-05, "loss": 0.8716, "num_input_tokens_seen": 16964784, "step": 29240 }, { "epoch": 4.355823652070301, "grad_norm": 0.9275757074356079, "learning_rate": 4.791661735369101e-05, "loss": 0.6654, "num_input_tokens_seen": 16967664, "step": 29245 }, { "epoch": 4.35656836461126, "grad_norm": 1.8744722604751587, "learning_rate": 4.791531850559637e-05, "loss": 0.8216, "num_input_tokens_seen": 16970384, "step": 29250 }, { "epoch": 4.357313077152219, "grad_norm": 0.786263644695282, "learning_rate": 4.791401927037073e-05, "loss": 0.6102, "num_input_tokens_seen": 16973328, "step": 29255 }, { "epoch": 4.358057789693178, "grad_norm": 1.297654628753662, "learning_rate": 4.791271964803602e-05, "loss": 0.5825, "num_input_tokens_seen": 16976176, "step": 29260 }, { "epoch": 4.358802502234138, "grad_norm": 1.0526896715164185, "learning_rate": 4.791141963861419e-05, "loss": 0.7021, "num_input_tokens_seen": 16979024, "step": 29265 }, { "epoch": 4.359547214775096, "grad_norm": 1.0240510702133179, "learning_rate": 4.791011924212721e-05, "loss": 0.6493, "num_input_tokens_seen": 16982192, "step": 29270 }, { "epoch": 4.360291927316056, "grad_norm": 1.3332929611206055, "learning_rate": 4.790881845859707e-05, "loss": 0.6627, "num_input_tokens_seen": 16985104, "step": 29275 }, { "epoch": 4.361036639857015, "grad_norm": 2.045309543609619, "learning_rate": 4.790751728804571e-05, "loss": 0.7968, "num_input_tokens_seen": 16988112, "step": 29280 }, { "epoch": 4.3617813523979745, "grad_norm": 2.372645378112793, "learning_rate": 4.790621573049513e-05, "loss": 0.7434, "num_input_tokens_seen": 16990960, "step": 29285 }, { "epoch": 4.362526064938933, "grad_norm": 1.1887271404266357, "learning_rate": 4.790491378596731e-05, "loss": 0.8915, "num_input_tokens_seen": 16993840, "step": 29290 }, { "epoch": 4.363270777479893, "grad_norm": 0.7522968053817749, "learning_rate": 4.7903611454484266e-05, "loss": 0.731, "num_input_tokens_seen": 16996816, "step": 29295 }, { "epoch": 4.364015490020852, "grad_norm": 1.2578083276748657, "learning_rate": 4.790230873606797e-05, "loss": 0.7464, "num_input_tokens_seen": 16999920, "step": 29300 }, { "epoch": 4.364760202561811, "grad_norm": 1.386443853378296, "learning_rate": 4.790100563074045e-05, "loss": 0.614, "num_input_tokens_seen": 17002704, "step": 29305 }, { "epoch": 4.36550491510277, "grad_norm": 1.6978546380996704, "learning_rate": 4.789970213852372e-05, "loss": 0.6385, "num_input_tokens_seen": 17005552, "step": 29310 }, { "epoch": 4.36624962764373, "grad_norm": 0.8969727754592896, "learning_rate": 4.789839825943979e-05, "loss": 0.7661, "num_input_tokens_seen": 17008528, "step": 29315 }, { "epoch": 4.3669943401846885, "grad_norm": 0.9265002012252808, "learning_rate": 4.78970939935107e-05, "loss": 0.5077, "num_input_tokens_seen": 17011216, "step": 29320 }, { "epoch": 4.367739052725648, "grad_norm": 1.1556862592697144, "learning_rate": 4.789578934075847e-05, "loss": 0.6628, "num_input_tokens_seen": 17014192, "step": 29325 }, { "epoch": 4.368483765266607, "grad_norm": 1.1166144609451294, "learning_rate": 4.7894484301205156e-05, "loss": 0.8162, "num_input_tokens_seen": 17017072, "step": 29330 }, { "epoch": 4.3692284778075665, "grad_norm": 1.0963281393051147, "learning_rate": 4.78931788748728e-05, "loss": 0.5849, "num_input_tokens_seen": 17019952, "step": 29335 }, { "epoch": 4.369973190348525, "grad_norm": 0.8228617906570435, "learning_rate": 4.789187306178345e-05, "loss": 0.4227, "num_input_tokens_seen": 17022864, "step": 29340 }, { "epoch": 4.370717902889485, "grad_norm": 0.7075296640396118, "learning_rate": 4.789056686195917e-05, "loss": 0.5656, "num_input_tokens_seen": 17025616, "step": 29345 }, { "epoch": 4.371462615430444, "grad_norm": 1.2781319618225098, "learning_rate": 4.788926027542203e-05, "loss": 0.6023, "num_input_tokens_seen": 17028464, "step": 29350 }, { "epoch": 4.372207327971403, "grad_norm": 0.7963135838508606, "learning_rate": 4.7887953302194106e-05, "loss": 0.6191, "num_input_tokens_seen": 17031504, "step": 29355 }, { "epoch": 4.372952040512362, "grad_norm": 1.0144819021224976, "learning_rate": 4.788664594229747e-05, "loss": 0.5626, "num_input_tokens_seen": 17034384, "step": 29360 }, { "epoch": 4.373696753053322, "grad_norm": 2.1397342681884766, "learning_rate": 4.788533819575421e-05, "loss": 0.5899, "num_input_tokens_seen": 17037296, "step": 29365 }, { "epoch": 4.3744414655942805, "grad_norm": 2.0001628398895264, "learning_rate": 4.7884030062586424e-05, "loss": 0.7023, "num_input_tokens_seen": 17040144, "step": 29370 }, { "epoch": 4.37518617813524, "grad_norm": 0.6851540207862854, "learning_rate": 4.78827215428162e-05, "loss": 0.6331, "num_input_tokens_seen": 17043120, "step": 29375 }, { "epoch": 4.375930890676199, "grad_norm": 1.127927303314209, "learning_rate": 4.7881412636465664e-05, "loss": 0.7087, "num_input_tokens_seen": 17045904, "step": 29380 }, { "epoch": 4.3766756032171585, "grad_norm": 0.8476393222808838, "learning_rate": 4.7880103343556906e-05, "loss": 0.4664, "num_input_tokens_seen": 17048496, "step": 29385 }, { "epoch": 4.377420315758117, "grad_norm": 0.8760827779769897, "learning_rate": 4.787879366411206e-05, "loss": 0.534, "num_input_tokens_seen": 17051600, "step": 29390 }, { "epoch": 4.378165028299077, "grad_norm": 1.0483365058898926, "learning_rate": 4.787748359815326e-05, "loss": 0.7889, "num_input_tokens_seen": 17054448, "step": 29395 }, { "epoch": 4.378909740840036, "grad_norm": 1.0566774606704712, "learning_rate": 4.787617314570261e-05, "loss": 0.7139, "num_input_tokens_seen": 17057424, "step": 29400 }, { "epoch": 4.379654453380995, "grad_norm": 2.021544933319092, "learning_rate": 4.7874862306782276e-05, "loss": 0.7005, "num_input_tokens_seen": 17060336, "step": 29405 }, { "epoch": 4.380399165921954, "grad_norm": 1.6017252206802368, "learning_rate": 4.787355108141439e-05, "loss": 0.6764, "num_input_tokens_seen": 17063216, "step": 29410 }, { "epoch": 4.381143878462913, "grad_norm": 1.1884063482284546, "learning_rate": 4.78722394696211e-05, "loss": 0.5953, "num_input_tokens_seen": 17066320, "step": 29415 }, { "epoch": 4.3818885910038725, "grad_norm": 1.1617501974105835, "learning_rate": 4.787092747142458e-05, "loss": 0.7196, "num_input_tokens_seen": 17069456, "step": 29420 }, { "epoch": 4.382633303544832, "grad_norm": 0.751309335231781, "learning_rate": 4.7869615086846973e-05, "loss": 0.5237, "num_input_tokens_seen": 17072176, "step": 29425 }, { "epoch": 4.383378016085791, "grad_norm": 1.4703041315078735, "learning_rate": 4.786830231591047e-05, "loss": 0.874, "num_input_tokens_seen": 17075056, "step": 29430 }, { "epoch": 4.38412272862675, "grad_norm": 1.0930887460708618, "learning_rate": 4.786698915863724e-05, "loss": 0.7264, "num_input_tokens_seen": 17077712, "step": 29435 }, { "epoch": 4.384867441167709, "grad_norm": 1.0293989181518555, "learning_rate": 4.7865675615049464e-05, "loss": 0.548, "num_input_tokens_seen": 17080688, "step": 29440 }, { "epoch": 4.385612153708668, "grad_norm": 1.5039023160934448, "learning_rate": 4.786436168516935e-05, "loss": 0.6919, "num_input_tokens_seen": 17083792, "step": 29445 }, { "epoch": 4.386356866249628, "grad_norm": 1.0929150581359863, "learning_rate": 4.786304736901908e-05, "loss": 0.8303, "num_input_tokens_seen": 17086800, "step": 29450 }, { "epoch": 4.3871015787905865, "grad_norm": 0.9168564677238464, "learning_rate": 4.7861732666620856e-05, "loss": 0.7217, "num_input_tokens_seen": 17089616, "step": 29455 }, { "epoch": 4.387846291331546, "grad_norm": 1.5646198987960815, "learning_rate": 4.78604175779969e-05, "loss": 0.6346, "num_input_tokens_seen": 17092592, "step": 29460 }, { "epoch": 4.388591003872505, "grad_norm": 0.7374613285064697, "learning_rate": 4.7859102103169415e-05, "loss": 0.6817, "num_input_tokens_seen": 17095440, "step": 29465 }, { "epoch": 4.3893357164134645, "grad_norm": 1.043255090713501, "learning_rate": 4.785778624216064e-05, "loss": 0.509, "num_input_tokens_seen": 17098000, "step": 29470 }, { "epoch": 4.390080428954423, "grad_norm": 1.405071496963501, "learning_rate": 4.7856469994992805e-05, "loss": 0.8146, "num_input_tokens_seen": 17100816, "step": 29475 }, { "epoch": 4.390825141495383, "grad_norm": 0.7703168988227844, "learning_rate": 4.7855153361688124e-05, "loss": 0.7214, "num_input_tokens_seen": 17103600, "step": 29480 }, { "epoch": 4.391569854036342, "grad_norm": 1.0260517597198486, "learning_rate": 4.785383634226887e-05, "loss": 0.5668, "num_input_tokens_seen": 17106416, "step": 29485 }, { "epoch": 4.392314566577301, "grad_norm": 0.9725991487503052, "learning_rate": 4.785251893675727e-05, "loss": 0.8147, "num_input_tokens_seen": 17109648, "step": 29490 }, { "epoch": 4.39305927911826, "grad_norm": 1.1599619388580322, "learning_rate": 4.785120114517559e-05, "loss": 0.6736, "num_input_tokens_seen": 17112464, "step": 29495 }, { "epoch": 4.39380399165922, "grad_norm": 1.3070755004882812, "learning_rate": 4.7849882967546086e-05, "loss": 0.7495, "num_input_tokens_seen": 17115568, "step": 29500 }, { "epoch": 4.3945487042001785, "grad_norm": 1.0092347860336304, "learning_rate": 4.784856440389105e-05, "loss": 0.6475, "num_input_tokens_seen": 17118736, "step": 29505 }, { "epoch": 4.395293416741138, "grad_norm": 1.1447101831436157, "learning_rate": 4.784724545423272e-05, "loss": 0.8416, "num_input_tokens_seen": 17121296, "step": 29510 }, { "epoch": 4.396038129282097, "grad_norm": 2.237793207168579, "learning_rate": 4.7845926118593415e-05, "loss": 0.6966, "num_input_tokens_seen": 17123952, "step": 29515 }, { "epoch": 4.396782841823057, "grad_norm": 1.1926642656326294, "learning_rate": 4.784460639699541e-05, "loss": 0.6177, "num_input_tokens_seen": 17126896, "step": 29520 }, { "epoch": 4.397527554364015, "grad_norm": 0.9356604814529419, "learning_rate": 4.784328628946098e-05, "loss": 0.6486, "num_input_tokens_seen": 17129744, "step": 29525 }, { "epoch": 4.398272266904975, "grad_norm": 1.1061327457427979, "learning_rate": 4.784196579601246e-05, "loss": 0.7419, "num_input_tokens_seen": 17132656, "step": 29530 }, { "epoch": 4.399016979445934, "grad_norm": 1.219487190246582, "learning_rate": 4.784064491667214e-05, "loss": 0.605, "num_input_tokens_seen": 17135408, "step": 29535 }, { "epoch": 4.399761691986893, "grad_norm": 1.0579384565353394, "learning_rate": 4.7839323651462334e-05, "loss": 0.5872, "num_input_tokens_seen": 17138352, "step": 29540 }, { "epoch": 4.400506404527852, "grad_norm": 0.9433521628379822, "learning_rate": 4.783800200040537e-05, "loss": 0.5829, "num_input_tokens_seen": 17141168, "step": 29545 }, { "epoch": 4.401251117068812, "grad_norm": 0.9031031727790833, "learning_rate": 4.783667996352357e-05, "loss": 0.5828, "num_input_tokens_seen": 17144144, "step": 29550 }, { "epoch": 4.4019958296097705, "grad_norm": 1.1100786924362183, "learning_rate": 4.783535754083927e-05, "loss": 0.6592, "num_input_tokens_seen": 17147024, "step": 29555 }, { "epoch": 4.40274054215073, "grad_norm": 0.8341569304466248, "learning_rate": 4.783403473237483e-05, "loss": 0.5293, "num_input_tokens_seen": 17149744, "step": 29560 }, { "epoch": 4.403485254691689, "grad_norm": 0.7371013760566711, "learning_rate": 4.783271153815257e-05, "loss": 0.6598, "num_input_tokens_seen": 17152784, "step": 29565 }, { "epoch": 4.404229967232649, "grad_norm": 0.7760767936706543, "learning_rate": 4.783138795819485e-05, "loss": 0.6358, "num_input_tokens_seen": 17155760, "step": 29570 }, { "epoch": 4.404974679773607, "grad_norm": 0.8876723647117615, "learning_rate": 4.783006399252404e-05, "loss": 0.6296, "num_input_tokens_seen": 17158384, "step": 29575 }, { "epoch": 4.405719392314566, "grad_norm": 1.2214702367782593, "learning_rate": 4.782873964116251e-05, "loss": 0.6785, "num_input_tokens_seen": 17161328, "step": 29580 }, { "epoch": 4.406464104855526, "grad_norm": 1.109753131866455, "learning_rate": 4.782741490413262e-05, "loss": 0.7355, "num_input_tokens_seen": 17163984, "step": 29585 }, { "epoch": 4.407208817396485, "grad_norm": 0.8022556304931641, "learning_rate": 4.782608978145675e-05, "loss": 0.6422, "num_input_tokens_seen": 17166864, "step": 29590 }, { "epoch": 4.407953529937444, "grad_norm": 1.6767126321792603, "learning_rate": 4.7824764273157295e-05, "loss": 0.6761, "num_input_tokens_seen": 17169584, "step": 29595 }, { "epoch": 4.408698242478403, "grad_norm": 1.0344221591949463, "learning_rate": 4.782343837925665e-05, "loss": 0.6302, "num_input_tokens_seen": 17172304, "step": 29600 }, { "epoch": 4.409442955019363, "grad_norm": 1.2444125413894653, "learning_rate": 4.7822112099777205e-05, "loss": 0.6482, "num_input_tokens_seen": 17175408, "step": 29605 }, { "epoch": 4.410187667560321, "grad_norm": 0.9488095641136169, "learning_rate": 4.7820785434741375e-05, "loss": 0.5122, "num_input_tokens_seen": 17178352, "step": 29610 }, { "epoch": 4.410932380101281, "grad_norm": 1.370564579963684, "learning_rate": 4.7819458384171566e-05, "loss": 0.7114, "num_input_tokens_seen": 17181392, "step": 29615 }, { "epoch": 4.41167709264224, "grad_norm": 1.1212581396102905, "learning_rate": 4.78181309480902e-05, "loss": 0.6611, "num_input_tokens_seen": 17184240, "step": 29620 }, { "epoch": 4.412421805183199, "grad_norm": 0.9638068079948425, "learning_rate": 4.781680312651971e-05, "loss": 0.7856, "num_input_tokens_seen": 17187344, "step": 29625 }, { "epoch": 4.413166517724158, "grad_norm": 1.3461213111877441, "learning_rate": 4.781547491948252e-05, "loss": 0.5534, "num_input_tokens_seen": 17190064, "step": 29630 }, { "epoch": 4.413911230265118, "grad_norm": 1.7619967460632324, "learning_rate": 4.7814146327001067e-05, "loss": 0.7027, "num_input_tokens_seen": 17193136, "step": 29635 }, { "epoch": 4.4146559428060765, "grad_norm": 1.050864577293396, "learning_rate": 4.7812817349097796e-05, "loss": 0.73, "num_input_tokens_seen": 17196048, "step": 29640 }, { "epoch": 4.415400655347036, "grad_norm": 1.7381573915481567, "learning_rate": 4.7811487985795164e-05, "loss": 0.7349, "num_input_tokens_seen": 17198992, "step": 29645 }, { "epoch": 4.416145367887995, "grad_norm": 0.9369109272956848, "learning_rate": 4.781015823711563e-05, "loss": 0.7334, "num_input_tokens_seen": 17201872, "step": 29650 }, { "epoch": 4.416890080428955, "grad_norm": 0.8873350024223328, "learning_rate": 4.780882810308165e-05, "loss": 0.6514, "num_input_tokens_seen": 17204720, "step": 29655 }, { "epoch": 4.417634792969913, "grad_norm": 1.4035255908966064, "learning_rate": 4.7807497583715704e-05, "loss": 0.7253, "num_input_tokens_seen": 17207568, "step": 29660 }, { "epoch": 4.418379505510873, "grad_norm": 1.4270366430282593, "learning_rate": 4.780616667904026e-05, "loss": 0.6208, "num_input_tokens_seen": 17210384, "step": 29665 }, { "epoch": 4.419124218051832, "grad_norm": 0.6639850735664368, "learning_rate": 4.7804835389077824e-05, "loss": 0.6618, "num_input_tokens_seen": 17213392, "step": 29670 }, { "epoch": 4.419868930592791, "grad_norm": 0.8607977628707886, "learning_rate": 4.780350371385086e-05, "loss": 0.5325, "num_input_tokens_seen": 17216368, "step": 29675 }, { "epoch": 4.42061364313375, "grad_norm": 0.7518293261528015, "learning_rate": 4.7802171653381885e-05, "loss": 0.593, "num_input_tokens_seen": 17219280, "step": 29680 }, { "epoch": 4.42135835567471, "grad_norm": 1.1919796466827393, "learning_rate": 4.780083920769339e-05, "loss": 0.7077, "num_input_tokens_seen": 17222128, "step": 29685 }, { "epoch": 4.422103068215669, "grad_norm": 0.8088395595550537, "learning_rate": 4.779950637680789e-05, "loss": 0.5472, "num_input_tokens_seen": 17224688, "step": 29690 }, { "epoch": 4.422847780756628, "grad_norm": 1.631648302078247, "learning_rate": 4.7798173160747906e-05, "loss": 0.7141, "num_input_tokens_seen": 17227792, "step": 29695 }, { "epoch": 4.423592493297587, "grad_norm": 0.6887129545211792, "learning_rate": 4.7796839559535955e-05, "loss": 0.7557, "num_input_tokens_seen": 17230672, "step": 29700 }, { "epoch": 4.424337205838547, "grad_norm": 1.5203944444656372, "learning_rate": 4.779550557319457e-05, "loss": 0.6642, "num_input_tokens_seen": 17233584, "step": 29705 }, { "epoch": 4.425081918379505, "grad_norm": 1.0397896766662598, "learning_rate": 4.7794171201746285e-05, "loss": 0.6338, "num_input_tokens_seen": 17236240, "step": 29710 }, { "epoch": 4.425826630920465, "grad_norm": 0.9571608304977417, "learning_rate": 4.779283644521365e-05, "loss": 0.5743, "num_input_tokens_seen": 17239216, "step": 29715 }, { "epoch": 4.426571343461424, "grad_norm": 0.9088314175605774, "learning_rate": 4.7791501303619205e-05, "loss": 0.6083, "num_input_tokens_seen": 17241840, "step": 29720 }, { "epoch": 4.427316056002383, "grad_norm": 0.8996896147727966, "learning_rate": 4.7790165776985504e-05, "loss": 0.6048, "num_input_tokens_seen": 17245136, "step": 29725 }, { "epoch": 4.428060768543342, "grad_norm": 1.2053070068359375, "learning_rate": 4.7788829865335125e-05, "loss": 0.6262, "num_input_tokens_seen": 17247984, "step": 29730 }, { "epoch": 4.428805481084302, "grad_norm": 0.9112732410430908, "learning_rate": 4.778749356869062e-05, "loss": 0.6336, "num_input_tokens_seen": 17251056, "step": 29735 }, { "epoch": 4.429550193625261, "grad_norm": 0.7764618396759033, "learning_rate": 4.778615688707457e-05, "loss": 0.7263, "num_input_tokens_seen": 17254000, "step": 29740 }, { "epoch": 4.43029490616622, "grad_norm": 1.089055061340332, "learning_rate": 4.778481982050956e-05, "loss": 0.5373, "num_input_tokens_seen": 17256944, "step": 29745 }, { "epoch": 4.431039618707179, "grad_norm": 1.1967724561691284, "learning_rate": 4.778348236901818e-05, "loss": 0.543, "num_input_tokens_seen": 17259888, "step": 29750 }, { "epoch": 4.431784331248139, "grad_norm": 1.175166368484497, "learning_rate": 4.7782144532623016e-05, "loss": 0.5881, "num_input_tokens_seen": 17262704, "step": 29755 }, { "epoch": 4.432529043789097, "grad_norm": 1.1550976037979126, "learning_rate": 4.7780806311346684e-05, "loss": 0.558, "num_input_tokens_seen": 17265520, "step": 29760 }, { "epoch": 4.433273756330056, "grad_norm": 0.760300874710083, "learning_rate": 4.777946770521178e-05, "loss": 0.6899, "num_input_tokens_seen": 17268496, "step": 29765 }, { "epoch": 4.434018468871016, "grad_norm": 1.2909691333770752, "learning_rate": 4.7778128714240915e-05, "loss": 0.5515, "num_input_tokens_seen": 17271280, "step": 29770 }, { "epoch": 4.434763181411975, "grad_norm": 1.9153636693954468, "learning_rate": 4.7776789338456717e-05, "loss": 0.6234, "num_input_tokens_seen": 17274256, "step": 29775 }, { "epoch": 4.435507893952934, "grad_norm": 0.9684014916419983, "learning_rate": 4.777544957788182e-05, "loss": 0.6503, "num_input_tokens_seen": 17277360, "step": 29780 }, { "epoch": 4.436252606493893, "grad_norm": 0.5935559272766113, "learning_rate": 4.7774109432538843e-05, "loss": 0.6513, "num_input_tokens_seen": 17280464, "step": 29785 }, { "epoch": 4.436997319034853, "grad_norm": 1.6404168605804443, "learning_rate": 4.777276890245044e-05, "loss": 0.8268, "num_input_tokens_seen": 17282928, "step": 29790 }, { "epoch": 4.437742031575811, "grad_norm": 2.271796941757202, "learning_rate": 4.7771427987639246e-05, "loss": 0.6982, "num_input_tokens_seen": 17285712, "step": 29795 }, { "epoch": 4.438486744116771, "grad_norm": 1.2664663791656494, "learning_rate": 4.777008668812793e-05, "loss": 0.6786, "num_input_tokens_seen": 17288560, "step": 29800 }, { "epoch": 4.43923145665773, "grad_norm": 2.4383177757263184, "learning_rate": 4.776874500393912e-05, "loss": 0.7584, "num_input_tokens_seen": 17291824, "step": 29805 }, { "epoch": 4.439976169198689, "grad_norm": 1.2177493572235107, "learning_rate": 4.7767402935095525e-05, "loss": 0.522, "num_input_tokens_seen": 17294832, "step": 29810 }, { "epoch": 4.440720881739648, "grad_norm": 3.0310285091400146, "learning_rate": 4.776606048161979e-05, "loss": 0.8739, "num_input_tokens_seen": 17297968, "step": 29815 }, { "epoch": 4.441465594280608, "grad_norm": 1.0722205638885498, "learning_rate": 4.77647176435346e-05, "loss": 0.73, "num_input_tokens_seen": 17300912, "step": 29820 }, { "epoch": 4.442210306821567, "grad_norm": 1.0102424621582031, "learning_rate": 4.7763374420862645e-05, "loss": 0.6835, "num_input_tokens_seen": 17303760, "step": 29825 }, { "epoch": 4.442955019362526, "grad_norm": 0.7836819887161255, "learning_rate": 4.7762030813626615e-05, "loss": 0.8166, "num_input_tokens_seen": 17306704, "step": 29830 }, { "epoch": 4.443699731903485, "grad_norm": 1.590430498123169, "learning_rate": 4.776068682184921e-05, "loss": 0.5986, "num_input_tokens_seen": 17309680, "step": 29835 }, { "epoch": 4.444444444444445, "grad_norm": 0.9011185765266418, "learning_rate": 4.7759342445553124e-05, "loss": 0.7794, "num_input_tokens_seen": 17312432, "step": 29840 }, { "epoch": 4.445189156985403, "grad_norm": 0.7097480297088623, "learning_rate": 4.775799768476109e-05, "loss": 0.613, "num_input_tokens_seen": 17315248, "step": 29845 }, { "epoch": 4.445933869526363, "grad_norm": 0.9142248630523682, "learning_rate": 4.775665253949581e-05, "loss": 0.7135, "num_input_tokens_seen": 17318000, "step": 29850 }, { "epoch": 4.446678582067322, "grad_norm": 1.7494860887527466, "learning_rate": 4.775530700978002e-05, "loss": 0.5067, "num_input_tokens_seen": 17320688, "step": 29855 }, { "epoch": 4.4474232946082815, "grad_norm": 1.3468682765960693, "learning_rate": 4.775396109563644e-05, "loss": 0.7505, "num_input_tokens_seen": 17323312, "step": 29860 }, { "epoch": 4.44816800714924, "grad_norm": 1.1150223016738892, "learning_rate": 4.775261479708781e-05, "loss": 0.5511, "num_input_tokens_seen": 17326320, "step": 29865 }, { "epoch": 4.4489127196902, "grad_norm": 1.0796847343444824, "learning_rate": 4.775126811415689e-05, "loss": 0.733, "num_input_tokens_seen": 17329328, "step": 29870 }, { "epoch": 4.449657432231159, "grad_norm": 1.3268816471099854, "learning_rate": 4.7749921046866407e-05, "loss": 0.6694, "num_input_tokens_seen": 17332368, "step": 29875 }, { "epoch": 4.450402144772118, "grad_norm": 1.0508073568344116, "learning_rate": 4.7748573595239134e-05, "loss": 0.6906, "num_input_tokens_seen": 17335184, "step": 29880 }, { "epoch": 4.451146857313077, "grad_norm": 1.0938302278518677, "learning_rate": 4.7747225759297835e-05, "loss": 0.5862, "num_input_tokens_seen": 17337872, "step": 29885 }, { "epoch": 4.451891569854037, "grad_norm": 0.6288942694664001, "learning_rate": 4.774587753906526e-05, "loss": 0.5245, "num_input_tokens_seen": 17340848, "step": 29890 }, { "epoch": 4.452636282394995, "grad_norm": 1.1233553886413574, "learning_rate": 4.774452893456423e-05, "loss": 0.6855, "num_input_tokens_seen": 17343984, "step": 29895 }, { "epoch": 4.453380994935955, "grad_norm": 1.23386812210083, "learning_rate": 4.774317994581748e-05, "loss": 0.6603, "num_input_tokens_seen": 17347152, "step": 29900 }, { "epoch": 4.454125707476914, "grad_norm": 1.223266839981079, "learning_rate": 4.7741830572847826e-05, "loss": 0.7828, "num_input_tokens_seen": 17350224, "step": 29905 }, { "epoch": 4.4548704200178735, "grad_norm": 1.3970588445663452, "learning_rate": 4.774048081567805e-05, "loss": 0.7206, "num_input_tokens_seen": 17353008, "step": 29910 }, { "epoch": 4.455615132558832, "grad_norm": 2.7442760467529297, "learning_rate": 4.7739130674330966e-05, "loss": 0.6257, "num_input_tokens_seen": 17356240, "step": 29915 }, { "epoch": 4.456359845099792, "grad_norm": 1.438281774520874, "learning_rate": 4.773778014882939e-05, "loss": 0.5788, "num_input_tokens_seen": 17359248, "step": 29920 }, { "epoch": 4.457104557640751, "grad_norm": 0.8599122166633606, "learning_rate": 4.773642923919612e-05, "loss": 0.5635, "num_input_tokens_seen": 17362384, "step": 29925 }, { "epoch": 4.457849270181709, "grad_norm": 1.4985402822494507, "learning_rate": 4.773507794545399e-05, "loss": 0.5638, "num_input_tokens_seen": 17365456, "step": 29930 }, { "epoch": 4.458593982722669, "grad_norm": 1.0856996774673462, "learning_rate": 4.7733726267625824e-05, "loss": 0.7749, "num_input_tokens_seen": 17368368, "step": 29935 }, { "epoch": 4.459338695263629, "grad_norm": 0.7845546007156372, "learning_rate": 4.7732374205734456e-05, "loss": 0.5077, "num_input_tokens_seen": 17371312, "step": 29940 }, { "epoch": 4.4600834078045875, "grad_norm": 0.9943300485610962, "learning_rate": 4.773102175980273e-05, "loss": 0.5794, "num_input_tokens_seen": 17374000, "step": 29945 }, { "epoch": 4.460828120345546, "grad_norm": 1.7496851682662964, "learning_rate": 4.772966892985349e-05, "loss": 0.7787, "num_input_tokens_seen": 17376976, "step": 29950 }, { "epoch": 4.461572832886506, "grad_norm": 1.209057331085205, "learning_rate": 4.77283157159096e-05, "loss": 0.6469, "num_input_tokens_seen": 17380112, "step": 29955 }, { "epoch": 4.462317545427465, "grad_norm": 0.8713886141777039, "learning_rate": 4.772696211799392e-05, "loss": 0.5801, "num_input_tokens_seen": 17383120, "step": 29960 }, { "epoch": 4.463062257968424, "grad_norm": 0.8674046993255615, "learning_rate": 4.7725608136129305e-05, "loss": 0.7559, "num_input_tokens_seen": 17386064, "step": 29965 }, { "epoch": 4.463806970509383, "grad_norm": 1.2705349922180176, "learning_rate": 4.7724253770338645e-05, "loss": 0.6256, "num_input_tokens_seen": 17388560, "step": 29970 }, { "epoch": 4.464551683050343, "grad_norm": 0.9495323300361633, "learning_rate": 4.772289902064481e-05, "loss": 0.7809, "num_input_tokens_seen": 17391504, "step": 29975 }, { "epoch": 4.465296395591301, "grad_norm": 1.53224778175354, "learning_rate": 4.772154388707069e-05, "loss": 0.615, "num_input_tokens_seen": 17394288, "step": 29980 }, { "epoch": 4.466041108132261, "grad_norm": 1.1137362718582153, "learning_rate": 4.7720188369639186e-05, "loss": 0.6641, "num_input_tokens_seen": 17397360, "step": 29985 }, { "epoch": 4.46678582067322, "grad_norm": 2.292325735092163, "learning_rate": 4.771883246837318e-05, "loss": 0.712, "num_input_tokens_seen": 17400432, "step": 29990 }, { "epoch": 4.4675305332141795, "grad_norm": 1.1309362649917603, "learning_rate": 4.77174761832956e-05, "loss": 0.6662, "num_input_tokens_seen": 17403408, "step": 29995 }, { "epoch": 4.468275245755138, "grad_norm": 1.149705410003662, "learning_rate": 4.771611951442935e-05, "loss": 0.5881, "num_input_tokens_seen": 17406608, "step": 30000 }, { "epoch": 4.469019958296098, "grad_norm": 1.3361916542053223, "learning_rate": 4.771476246179734e-05, "loss": 0.7092, "num_input_tokens_seen": 17409360, "step": 30005 }, { "epoch": 4.469764670837057, "grad_norm": 1.0342133045196533, "learning_rate": 4.7713405025422505e-05, "loss": 0.5745, "num_input_tokens_seen": 17412464, "step": 30010 }, { "epoch": 4.470509383378016, "grad_norm": 0.8956518173217773, "learning_rate": 4.771204720532778e-05, "loss": 0.6914, "num_input_tokens_seen": 17415344, "step": 30015 }, { "epoch": 4.471254095918975, "grad_norm": 1.2580673694610596, "learning_rate": 4.7710689001536105e-05, "loss": 0.5528, "num_input_tokens_seen": 17418224, "step": 30020 }, { "epoch": 4.471998808459935, "grad_norm": 1.2930625677108765, "learning_rate": 4.7709330414070406e-05, "loss": 0.5297, "num_input_tokens_seen": 17421232, "step": 30025 }, { "epoch": 4.4727435210008935, "grad_norm": 0.9277179837226868, "learning_rate": 4.770797144295366e-05, "loss": 0.7581, "num_input_tokens_seen": 17423920, "step": 30030 }, { "epoch": 4.473488233541853, "grad_norm": 0.9632557034492493, "learning_rate": 4.7706612088208826e-05, "loss": 0.6773, "num_input_tokens_seen": 17426736, "step": 30035 }, { "epoch": 4.474232946082812, "grad_norm": 0.8332674503326416, "learning_rate": 4.770525234985884e-05, "loss": 0.7107, "num_input_tokens_seen": 17429712, "step": 30040 }, { "epoch": 4.4749776586237715, "grad_norm": 1.437522530555725, "learning_rate": 4.770389222792671e-05, "loss": 0.773, "num_input_tokens_seen": 17432656, "step": 30045 }, { "epoch": 4.47572237116473, "grad_norm": 1.0724953413009644, "learning_rate": 4.770253172243538e-05, "loss": 0.7853, "num_input_tokens_seen": 17435696, "step": 30050 }, { "epoch": 4.47646708370569, "grad_norm": 0.9948124885559082, "learning_rate": 4.770117083340786e-05, "loss": 0.6235, "num_input_tokens_seen": 17438768, "step": 30055 }, { "epoch": 4.477211796246649, "grad_norm": 1.6332288980484009, "learning_rate": 4.769980956086714e-05, "loss": 0.6404, "num_input_tokens_seen": 17441648, "step": 30060 }, { "epoch": 4.477956508787608, "grad_norm": 0.679021418094635, "learning_rate": 4.769844790483619e-05, "loss": 0.6196, "num_input_tokens_seen": 17444464, "step": 30065 }, { "epoch": 4.478701221328567, "grad_norm": 1.7067129611968994, "learning_rate": 4.769708586533804e-05, "loss": 0.6527, "num_input_tokens_seen": 17447184, "step": 30070 }, { "epoch": 4.479445933869527, "grad_norm": 1.3235191106796265, "learning_rate": 4.7695723442395694e-05, "loss": 0.6391, "num_input_tokens_seen": 17450160, "step": 30075 }, { "epoch": 4.4801906464104855, "grad_norm": 0.875815749168396, "learning_rate": 4.769436063603217e-05, "loss": 0.6668, "num_input_tokens_seen": 17453360, "step": 30080 }, { "epoch": 4.480935358951445, "grad_norm": 0.7541200518608093, "learning_rate": 4.769299744627048e-05, "loss": 0.6918, "num_input_tokens_seen": 17456144, "step": 30085 }, { "epoch": 4.481680071492404, "grad_norm": 2.8481037616729736, "learning_rate": 4.769163387313367e-05, "loss": 0.702, "num_input_tokens_seen": 17459152, "step": 30090 }, { "epoch": 4.4824247840333635, "grad_norm": 0.7826330065727234, "learning_rate": 4.7690269916644766e-05, "loss": 0.663, "num_input_tokens_seen": 17462032, "step": 30095 }, { "epoch": 4.483169496574322, "grad_norm": 1.660447359085083, "learning_rate": 4.768890557682681e-05, "loss": 0.7373, "num_input_tokens_seen": 17464848, "step": 30100 }, { "epoch": 4.483914209115282, "grad_norm": 1.0529814958572388, "learning_rate": 4.768754085370286e-05, "loss": 0.7555, "num_input_tokens_seen": 17467696, "step": 30105 }, { "epoch": 4.484658921656241, "grad_norm": 0.7567616105079651, "learning_rate": 4.768617574729596e-05, "loss": 0.7173, "num_input_tokens_seen": 17470480, "step": 30110 }, { "epoch": 4.4854036341971995, "grad_norm": 2.099153518676758, "learning_rate": 4.768481025762918e-05, "loss": 0.7343, "num_input_tokens_seen": 17473232, "step": 30115 }, { "epoch": 4.486148346738159, "grad_norm": 0.6744366884231567, "learning_rate": 4.768344438472559e-05, "loss": 0.6497, "num_input_tokens_seen": 17475984, "step": 30120 }, { "epoch": 4.486893059279118, "grad_norm": 1.1617733240127563, "learning_rate": 4.768207812860826e-05, "loss": 0.6029, "num_input_tokens_seen": 17479056, "step": 30125 }, { "epoch": 4.4876377718200775, "grad_norm": 0.7783321142196655, "learning_rate": 4.768071148930027e-05, "loss": 0.6723, "num_input_tokens_seen": 17482032, "step": 30130 }, { "epoch": 4.488382484361036, "grad_norm": 0.8233795762062073, "learning_rate": 4.7679344466824716e-05, "loss": 0.5208, "num_input_tokens_seen": 17484880, "step": 30135 }, { "epoch": 4.489127196901996, "grad_norm": 1.0784562826156616, "learning_rate": 4.767797706120468e-05, "loss": 0.4871, "num_input_tokens_seen": 17487664, "step": 30140 }, { "epoch": 4.489871909442955, "grad_norm": 0.867205023765564, "learning_rate": 4.767660927246328e-05, "loss": 0.7382, "num_input_tokens_seen": 17490576, "step": 30145 }, { "epoch": 4.490616621983914, "grad_norm": 0.7974668741226196, "learning_rate": 4.7675241100623604e-05, "loss": 0.6204, "num_input_tokens_seen": 17493488, "step": 30150 }, { "epoch": 4.491361334524873, "grad_norm": 0.5753485560417175, "learning_rate": 4.7673872545708784e-05, "loss": 0.5902, "num_input_tokens_seen": 17496240, "step": 30155 }, { "epoch": 4.492106047065833, "grad_norm": 1.1966711282730103, "learning_rate": 4.767250360774193e-05, "loss": 0.5916, "num_input_tokens_seen": 17498800, "step": 30160 }, { "epoch": 4.4928507596067915, "grad_norm": 0.9148069620132446, "learning_rate": 4.767113428674616e-05, "loss": 0.4977, "num_input_tokens_seen": 17501456, "step": 30165 }, { "epoch": 4.493595472147751, "grad_norm": 1.0015134811401367, "learning_rate": 4.766976458274464e-05, "loss": 0.6947, "num_input_tokens_seen": 17504368, "step": 30170 }, { "epoch": 4.49434018468871, "grad_norm": 0.9887891411781311, "learning_rate": 4.766839449576047e-05, "loss": 0.5316, "num_input_tokens_seen": 17507536, "step": 30175 }, { "epoch": 4.4950848972296695, "grad_norm": 1.4771153926849365, "learning_rate": 4.766702402581682e-05, "loss": 0.8053, "num_input_tokens_seen": 17510768, "step": 30180 }, { "epoch": 4.495829609770628, "grad_norm": 1.576213002204895, "learning_rate": 4.766565317293683e-05, "loss": 0.6635, "num_input_tokens_seen": 17513744, "step": 30185 }, { "epoch": 4.496574322311588, "grad_norm": 0.9277065992355347, "learning_rate": 4.766428193714367e-05, "loss": 0.7106, "num_input_tokens_seen": 17516976, "step": 30190 }, { "epoch": 4.497319034852547, "grad_norm": 1.1045753955841064, "learning_rate": 4.766291031846051e-05, "loss": 0.6562, "num_input_tokens_seen": 17520112, "step": 30195 }, { "epoch": 4.498063747393506, "grad_norm": 1.2139939069747925, "learning_rate": 4.76615383169105e-05, "loss": 0.871, "num_input_tokens_seen": 17522832, "step": 30200 }, { "epoch": 4.498808459934465, "grad_norm": 0.9852260947227478, "learning_rate": 4.766016593251684e-05, "loss": 0.7272, "num_input_tokens_seen": 17525744, "step": 30205 }, { "epoch": 4.499553172475425, "grad_norm": 1.1423869132995605, "learning_rate": 4.765879316530272e-05, "loss": 0.6476, "num_input_tokens_seen": 17528464, "step": 30210 }, { "epoch": 4.5002978850163835, "grad_norm": 0.8534265160560608, "learning_rate": 4.76574200152913e-05, "loss": 0.5656, "num_input_tokens_seen": 17531408, "step": 30215 }, { "epoch": 4.501042597557343, "grad_norm": 1.356245756149292, "learning_rate": 4.76560464825058e-05, "loss": 0.6818, "num_input_tokens_seen": 17534224, "step": 30220 }, { "epoch": 4.501787310098302, "grad_norm": 0.8313052654266357, "learning_rate": 4.7654672566969424e-05, "loss": 0.6117, "num_input_tokens_seen": 17537200, "step": 30225 }, { "epoch": 4.5025320226392616, "grad_norm": 1.3130125999450684, "learning_rate": 4.765329826870538e-05, "loss": 0.7362, "num_input_tokens_seen": 17540272, "step": 30230 }, { "epoch": 4.50327673518022, "grad_norm": 1.1368759870529175, "learning_rate": 4.765192358773689e-05, "loss": 0.6159, "num_input_tokens_seen": 17543280, "step": 30235 }, { "epoch": 4.50402144772118, "grad_norm": 1.5153716802597046, "learning_rate": 4.765054852408717e-05, "loss": 0.5692, "num_input_tokens_seen": 17546288, "step": 30240 }, { "epoch": 4.504766160262139, "grad_norm": 1.2113133668899536, "learning_rate": 4.7649173077779455e-05, "loss": 0.6125, "num_input_tokens_seen": 17549520, "step": 30245 }, { "epoch": 4.505510872803098, "grad_norm": 0.81610107421875, "learning_rate": 4.7647797248836975e-05, "loss": 0.6475, "num_input_tokens_seen": 17552368, "step": 30250 }, { "epoch": 4.506255585344057, "grad_norm": 1.0763790607452393, "learning_rate": 4.7646421037282984e-05, "loss": 0.5413, "num_input_tokens_seen": 17555312, "step": 30255 }, { "epoch": 4.507000297885017, "grad_norm": 1.153236746788025, "learning_rate": 4.764504444314072e-05, "loss": 0.608, "num_input_tokens_seen": 17558416, "step": 30260 }, { "epoch": 4.5077450104259755, "grad_norm": 1.1980643272399902, "learning_rate": 4.7643667466433453e-05, "loss": 0.6911, "num_input_tokens_seen": 17561328, "step": 30265 }, { "epoch": 4.508489722966935, "grad_norm": 1.2533739805221558, "learning_rate": 4.7642290107184426e-05, "loss": 0.7398, "num_input_tokens_seen": 17564432, "step": 30270 }, { "epoch": 4.509234435507894, "grad_norm": 1.3789973258972168, "learning_rate": 4.764091236541693e-05, "loss": 0.5551, "num_input_tokens_seen": 17567376, "step": 30275 }, { "epoch": 4.509979148048853, "grad_norm": 0.7690272331237793, "learning_rate": 4.763953424115424e-05, "loss": 0.5657, "num_input_tokens_seen": 17570288, "step": 30280 }, { "epoch": 4.510723860589812, "grad_norm": 1.44523286819458, "learning_rate": 4.7638155734419616e-05, "loss": 0.7733, "num_input_tokens_seen": 17573456, "step": 30285 }, { "epoch": 4.511468573130772, "grad_norm": 2.914987564086914, "learning_rate": 4.763677684523636e-05, "loss": 0.8053, "num_input_tokens_seen": 17576176, "step": 30290 }, { "epoch": 4.512213285671731, "grad_norm": 0.8562756180763245, "learning_rate": 4.7635397573627774e-05, "loss": 0.5996, "num_input_tokens_seen": 17578800, "step": 30295 }, { "epoch": 4.5129579982126895, "grad_norm": 2.023683547973633, "learning_rate": 4.7634017919617143e-05, "loss": 0.6409, "num_input_tokens_seen": 17581712, "step": 30300 }, { "epoch": 4.513702710753649, "grad_norm": 0.7927324771881104, "learning_rate": 4.763263788322778e-05, "loss": 0.6104, "num_input_tokens_seen": 17584688, "step": 30305 }, { "epoch": 4.514447423294608, "grad_norm": 0.9922342896461487, "learning_rate": 4.7631257464483014e-05, "loss": 0.6442, "num_input_tokens_seen": 17587504, "step": 30310 }, { "epoch": 4.5151921358355676, "grad_norm": 0.9121677279472351, "learning_rate": 4.762987666340615e-05, "loss": 0.5628, "num_input_tokens_seen": 17590128, "step": 30315 }, { "epoch": 4.515936848376526, "grad_norm": 1.170873999595642, "learning_rate": 4.7628495480020516e-05, "loss": 0.6758, "num_input_tokens_seen": 17592976, "step": 30320 }, { "epoch": 4.516681560917486, "grad_norm": 1.1421642303466797, "learning_rate": 4.762711391434945e-05, "loss": 0.7562, "num_input_tokens_seen": 17595984, "step": 30325 }, { "epoch": 4.517426273458445, "grad_norm": 0.9592097401618958, "learning_rate": 4.76257319664163e-05, "loss": 0.8481, "num_input_tokens_seen": 17598832, "step": 30330 }, { "epoch": 4.518170985999404, "grad_norm": 0.9679399132728577, "learning_rate": 4.76243496362444e-05, "loss": 0.772, "num_input_tokens_seen": 17601552, "step": 30335 }, { "epoch": 4.518915698540363, "grad_norm": 1.5497227907180786, "learning_rate": 4.76229669238571e-05, "loss": 0.6727, "num_input_tokens_seen": 17604592, "step": 30340 }, { "epoch": 4.519660411081323, "grad_norm": 1.0237066745758057, "learning_rate": 4.762158382927777e-05, "loss": 0.8023, "num_input_tokens_seen": 17607536, "step": 30345 }, { "epoch": 4.5204051236222815, "grad_norm": 1.4750932455062866, "learning_rate": 4.762020035252978e-05, "loss": 0.835, "num_input_tokens_seen": 17610672, "step": 30350 }, { "epoch": 4.521149836163241, "grad_norm": 2.0655858516693115, "learning_rate": 4.761881649363649e-05, "loss": 0.6435, "num_input_tokens_seen": 17613520, "step": 30355 }, { "epoch": 4.5218945487042, "grad_norm": 1.3299256563186646, "learning_rate": 4.7617432252621285e-05, "loss": 0.6188, "num_input_tokens_seen": 17616624, "step": 30360 }, { "epoch": 4.52263926124516, "grad_norm": 1.7227038145065308, "learning_rate": 4.7616047629507556e-05, "loss": 0.7843, "num_input_tokens_seen": 17619792, "step": 30365 }, { "epoch": 4.523383973786118, "grad_norm": 0.9256711006164551, "learning_rate": 4.761466262431867e-05, "loss": 0.607, "num_input_tokens_seen": 17622992, "step": 30370 }, { "epoch": 4.524128686327078, "grad_norm": 1.462611198425293, "learning_rate": 4.7613277237078055e-05, "loss": 0.7683, "num_input_tokens_seen": 17625840, "step": 30375 }, { "epoch": 4.524873398868037, "grad_norm": 1.231173038482666, "learning_rate": 4.761189146780911e-05, "loss": 0.5609, "num_input_tokens_seen": 17628496, "step": 30380 }, { "epoch": 4.525618111408996, "grad_norm": 1.1563680171966553, "learning_rate": 4.761050531653524e-05, "loss": 0.5451, "num_input_tokens_seen": 17631248, "step": 30385 }, { "epoch": 4.526362823949955, "grad_norm": 1.164494276046753, "learning_rate": 4.760911878327985e-05, "loss": 0.5514, "num_input_tokens_seen": 17634032, "step": 30390 }, { "epoch": 4.527107536490915, "grad_norm": 1.3099411725997925, "learning_rate": 4.760773186806639e-05, "loss": 0.5703, "num_input_tokens_seen": 17637328, "step": 30395 }, { "epoch": 4.5278522490318736, "grad_norm": 0.47977033257484436, "learning_rate": 4.7606344570918264e-05, "loss": 0.666, "num_input_tokens_seen": 17640432, "step": 30400 }, { "epoch": 4.528596961572833, "grad_norm": 0.7590100169181824, "learning_rate": 4.760495689185893e-05, "loss": 0.6865, "num_input_tokens_seen": 17643152, "step": 30405 }, { "epoch": 4.529341674113792, "grad_norm": 0.8467578291893005, "learning_rate": 4.760356883091183e-05, "loss": 0.7507, "num_input_tokens_seen": 17646000, "step": 30410 }, { "epoch": 4.530086386654752, "grad_norm": 0.9607017040252686, "learning_rate": 4.7602180388100395e-05, "loss": 0.5943, "num_input_tokens_seen": 17648880, "step": 30415 }, { "epoch": 4.53083109919571, "grad_norm": 0.9724254012107849, "learning_rate": 4.760079156344811e-05, "loss": 0.6318, "num_input_tokens_seen": 17651632, "step": 30420 }, { "epoch": 4.53157581173667, "grad_norm": 1.2025542259216309, "learning_rate": 4.7599402356978406e-05, "loss": 0.7174, "num_input_tokens_seen": 17654448, "step": 30425 }, { "epoch": 4.532320524277629, "grad_norm": 0.8831389546394348, "learning_rate": 4.759801276871478e-05, "loss": 0.6863, "num_input_tokens_seen": 17657680, "step": 30430 }, { "epoch": 4.533065236818588, "grad_norm": 1.5718644857406616, "learning_rate": 4.759662279868069e-05, "loss": 0.6984, "num_input_tokens_seen": 17660528, "step": 30435 }, { "epoch": 4.533809949359547, "grad_norm": 0.7661899328231812, "learning_rate": 4.759523244689963e-05, "loss": 0.6353, "num_input_tokens_seen": 17663600, "step": 30440 }, { "epoch": 4.534554661900506, "grad_norm": 1.2751384973526, "learning_rate": 4.759384171339507e-05, "loss": 0.6644, "num_input_tokens_seen": 17666480, "step": 30445 }, { "epoch": 4.535299374441466, "grad_norm": 1.3964391946792603, "learning_rate": 4.759245059819053e-05, "loss": 0.7303, "num_input_tokens_seen": 17669520, "step": 30450 }, { "epoch": 4.536044086982425, "grad_norm": 1.1900084018707275, "learning_rate": 4.759105910130949e-05, "loss": 0.4723, "num_input_tokens_seen": 17672208, "step": 30455 }, { "epoch": 4.536788799523384, "grad_norm": 0.8157675862312317, "learning_rate": 4.758966722277547e-05, "loss": 0.7353, "num_input_tokens_seen": 17675152, "step": 30460 }, { "epoch": 4.537533512064343, "grad_norm": 0.7389463186264038, "learning_rate": 4.758827496261199e-05, "loss": 0.6314, "num_input_tokens_seen": 17678448, "step": 30465 }, { "epoch": 4.538278224605302, "grad_norm": 0.790926456451416, "learning_rate": 4.758688232084255e-05, "loss": 0.7645, "num_input_tokens_seen": 17681488, "step": 30470 }, { "epoch": 4.539022937146262, "grad_norm": 1.0426290035247803, "learning_rate": 4.7585489297490694e-05, "loss": 0.6534, "num_input_tokens_seen": 17684432, "step": 30475 }, { "epoch": 4.539767649687221, "grad_norm": 0.7054222226142883, "learning_rate": 4.758409589257995e-05, "loss": 0.7648, "num_input_tokens_seen": 17687280, "step": 30480 }, { "epoch": 4.5405123622281796, "grad_norm": 1.4749411344528198, "learning_rate": 4.758270210613387e-05, "loss": 0.7626, "num_input_tokens_seen": 17690160, "step": 30485 }, { "epoch": 4.541257074769139, "grad_norm": 0.6561206579208374, "learning_rate": 4.758130793817598e-05, "loss": 0.5545, "num_input_tokens_seen": 17693008, "step": 30490 }, { "epoch": 4.542001787310098, "grad_norm": 0.958332359790802, "learning_rate": 4.7579913388729844e-05, "loss": 0.7419, "num_input_tokens_seen": 17695952, "step": 30495 }, { "epoch": 4.542746499851058, "grad_norm": 0.8236675262451172, "learning_rate": 4.757851845781902e-05, "loss": 0.6053, "num_input_tokens_seen": 17698736, "step": 30500 }, { "epoch": 4.543491212392016, "grad_norm": 1.3170279264450073, "learning_rate": 4.757712314546707e-05, "loss": 0.5702, "num_input_tokens_seen": 17701552, "step": 30505 }, { "epoch": 4.544235924932976, "grad_norm": 1.6185017824172974, "learning_rate": 4.7575727451697585e-05, "loss": 0.5499, "num_input_tokens_seen": 17704528, "step": 30510 }, { "epoch": 4.544980637473935, "grad_norm": 0.8476671576499939, "learning_rate": 4.757433137653411e-05, "loss": 0.4644, "num_input_tokens_seen": 17707408, "step": 30515 }, { "epoch": 4.545725350014894, "grad_norm": 1.352988362312317, "learning_rate": 4.757293492000027e-05, "loss": 0.7806, "num_input_tokens_seen": 17710128, "step": 30520 }, { "epoch": 4.546470062555853, "grad_norm": 1.8886899948120117, "learning_rate": 4.757153808211962e-05, "loss": 0.7396, "num_input_tokens_seen": 17712976, "step": 30525 }, { "epoch": 4.547214775096813, "grad_norm": 1.5077053308486938, "learning_rate": 4.757014086291579e-05, "loss": 0.7199, "num_input_tokens_seen": 17715984, "step": 30530 }, { "epoch": 4.547959487637772, "grad_norm": 0.9643524885177612, "learning_rate": 4.7568743262412354e-05, "loss": 0.8004, "num_input_tokens_seen": 17718928, "step": 30535 }, { "epoch": 4.548704200178731, "grad_norm": 0.9912697672843933, "learning_rate": 4.756734528063295e-05, "loss": 0.6302, "num_input_tokens_seen": 17721680, "step": 30540 }, { "epoch": 4.54944891271969, "grad_norm": 0.9948949217796326, "learning_rate": 4.756594691760118e-05, "loss": 0.6275, "num_input_tokens_seen": 17724400, "step": 30545 }, { "epoch": 4.55019362526065, "grad_norm": 0.6747484803199768, "learning_rate": 4.7564548173340664e-05, "loss": 0.5946, "num_input_tokens_seen": 17727280, "step": 30550 }, { "epoch": 4.550938337801608, "grad_norm": 1.501271367073059, "learning_rate": 4.7563149047875054e-05, "loss": 0.7106, "num_input_tokens_seen": 17730448, "step": 30555 }, { "epoch": 4.551683050342568, "grad_norm": 0.7032579779624939, "learning_rate": 4.756174954122796e-05, "loss": 0.6798, "num_input_tokens_seen": 17733328, "step": 30560 }, { "epoch": 4.552427762883527, "grad_norm": 0.8488497734069824, "learning_rate": 4.7560349653423055e-05, "loss": 0.7207, "num_input_tokens_seen": 17736144, "step": 30565 }, { "epoch": 4.553172475424486, "grad_norm": 1.3954896926879883, "learning_rate": 4.755894938448395e-05, "loss": 0.7677, "num_input_tokens_seen": 17739152, "step": 30570 }, { "epoch": 4.553917187965445, "grad_norm": 1.5411264896392822, "learning_rate": 4.755754873443434e-05, "loss": 0.6562, "num_input_tokens_seen": 17742128, "step": 30575 }, { "epoch": 4.554661900506405, "grad_norm": 1.0202596187591553, "learning_rate": 4.7556147703297865e-05, "loss": 0.6473, "num_input_tokens_seen": 17744976, "step": 30580 }, { "epoch": 4.555406613047364, "grad_norm": 1.1709212064743042, "learning_rate": 4.75547462910982e-05, "loss": 0.6567, "num_input_tokens_seen": 17747984, "step": 30585 }, { "epoch": 4.556151325588323, "grad_norm": 1.1078476905822754, "learning_rate": 4.755334449785902e-05, "loss": 0.8153, "num_input_tokens_seen": 17750992, "step": 30590 }, { "epoch": 4.556896038129282, "grad_norm": 1.4462170600891113, "learning_rate": 4.755194232360401e-05, "loss": 0.6428, "num_input_tokens_seen": 17753968, "step": 30595 }, { "epoch": 4.557640750670242, "grad_norm": 0.7834644913673401, "learning_rate": 4.755053976835685e-05, "loss": 0.6013, "num_input_tokens_seen": 17756784, "step": 30600 }, { "epoch": 4.5583854632112, "grad_norm": 1.173498272895813, "learning_rate": 4.754913683214124e-05, "loss": 0.6438, "num_input_tokens_seen": 17759984, "step": 30605 }, { "epoch": 4.559130175752159, "grad_norm": 0.9140558242797852, "learning_rate": 4.754773351498088e-05, "loss": 0.6854, "num_input_tokens_seen": 17763280, "step": 30610 }, { "epoch": 4.559874888293119, "grad_norm": 1.6272844076156616, "learning_rate": 4.754632981689949e-05, "loss": 0.644, "num_input_tokens_seen": 17765872, "step": 30615 }, { "epoch": 4.5606196008340785, "grad_norm": 0.760336697101593, "learning_rate": 4.7544925737920766e-05, "loss": 0.6627, "num_input_tokens_seen": 17768816, "step": 30620 }, { "epoch": 4.561364313375037, "grad_norm": 0.7025957703590393, "learning_rate": 4.754352127806843e-05, "loss": 0.7293, "num_input_tokens_seen": 17771920, "step": 30625 }, { "epoch": 4.562109025915996, "grad_norm": 0.8730055689811707, "learning_rate": 4.754211643736622e-05, "loss": 0.6168, "num_input_tokens_seen": 17774640, "step": 30630 }, { "epoch": 4.562853738456956, "grad_norm": 0.9624743461608887, "learning_rate": 4.7540711215837866e-05, "loss": 0.5887, "num_input_tokens_seen": 17777264, "step": 30635 }, { "epoch": 4.563598450997915, "grad_norm": 1.1927019357681274, "learning_rate": 4.7539305613507096e-05, "loss": 0.64, "num_input_tokens_seen": 17780432, "step": 30640 }, { "epoch": 4.564343163538874, "grad_norm": 1.0251381397247314, "learning_rate": 4.753789963039767e-05, "loss": 0.5793, "num_input_tokens_seen": 17783312, "step": 30645 }, { "epoch": 4.565087876079833, "grad_norm": 0.8863298892974854, "learning_rate": 4.753649326653334e-05, "loss": 0.444, "num_input_tokens_seen": 17786064, "step": 30650 }, { "epoch": 4.565832588620792, "grad_norm": 0.8795680999755859, "learning_rate": 4.753508652193785e-05, "loss": 0.6427, "num_input_tokens_seen": 17788912, "step": 30655 }, { "epoch": 4.566577301161751, "grad_norm": 0.9737774133682251, "learning_rate": 4.7533679396634986e-05, "loss": 0.6646, "num_input_tokens_seen": 17791376, "step": 30660 }, { "epoch": 4.567322013702711, "grad_norm": 1.291269063949585, "learning_rate": 4.7532271890648516e-05, "loss": 0.7915, "num_input_tokens_seen": 17794320, "step": 30665 }, { "epoch": 4.56806672624367, "grad_norm": 2.350912570953369, "learning_rate": 4.753086400400221e-05, "loss": 0.9385, "num_input_tokens_seen": 17797488, "step": 30670 }, { "epoch": 4.568811438784629, "grad_norm": 0.9556213021278381, "learning_rate": 4.752945573671985e-05, "loss": 0.7539, "num_input_tokens_seen": 17800208, "step": 30675 }, { "epoch": 4.569556151325588, "grad_norm": 0.8389272689819336, "learning_rate": 4.752804708882523e-05, "loss": 0.6622, "num_input_tokens_seen": 17802960, "step": 30680 }, { "epoch": 4.570300863866548, "grad_norm": 1.8064836263656616, "learning_rate": 4.7526638060342164e-05, "loss": 0.7135, "num_input_tokens_seen": 17805488, "step": 30685 }, { "epoch": 4.571045576407506, "grad_norm": 1.1193581819534302, "learning_rate": 4.752522865129444e-05, "loss": 0.9024, "num_input_tokens_seen": 17808208, "step": 30690 }, { "epoch": 4.571790288948466, "grad_norm": 1.1644762754440308, "learning_rate": 4.7523818861705865e-05, "loss": 0.5871, "num_input_tokens_seen": 17810896, "step": 30695 }, { "epoch": 4.572535001489425, "grad_norm": 1.3164602518081665, "learning_rate": 4.752240869160026e-05, "loss": 0.7349, "num_input_tokens_seen": 17813648, "step": 30700 }, { "epoch": 4.5732797140303845, "grad_norm": 2.0723252296447754, "learning_rate": 4.752099814100146e-05, "loss": 0.6095, "num_input_tokens_seen": 17816272, "step": 30705 }, { "epoch": 4.574024426571343, "grad_norm": 0.9105907082557678, "learning_rate": 4.751958720993328e-05, "loss": 0.5275, "num_input_tokens_seen": 17819248, "step": 30710 }, { "epoch": 4.574769139112303, "grad_norm": 1.1936227083206177, "learning_rate": 4.751817589841957e-05, "loss": 0.8163, "num_input_tokens_seen": 17822064, "step": 30715 }, { "epoch": 4.575513851653262, "grad_norm": 1.162347435951233, "learning_rate": 4.7516764206484156e-05, "loss": 0.6381, "num_input_tokens_seen": 17824816, "step": 30720 }, { "epoch": 4.576258564194221, "grad_norm": 0.9829928278923035, "learning_rate": 4.75153521341509e-05, "loss": 0.6267, "num_input_tokens_seen": 17827600, "step": 30725 }, { "epoch": 4.57700327673518, "grad_norm": 1.0644665956497192, "learning_rate": 4.751393968144365e-05, "loss": 0.5866, "num_input_tokens_seen": 17830768, "step": 30730 }, { "epoch": 4.57774798927614, "grad_norm": 1.1603058576583862, "learning_rate": 4.7512526848386276e-05, "loss": 0.6797, "num_input_tokens_seen": 17833616, "step": 30735 }, { "epoch": 4.578492701817098, "grad_norm": 0.8266977667808533, "learning_rate": 4.751111363500263e-05, "loss": 0.5633, "num_input_tokens_seen": 17836464, "step": 30740 }, { "epoch": 4.579237414358058, "grad_norm": 0.7703676819801331, "learning_rate": 4.750970004131662e-05, "loss": 0.6067, "num_input_tokens_seen": 17839280, "step": 30745 }, { "epoch": 4.579982126899017, "grad_norm": 1.0364539623260498, "learning_rate": 4.7508286067352085e-05, "loss": 0.5855, "num_input_tokens_seen": 17842768, "step": 30750 }, { "epoch": 4.5807268394399765, "grad_norm": 1.55430006980896, "learning_rate": 4.750687171313294e-05, "loss": 0.6836, "num_input_tokens_seen": 17845488, "step": 30755 }, { "epoch": 4.581471551980935, "grad_norm": 1.0367863178253174, "learning_rate": 4.750545697868307e-05, "loss": 0.7464, "num_input_tokens_seen": 17848304, "step": 30760 }, { "epoch": 4.582216264521895, "grad_norm": 0.6683785915374756, "learning_rate": 4.750404186402639e-05, "loss": 0.6878, "num_input_tokens_seen": 17851312, "step": 30765 }, { "epoch": 4.582960977062854, "grad_norm": 0.5683627724647522, "learning_rate": 4.7502626369186784e-05, "loss": 0.6427, "num_input_tokens_seen": 17854000, "step": 30770 }, { "epoch": 4.583705689603813, "grad_norm": 1.3529551029205322, "learning_rate": 4.750121049418817e-05, "loss": 0.6511, "num_input_tokens_seen": 17856848, "step": 30775 }, { "epoch": 4.584450402144772, "grad_norm": 1.0559134483337402, "learning_rate": 4.749979423905449e-05, "loss": 0.6609, "num_input_tokens_seen": 17859760, "step": 30780 }, { "epoch": 4.585195114685732, "grad_norm": 0.7752434611320496, "learning_rate": 4.749837760380965e-05, "loss": 0.645, "num_input_tokens_seen": 17862448, "step": 30785 }, { "epoch": 4.5859398272266905, "grad_norm": 1.3653591871261597, "learning_rate": 4.749696058847758e-05, "loss": 0.787, "num_input_tokens_seen": 17865392, "step": 30790 }, { "epoch": 4.586684539767649, "grad_norm": 1.201267123222351, "learning_rate": 4.749554319308223e-05, "loss": 0.7343, "num_input_tokens_seen": 17868048, "step": 30795 }, { "epoch": 4.587429252308609, "grad_norm": 0.8178430795669556, "learning_rate": 4.7494125417647536e-05, "loss": 0.5418, "num_input_tokens_seen": 17870768, "step": 30800 }, { "epoch": 4.5881739648495685, "grad_norm": 0.8048833012580872, "learning_rate": 4.749270726219746e-05, "loss": 0.6531, "num_input_tokens_seen": 17873520, "step": 30805 }, { "epoch": 4.588918677390527, "grad_norm": 0.8354753851890564, "learning_rate": 4.7491288726755954e-05, "loss": 0.6008, "num_input_tokens_seen": 17876464, "step": 30810 }, { "epoch": 4.589663389931486, "grad_norm": 1.1474007368087769, "learning_rate": 4.7489869811346984e-05, "loss": 0.6954, "num_input_tokens_seen": 17879152, "step": 30815 }, { "epoch": 4.590408102472446, "grad_norm": 1.0649257898330688, "learning_rate": 4.748845051599452e-05, "loss": 0.6878, "num_input_tokens_seen": 17882000, "step": 30820 }, { "epoch": 4.591152815013404, "grad_norm": 0.9697059988975525, "learning_rate": 4.748703084072255e-05, "loss": 0.7129, "num_input_tokens_seen": 17884912, "step": 30825 }, { "epoch": 4.591897527554364, "grad_norm": 1.0577619075775146, "learning_rate": 4.748561078555504e-05, "loss": 0.6415, "num_input_tokens_seen": 17887632, "step": 30830 }, { "epoch": 4.592642240095323, "grad_norm": 0.736473798751831, "learning_rate": 4.748419035051599e-05, "loss": 0.6534, "num_input_tokens_seen": 17890480, "step": 30835 }, { "epoch": 4.5933869526362825, "grad_norm": 0.9362838268280029, "learning_rate": 4.748276953562939e-05, "loss": 0.5956, "num_input_tokens_seen": 17893392, "step": 30840 }, { "epoch": 4.594131665177241, "grad_norm": 2.636887788772583, "learning_rate": 4.7481348340919255e-05, "loss": 0.6248, "num_input_tokens_seen": 17896144, "step": 30845 }, { "epoch": 4.594876377718201, "grad_norm": 0.9425969123840332, "learning_rate": 4.747992676640959e-05, "loss": 0.7543, "num_input_tokens_seen": 17899248, "step": 30850 }, { "epoch": 4.59562109025916, "grad_norm": 0.977813184261322, "learning_rate": 4.7478504812124416e-05, "loss": 0.6021, "num_input_tokens_seen": 17902000, "step": 30855 }, { "epoch": 4.596365802800119, "grad_norm": 1.7752619981765747, "learning_rate": 4.7477082478087734e-05, "loss": 0.7172, "num_input_tokens_seen": 17904720, "step": 30860 }, { "epoch": 4.597110515341078, "grad_norm": 0.8323196172714233, "learning_rate": 4.74756597643236e-05, "loss": 0.4874, "num_input_tokens_seen": 17907472, "step": 30865 }, { "epoch": 4.597855227882038, "grad_norm": 1.4054360389709473, "learning_rate": 4.747423667085603e-05, "loss": 0.6664, "num_input_tokens_seen": 17910032, "step": 30870 }, { "epoch": 4.5985999404229965, "grad_norm": 1.3255188465118408, "learning_rate": 4.7472813197709084e-05, "loss": 0.7719, "num_input_tokens_seen": 17913040, "step": 30875 }, { "epoch": 4.599344652963956, "grad_norm": 1.0890307426452637, "learning_rate": 4.747138934490679e-05, "loss": 0.6016, "num_input_tokens_seen": 17915984, "step": 30880 }, { "epoch": 4.600089365504915, "grad_norm": 0.8656755685806274, "learning_rate": 4.746996511247321e-05, "loss": 0.5958, "num_input_tokens_seen": 17918640, "step": 30885 }, { "epoch": 4.6008340780458745, "grad_norm": 0.678022027015686, "learning_rate": 4.746854050043241e-05, "loss": 0.7776, "num_input_tokens_seen": 17921808, "step": 30890 }, { "epoch": 4.601578790586833, "grad_norm": 1.3043241500854492, "learning_rate": 4.7467115508808456e-05, "loss": 0.7067, "num_input_tokens_seen": 17924816, "step": 30895 }, { "epoch": 4.602323503127793, "grad_norm": 0.586641252040863, "learning_rate": 4.746569013762543e-05, "loss": 0.5209, "num_input_tokens_seen": 17927664, "step": 30900 }, { "epoch": 4.603068215668752, "grad_norm": 0.9056847095489502, "learning_rate": 4.7464264386907385e-05, "loss": 0.6771, "num_input_tokens_seen": 17930288, "step": 30905 }, { "epoch": 4.603812928209711, "grad_norm": 0.9336923360824585, "learning_rate": 4.746283825667843e-05, "loss": 0.4512, "num_input_tokens_seen": 17933072, "step": 30910 }, { "epoch": 4.60455764075067, "grad_norm": 0.7790898680686951, "learning_rate": 4.746141174696266e-05, "loss": 0.7859, "num_input_tokens_seen": 17935856, "step": 30915 }, { "epoch": 4.60530235329163, "grad_norm": 1.2345516681671143, "learning_rate": 4.745998485778416e-05, "loss": 0.7104, "num_input_tokens_seen": 17938800, "step": 30920 }, { "epoch": 4.6060470658325885, "grad_norm": 2.5971615314483643, "learning_rate": 4.7458557589167044e-05, "loss": 0.8466, "num_input_tokens_seen": 17942032, "step": 30925 }, { "epoch": 4.606791778373548, "grad_norm": 1.1973302364349365, "learning_rate": 4.7457129941135424e-05, "loss": 0.8505, "num_input_tokens_seen": 17945072, "step": 30930 }, { "epoch": 4.607536490914507, "grad_norm": 0.8402388095855713, "learning_rate": 4.7455701913713424e-05, "loss": 0.779, "num_input_tokens_seen": 17948048, "step": 30935 }, { "epoch": 4.6082812034554665, "grad_norm": 1.1422245502471924, "learning_rate": 4.745427350692515e-05, "loss": 0.6585, "num_input_tokens_seen": 17951216, "step": 30940 }, { "epoch": 4.609025915996425, "grad_norm": 0.8982545733451843, "learning_rate": 4.7452844720794756e-05, "loss": 0.6491, "num_input_tokens_seen": 17954160, "step": 30945 }, { "epoch": 4.609770628537385, "grad_norm": 1.0697965621948242, "learning_rate": 4.745141555534637e-05, "loss": 0.7517, "num_input_tokens_seen": 17957104, "step": 30950 }, { "epoch": 4.610515341078344, "grad_norm": 1.507808804512024, "learning_rate": 4.744998601060414e-05, "loss": 0.7873, "num_input_tokens_seen": 17960080, "step": 30955 }, { "epoch": 4.6112600536193025, "grad_norm": 1.163204312324524, "learning_rate": 4.74485560865922e-05, "loss": 0.6506, "num_input_tokens_seen": 17963152, "step": 30960 }, { "epoch": 4.612004766160262, "grad_norm": 0.8094893097877502, "learning_rate": 4.744712578333473e-05, "loss": 0.5569, "num_input_tokens_seen": 17966064, "step": 30965 }, { "epoch": 4.612749478701222, "grad_norm": 0.9336675405502319, "learning_rate": 4.744569510085589e-05, "loss": 0.6213, "num_input_tokens_seen": 17969040, "step": 30970 }, { "epoch": 4.6134941912421805, "grad_norm": 1.5555648803710938, "learning_rate": 4.7444264039179845e-05, "loss": 0.6723, "num_input_tokens_seen": 17972208, "step": 30975 }, { "epoch": 4.614238903783139, "grad_norm": 1.8626906871795654, "learning_rate": 4.744283259833076e-05, "loss": 0.7255, "num_input_tokens_seen": 17974896, "step": 30980 }, { "epoch": 4.614983616324099, "grad_norm": 0.6499040722846985, "learning_rate": 4.744140077833283e-05, "loss": 0.6086, "num_input_tokens_seen": 17977776, "step": 30985 }, { "epoch": 4.615728328865059, "grad_norm": 0.7066724896430969, "learning_rate": 4.743996857921024e-05, "loss": 0.6043, "num_input_tokens_seen": 17980368, "step": 30990 }, { "epoch": 4.616473041406017, "grad_norm": 0.9803593158721924, "learning_rate": 4.7438536000987195e-05, "loss": 0.6276, "num_input_tokens_seen": 17983280, "step": 30995 }, { "epoch": 4.617217753946976, "grad_norm": 0.6762518882751465, "learning_rate": 4.743710304368788e-05, "loss": 0.7783, "num_input_tokens_seen": 17986384, "step": 31000 }, { "epoch": 4.617962466487936, "grad_norm": 1.1005651950836182, "learning_rate": 4.743566970733652e-05, "loss": 0.572, "num_input_tokens_seen": 17989328, "step": 31005 }, { "epoch": 4.6187071790288945, "grad_norm": 1.0615040063858032, "learning_rate": 4.7434235991957326e-05, "loss": 0.6329, "num_input_tokens_seen": 17992112, "step": 31010 }, { "epoch": 4.619451891569854, "grad_norm": 0.8105672001838684, "learning_rate": 4.743280189757451e-05, "loss": 0.7562, "num_input_tokens_seen": 17995024, "step": 31015 }, { "epoch": 4.620196604110813, "grad_norm": 1.2990785837173462, "learning_rate": 4.7431367424212305e-05, "loss": 0.5843, "num_input_tokens_seen": 17998160, "step": 31020 }, { "epoch": 4.6209413166517725, "grad_norm": 0.9591882824897766, "learning_rate": 4.7429932571894954e-05, "loss": 0.7663, "num_input_tokens_seen": 18000496, "step": 31025 }, { "epoch": 4.621686029192731, "grad_norm": 0.9874192476272583, "learning_rate": 4.742849734064668e-05, "loss": 0.645, "num_input_tokens_seen": 18003312, "step": 31030 }, { "epoch": 4.622430741733691, "grad_norm": 1.0570995807647705, "learning_rate": 4.742706173049174e-05, "loss": 0.6812, "num_input_tokens_seen": 18006160, "step": 31035 }, { "epoch": 4.62317545427465, "grad_norm": 1.2634073495864868, "learning_rate": 4.7425625741454394e-05, "loss": 0.6687, "num_input_tokens_seen": 18008976, "step": 31040 }, { "epoch": 4.623920166815609, "grad_norm": 1.1835294961929321, "learning_rate": 4.7424189373558886e-05, "loss": 0.7532, "num_input_tokens_seen": 18011792, "step": 31045 }, { "epoch": 4.624664879356568, "grad_norm": 0.9738779067993164, "learning_rate": 4.742275262682949e-05, "loss": 0.7022, "num_input_tokens_seen": 18014992, "step": 31050 }, { "epoch": 4.625409591897528, "grad_norm": 1.1116665601730347, "learning_rate": 4.7421315501290484e-05, "loss": 0.7403, "num_input_tokens_seen": 18017904, "step": 31055 }, { "epoch": 4.6261543044384865, "grad_norm": 1.3265061378479004, "learning_rate": 4.7419877996966134e-05, "loss": 0.5964, "num_input_tokens_seen": 18020848, "step": 31060 }, { "epoch": 4.626899016979446, "grad_norm": 1.2297558784484863, "learning_rate": 4.7418440113880736e-05, "loss": 0.7105, "num_input_tokens_seen": 18023728, "step": 31065 }, { "epoch": 4.627643729520405, "grad_norm": 0.9624285697937012, "learning_rate": 4.7417001852058576e-05, "loss": 0.6074, "num_input_tokens_seen": 18026480, "step": 31070 }, { "epoch": 4.628388442061365, "grad_norm": 0.8549289107322693, "learning_rate": 4.741556321152395e-05, "loss": 0.6301, "num_input_tokens_seen": 18029104, "step": 31075 }, { "epoch": 4.629133154602323, "grad_norm": 0.9012812376022339, "learning_rate": 4.741412419230117e-05, "loss": 0.5502, "num_input_tokens_seen": 18032016, "step": 31080 }, { "epoch": 4.629877867143283, "grad_norm": 1.0741323232650757, "learning_rate": 4.741268479441453e-05, "loss": 0.7971, "num_input_tokens_seen": 18034832, "step": 31085 }, { "epoch": 4.630622579684242, "grad_norm": 0.9772024154663086, "learning_rate": 4.7411245017888374e-05, "loss": 0.4703, "num_input_tokens_seen": 18037616, "step": 31090 }, { "epoch": 4.631367292225201, "grad_norm": 0.9278423190116882, "learning_rate": 4.7409804862747007e-05, "loss": 0.6882, "num_input_tokens_seen": 18040976, "step": 31095 }, { "epoch": 4.63211200476616, "grad_norm": 0.9708302617073059, "learning_rate": 4.740836432901476e-05, "loss": 0.7388, "num_input_tokens_seen": 18044592, "step": 31100 }, { "epoch": 4.63285671730712, "grad_norm": 1.1888713836669922, "learning_rate": 4.7406923416715976e-05, "loss": 0.7253, "num_input_tokens_seen": 18047504, "step": 31105 }, { "epoch": 4.6336014298480785, "grad_norm": 1.430057406425476, "learning_rate": 4.7405482125875e-05, "loss": 0.7416, "num_input_tokens_seen": 18050352, "step": 31110 }, { "epoch": 4.634346142389038, "grad_norm": 1.0282881259918213, "learning_rate": 4.7404040456516164e-05, "loss": 0.5642, "num_input_tokens_seen": 18053136, "step": 31115 }, { "epoch": 4.635090854929997, "grad_norm": 1.024828314781189, "learning_rate": 4.7402598408663846e-05, "loss": 0.6705, "num_input_tokens_seen": 18056208, "step": 31120 }, { "epoch": 4.635835567470957, "grad_norm": 1.366296410560608, "learning_rate": 4.740115598234239e-05, "loss": 0.6871, "num_input_tokens_seen": 18059344, "step": 31125 }, { "epoch": 4.636580280011915, "grad_norm": 1.4369970560073853, "learning_rate": 4.739971317757617e-05, "loss": 0.7779, "num_input_tokens_seen": 18062224, "step": 31130 }, { "epoch": 4.637324992552875, "grad_norm": 0.9167100191116333, "learning_rate": 4.7398269994389567e-05, "loss": 0.5196, "num_input_tokens_seen": 18065008, "step": 31135 }, { "epoch": 4.638069705093834, "grad_norm": 1.066663384437561, "learning_rate": 4.739682643280695e-05, "loss": 0.6738, "num_input_tokens_seen": 18067536, "step": 31140 }, { "epoch": 4.6388144176347925, "grad_norm": 1.106054425239563, "learning_rate": 4.739538249285272e-05, "loss": 0.6504, "num_input_tokens_seen": 18070608, "step": 31145 }, { "epoch": 4.639559130175752, "grad_norm": 0.836715042591095, "learning_rate": 4.739393817455127e-05, "loss": 0.5403, "num_input_tokens_seen": 18073488, "step": 31150 }, { "epoch": 4.640303842716712, "grad_norm": 1.2898999452590942, "learning_rate": 4.739249347792698e-05, "loss": 0.6825, "num_input_tokens_seen": 18076336, "step": 31155 }, { "epoch": 4.641048555257671, "grad_norm": 1.1514980792999268, "learning_rate": 4.739104840300428e-05, "loss": 0.651, "num_input_tokens_seen": 18079088, "step": 31160 }, { "epoch": 4.641793267798629, "grad_norm": 1.3260126113891602, "learning_rate": 4.738960294980757e-05, "loss": 0.6516, "num_input_tokens_seen": 18082224, "step": 31165 }, { "epoch": 4.642537980339589, "grad_norm": 1.5752986669540405, "learning_rate": 4.738815711836128e-05, "loss": 0.6085, "num_input_tokens_seen": 18085200, "step": 31170 }, { "epoch": 4.643282692880548, "grad_norm": 0.9420431852340698, "learning_rate": 4.738671090868982e-05, "loss": 0.6749, "num_input_tokens_seen": 18087952, "step": 31175 }, { "epoch": 4.644027405421507, "grad_norm": 1.2220405340194702, "learning_rate": 4.738526432081765e-05, "loss": 0.7246, "num_input_tokens_seen": 18090992, "step": 31180 }, { "epoch": 4.644772117962466, "grad_norm": 1.0102659463882446, "learning_rate": 4.738381735476917e-05, "loss": 0.691, "num_input_tokens_seen": 18093872, "step": 31185 }, { "epoch": 4.645516830503426, "grad_norm": 0.8273594975471497, "learning_rate": 4.738237001056886e-05, "loss": 0.6926, "num_input_tokens_seen": 18096624, "step": 31190 }, { "epoch": 4.6462615430443845, "grad_norm": 1.435882329940796, "learning_rate": 4.738092228824115e-05, "loss": 0.6197, "num_input_tokens_seen": 18099504, "step": 31195 }, { "epoch": 4.647006255585344, "grad_norm": 0.7794814705848694, "learning_rate": 4.7379474187810506e-05, "loss": 0.6231, "num_input_tokens_seen": 18102448, "step": 31200 }, { "epoch": 4.647750968126303, "grad_norm": 0.8109406232833862, "learning_rate": 4.7378025709301386e-05, "loss": 0.5942, "num_input_tokens_seen": 18105488, "step": 31205 }, { "epoch": 4.648495680667263, "grad_norm": 1.2660777568817139, "learning_rate": 4.7376576852738274e-05, "loss": 0.6318, "num_input_tokens_seen": 18108336, "step": 31210 }, { "epoch": 4.649240393208221, "grad_norm": 0.6208914518356323, "learning_rate": 4.7375127618145645e-05, "loss": 0.6702, "num_input_tokens_seen": 18111248, "step": 31215 }, { "epoch": 4.649985105749181, "grad_norm": 1.0134527683258057, "learning_rate": 4.737367800554796e-05, "loss": 0.6137, "num_input_tokens_seen": 18114416, "step": 31220 }, { "epoch": 4.65072981829014, "grad_norm": 1.4030581712722778, "learning_rate": 4.737222801496973e-05, "loss": 0.8182, "num_input_tokens_seen": 18117360, "step": 31225 }, { "epoch": 4.651474530831099, "grad_norm": 1.2834736108779907, "learning_rate": 4.737077764643545e-05, "loss": 0.6443, "num_input_tokens_seen": 18120208, "step": 31230 }, { "epoch": 4.652219243372058, "grad_norm": 0.9077401757240295, "learning_rate": 4.736932689996962e-05, "loss": 0.6767, "num_input_tokens_seen": 18122896, "step": 31235 }, { "epoch": 4.652963955913018, "grad_norm": 0.8194457292556763, "learning_rate": 4.7367875775596746e-05, "loss": 0.6776, "num_input_tokens_seen": 18125680, "step": 31240 }, { "epoch": 4.653708668453977, "grad_norm": 2.0336670875549316, "learning_rate": 4.7366424273341334e-05, "loss": 0.7473, "num_input_tokens_seen": 18128688, "step": 31245 }, { "epoch": 4.654453380994936, "grad_norm": 0.833940863609314, "learning_rate": 4.7364972393227925e-05, "loss": 0.7113, "num_input_tokens_seen": 18131536, "step": 31250 }, { "epoch": 4.655198093535895, "grad_norm": 0.9057127833366394, "learning_rate": 4.736352013528104e-05, "loss": 0.6112, "num_input_tokens_seen": 18134256, "step": 31255 }, { "epoch": 4.655942806076855, "grad_norm": 1.4617841243743896, "learning_rate": 4.736206749952521e-05, "loss": 0.5681, "num_input_tokens_seen": 18137040, "step": 31260 }, { "epoch": 4.656687518617813, "grad_norm": 2.4672603607177734, "learning_rate": 4.736061448598498e-05, "loss": 0.6593, "num_input_tokens_seen": 18139920, "step": 31265 }, { "epoch": 4.657432231158773, "grad_norm": 1.0114721059799194, "learning_rate": 4.7359161094684886e-05, "loss": 0.5622, "num_input_tokens_seen": 18142800, "step": 31270 }, { "epoch": 4.658176943699732, "grad_norm": 0.758628785610199, "learning_rate": 4.735770732564949e-05, "loss": 0.5791, "num_input_tokens_seen": 18145456, "step": 31275 }, { "epoch": 4.658921656240691, "grad_norm": 1.0390032529830933, "learning_rate": 4.735625317890336e-05, "loss": 0.6476, "num_input_tokens_seen": 18148528, "step": 31280 }, { "epoch": 4.65966636878165, "grad_norm": 0.8130402565002441, "learning_rate": 4.735479865447105e-05, "loss": 0.5946, "num_input_tokens_seen": 18151280, "step": 31285 }, { "epoch": 4.66041108132261, "grad_norm": 0.9883406758308411, "learning_rate": 4.735334375237714e-05, "loss": 0.6047, "num_input_tokens_seen": 18154096, "step": 31290 }, { "epoch": 4.661155793863569, "grad_norm": 1.7223575115203857, "learning_rate": 4.7351888472646204e-05, "loss": 0.7865, "num_input_tokens_seen": 18157040, "step": 31295 }, { "epoch": 4.661900506404528, "grad_norm": 0.8694816827774048, "learning_rate": 4.735043281530283e-05, "loss": 0.6865, "num_input_tokens_seen": 18159984, "step": 31300 }, { "epoch": 4.662645218945487, "grad_norm": 1.2205846309661865, "learning_rate": 4.7348976780371615e-05, "loss": 0.6449, "num_input_tokens_seen": 18162832, "step": 31305 }, { "epoch": 4.663389931486446, "grad_norm": 1.336095929145813, "learning_rate": 4.734752036787714e-05, "loss": 0.5894, "num_input_tokens_seen": 18165872, "step": 31310 }, { "epoch": 4.664134644027405, "grad_norm": 0.9459306597709656, "learning_rate": 4.734606357784403e-05, "loss": 0.6448, "num_input_tokens_seen": 18168784, "step": 31315 }, { "epoch": 4.664879356568365, "grad_norm": 0.8915378451347351, "learning_rate": 4.734460641029689e-05, "loss": 0.8602, "num_input_tokens_seen": 18171920, "step": 31320 }, { "epoch": 4.665624069109324, "grad_norm": 1.3464587926864624, "learning_rate": 4.7343148865260326e-05, "loss": 0.6461, "num_input_tokens_seen": 18174576, "step": 31325 }, { "epoch": 4.666368781650283, "grad_norm": 1.1115986108779907, "learning_rate": 4.7341690942758974e-05, "loss": 0.5864, "num_input_tokens_seen": 18177648, "step": 31330 }, { "epoch": 4.667113494191242, "grad_norm": 1.1640803813934326, "learning_rate": 4.734023264281746e-05, "loss": 0.6414, "num_input_tokens_seen": 18180688, "step": 31335 }, { "epoch": 4.667858206732202, "grad_norm": 1.6460930109024048, "learning_rate": 4.7338773965460426e-05, "loss": 0.6126, "num_input_tokens_seen": 18183440, "step": 31340 }, { "epoch": 4.668602919273161, "grad_norm": 1.3207237720489502, "learning_rate": 4.733731491071251e-05, "loss": 0.6329, "num_input_tokens_seen": 18186384, "step": 31345 }, { "epoch": 4.669347631814119, "grad_norm": 1.0051875114440918, "learning_rate": 4.7335855478598354e-05, "loss": 0.8281, "num_input_tokens_seen": 18189168, "step": 31350 }, { "epoch": 4.670092344355079, "grad_norm": 1.2466316223144531, "learning_rate": 4.7334395669142616e-05, "loss": 0.7696, "num_input_tokens_seen": 18192400, "step": 31355 }, { "epoch": 4.670837056896038, "grad_norm": 0.8181772828102112, "learning_rate": 4.733293548236998e-05, "loss": 0.6679, "num_input_tokens_seen": 18195408, "step": 31360 }, { "epoch": 4.671581769436997, "grad_norm": 0.739508867263794, "learning_rate": 4.7331474918305086e-05, "loss": 0.5778, "num_input_tokens_seen": 18198480, "step": 31365 }, { "epoch": 4.672326481977956, "grad_norm": 1.3585014343261719, "learning_rate": 4.733001397697262e-05, "loss": 0.6555, "num_input_tokens_seen": 18201296, "step": 31370 }, { "epoch": 4.673071194518916, "grad_norm": 0.8382537364959717, "learning_rate": 4.732855265839726e-05, "loss": 0.6779, "num_input_tokens_seen": 18204464, "step": 31375 }, { "epoch": 4.673815907059875, "grad_norm": 0.9852908253669739, "learning_rate": 4.7327090962603704e-05, "loss": 0.506, "num_input_tokens_seen": 18207312, "step": 31380 }, { "epoch": 4.674560619600834, "grad_norm": 0.5949330925941467, "learning_rate": 4.7325628889616644e-05, "loss": 0.5649, "num_input_tokens_seen": 18210160, "step": 31385 }, { "epoch": 4.675305332141793, "grad_norm": 1.6160154342651367, "learning_rate": 4.732416643946076e-05, "loss": 0.7932, "num_input_tokens_seen": 18213040, "step": 31390 }, { "epoch": 4.676050044682753, "grad_norm": 1.1502383947372437, "learning_rate": 4.732270361216078e-05, "loss": 0.7274, "num_input_tokens_seen": 18216208, "step": 31395 }, { "epoch": 4.676794757223711, "grad_norm": 0.7664003372192383, "learning_rate": 4.732124040774142e-05, "loss": 0.6401, "num_input_tokens_seen": 18218864, "step": 31400 }, { "epoch": 4.677539469764671, "grad_norm": 0.9416619539260864, "learning_rate": 4.731977682622737e-05, "loss": 0.6875, "num_input_tokens_seen": 18221680, "step": 31405 }, { "epoch": 4.67828418230563, "grad_norm": 2.696098804473877, "learning_rate": 4.731831286764339e-05, "loss": 0.662, "num_input_tokens_seen": 18224496, "step": 31410 }, { "epoch": 4.6790288948465895, "grad_norm": 0.6885557174682617, "learning_rate": 4.731684853201419e-05, "loss": 0.5141, "num_input_tokens_seen": 18227280, "step": 31415 }, { "epoch": 4.679773607387548, "grad_norm": 1.3781417608261108, "learning_rate": 4.7315383819364526e-05, "loss": 0.7596, "num_input_tokens_seen": 18230064, "step": 31420 }, { "epoch": 4.680518319928508, "grad_norm": 1.2620428800582886, "learning_rate": 4.731391872971912e-05, "loss": 0.665, "num_input_tokens_seen": 18232880, "step": 31425 }, { "epoch": 4.681263032469467, "grad_norm": 1.302370309829712, "learning_rate": 4.731245326310274e-05, "loss": 0.6627, "num_input_tokens_seen": 18235536, "step": 31430 }, { "epoch": 4.682007745010426, "grad_norm": 0.9009010195732117, "learning_rate": 4.731098741954014e-05, "loss": 0.6122, "num_input_tokens_seen": 18238096, "step": 31435 }, { "epoch": 4.682752457551385, "grad_norm": 0.6617019772529602, "learning_rate": 4.730952119905609e-05, "loss": 0.5534, "num_input_tokens_seen": 18240912, "step": 31440 }, { "epoch": 4.683497170092345, "grad_norm": 0.7547720074653625, "learning_rate": 4.730805460167534e-05, "loss": 0.6583, "num_input_tokens_seen": 18243984, "step": 31445 }, { "epoch": 4.684241882633303, "grad_norm": 1.3804962635040283, "learning_rate": 4.730658762742269e-05, "loss": 0.7538, "num_input_tokens_seen": 18246992, "step": 31450 }, { "epoch": 4.684986595174263, "grad_norm": 1.083691954612732, "learning_rate": 4.730512027632292e-05, "loss": 0.8401, "num_input_tokens_seen": 18249776, "step": 31455 }, { "epoch": 4.685731307715222, "grad_norm": 0.5118603110313416, "learning_rate": 4.7303652548400803e-05, "loss": 0.7166, "num_input_tokens_seen": 18252688, "step": 31460 }, { "epoch": 4.6864760202561815, "grad_norm": 0.7554293274879456, "learning_rate": 4.730218444368114e-05, "loss": 0.6515, "num_input_tokens_seen": 18255696, "step": 31465 }, { "epoch": 4.68722073279714, "grad_norm": 0.6843804121017456, "learning_rate": 4.7300715962188744e-05, "loss": 0.6683, "num_input_tokens_seen": 18258672, "step": 31470 }, { "epoch": 4.687965445338099, "grad_norm": 0.9416961073875427, "learning_rate": 4.7299247103948417e-05, "loss": 0.5741, "num_input_tokens_seen": 18261424, "step": 31475 }, { "epoch": 4.688710157879059, "grad_norm": 2.220942735671997, "learning_rate": 4.729777786898498e-05, "loss": 0.9259, "num_input_tokens_seen": 18264336, "step": 31480 }, { "epoch": 4.689454870420018, "grad_norm": 1.022192358970642, "learning_rate": 4.729630825732324e-05, "loss": 0.7081, "num_input_tokens_seen": 18267504, "step": 31485 }, { "epoch": 4.690199582960977, "grad_norm": 1.097377896308899, "learning_rate": 4.729483826898804e-05, "loss": 0.9233, "num_input_tokens_seen": 18270384, "step": 31490 }, { "epoch": 4.690944295501936, "grad_norm": 0.8907696008682251, "learning_rate": 4.729336790400421e-05, "loss": 0.6256, "num_input_tokens_seen": 18273424, "step": 31495 }, { "epoch": 4.6916890080428955, "grad_norm": 0.9206147789955139, "learning_rate": 4.729189716239657e-05, "loss": 0.6049, "num_input_tokens_seen": 18276400, "step": 31500 }, { "epoch": 4.692433720583855, "grad_norm": 1.5386953353881836, "learning_rate": 4.7290426044189995e-05, "loss": 0.633, "num_input_tokens_seen": 18279216, "step": 31505 }, { "epoch": 4.693178433124814, "grad_norm": 0.7689759135246277, "learning_rate": 4.728895454940933e-05, "loss": 0.7143, "num_input_tokens_seen": 18282256, "step": 31510 }, { "epoch": 4.693923145665773, "grad_norm": 1.2169252634048462, "learning_rate": 4.728748267807942e-05, "loss": 0.7213, "num_input_tokens_seen": 18285232, "step": 31515 }, { "epoch": 4.694667858206732, "grad_norm": 0.8737556338310242, "learning_rate": 4.728601043022515e-05, "loss": 0.5303, "num_input_tokens_seen": 18288496, "step": 31520 }, { "epoch": 4.695412570747691, "grad_norm": 0.7556222677230835, "learning_rate": 4.728453780587139e-05, "loss": 0.8375, "num_input_tokens_seen": 18291504, "step": 31525 }, { "epoch": 4.696157283288651, "grad_norm": 0.6843181848526001, "learning_rate": 4.7283064805043e-05, "loss": 0.7122, "num_input_tokens_seen": 18294128, "step": 31530 }, { "epoch": 4.696901995829609, "grad_norm": 0.7351882457733154, "learning_rate": 4.7281591427764886e-05, "loss": 0.7734, "num_input_tokens_seen": 18297232, "step": 31535 }, { "epoch": 4.697646708370569, "grad_norm": 1.24655020236969, "learning_rate": 4.728011767406193e-05, "loss": 0.8735, "num_input_tokens_seen": 18300240, "step": 31540 }, { "epoch": 4.698391420911528, "grad_norm": 0.6335499882698059, "learning_rate": 4.7278643543959025e-05, "loss": 0.7153, "num_input_tokens_seen": 18302864, "step": 31545 }, { "epoch": 4.6991361334524875, "grad_norm": 1.487070918083191, "learning_rate": 4.727716903748108e-05, "loss": 0.724, "num_input_tokens_seen": 18306032, "step": 31550 }, { "epoch": 4.699880845993446, "grad_norm": 0.500499427318573, "learning_rate": 4.727569415465302e-05, "loss": 0.5027, "num_input_tokens_seen": 18308880, "step": 31555 }, { "epoch": 4.700625558534406, "grad_norm": 0.6412420272827148, "learning_rate": 4.727421889549973e-05, "loss": 0.5911, "num_input_tokens_seen": 18311952, "step": 31560 }, { "epoch": 4.701370271075365, "grad_norm": 0.9015461802482605, "learning_rate": 4.727274326004616e-05, "loss": 0.7403, "num_input_tokens_seen": 18314960, "step": 31565 }, { "epoch": 4.702114983616324, "grad_norm": 0.8377741575241089, "learning_rate": 4.727126724831723e-05, "loss": 0.6485, "num_input_tokens_seen": 18317648, "step": 31570 }, { "epoch": 4.702859696157283, "grad_norm": 1.0963842868804932, "learning_rate": 4.726979086033787e-05, "loss": 0.5545, "num_input_tokens_seen": 18320656, "step": 31575 }, { "epoch": 4.703604408698243, "grad_norm": 1.1578651666641235, "learning_rate": 4.726831409613303e-05, "loss": 0.6875, "num_input_tokens_seen": 18323504, "step": 31580 }, { "epoch": 4.7043491212392015, "grad_norm": 1.2118052244186401, "learning_rate": 4.7266836955727655e-05, "loss": 0.7114, "num_input_tokens_seen": 18326320, "step": 31585 }, { "epoch": 4.705093833780161, "grad_norm": 1.0062203407287598, "learning_rate": 4.72653594391467e-05, "loss": 0.6213, "num_input_tokens_seen": 18329424, "step": 31590 }, { "epoch": 4.70583854632112, "grad_norm": 0.6120133399963379, "learning_rate": 4.7263881546415135e-05, "loss": 0.6568, "num_input_tokens_seen": 18332208, "step": 31595 }, { "epoch": 4.7065832588620795, "grad_norm": 1.232208013534546, "learning_rate": 4.726240327755791e-05, "loss": 0.5068, "num_input_tokens_seen": 18334928, "step": 31600 }, { "epoch": 4.707327971403038, "grad_norm": 0.9576987624168396, "learning_rate": 4.726092463260001e-05, "loss": 0.7132, "num_input_tokens_seen": 18337872, "step": 31605 }, { "epoch": 4.708072683943998, "grad_norm": 0.743838906288147, "learning_rate": 4.7259445611566414e-05, "loss": 0.6446, "num_input_tokens_seen": 18340912, "step": 31610 }, { "epoch": 4.708817396484957, "grad_norm": 1.091698169708252, "learning_rate": 4.7257966214482106e-05, "loss": 0.6741, "num_input_tokens_seen": 18343600, "step": 31615 }, { "epoch": 4.709562109025916, "grad_norm": 0.7251657247543335, "learning_rate": 4.725648644137208e-05, "loss": 0.657, "num_input_tokens_seen": 18346352, "step": 31620 }, { "epoch": 4.710306821566875, "grad_norm": 0.9664065837860107, "learning_rate": 4.725500629226134e-05, "loss": 0.5786, "num_input_tokens_seen": 18349104, "step": 31625 }, { "epoch": 4.711051534107835, "grad_norm": 1.0751101970672607, "learning_rate": 4.725352576717489e-05, "loss": 0.7109, "num_input_tokens_seen": 18351920, "step": 31630 }, { "epoch": 4.7117962466487935, "grad_norm": 1.1650105714797974, "learning_rate": 4.7252044866137736e-05, "loss": 0.7401, "num_input_tokens_seen": 18354736, "step": 31635 }, { "epoch": 4.712540959189753, "grad_norm": 0.8615177273750305, "learning_rate": 4.72505635891749e-05, "loss": 0.7228, "num_input_tokens_seen": 18357936, "step": 31640 }, { "epoch": 4.713285671730712, "grad_norm": 0.9182793498039246, "learning_rate": 4.7249081936311415e-05, "loss": 0.7594, "num_input_tokens_seen": 18360880, "step": 31645 }, { "epoch": 4.7140303842716715, "grad_norm": 1.175140142440796, "learning_rate": 4.7247599907572285e-05, "loss": 0.6203, "num_input_tokens_seen": 18363952, "step": 31650 }, { "epoch": 4.71477509681263, "grad_norm": 0.9379825592041016, "learning_rate": 4.724611750298258e-05, "loss": 0.5922, "num_input_tokens_seen": 18366928, "step": 31655 }, { "epoch": 4.715519809353589, "grad_norm": 1.340657114982605, "learning_rate": 4.724463472256733e-05, "loss": 0.5345, "num_input_tokens_seen": 18369776, "step": 31660 }, { "epoch": 4.716264521894549, "grad_norm": 0.7358241081237793, "learning_rate": 4.724315156635157e-05, "loss": 0.6944, "num_input_tokens_seen": 18372752, "step": 31665 }, { "epoch": 4.717009234435508, "grad_norm": 1.4861516952514648, "learning_rate": 4.7241668034360384e-05, "loss": 0.7599, "num_input_tokens_seen": 18375376, "step": 31670 }, { "epoch": 4.717753946976467, "grad_norm": 1.3392670154571533, "learning_rate": 4.724018412661883e-05, "loss": 0.6185, "num_input_tokens_seen": 18378096, "step": 31675 }, { "epoch": 4.718498659517426, "grad_norm": 1.307201623916626, "learning_rate": 4.7238699843151954e-05, "loss": 0.9462, "num_input_tokens_seen": 18380880, "step": 31680 }, { "epoch": 4.7192433720583855, "grad_norm": 1.4705604314804077, "learning_rate": 4.723721518398485e-05, "loss": 0.6988, "num_input_tokens_seen": 18384080, "step": 31685 }, { "epoch": 4.719988084599344, "grad_norm": 1.40027916431427, "learning_rate": 4.72357301491426e-05, "loss": 0.6315, "num_input_tokens_seen": 18386992, "step": 31690 }, { "epoch": 4.720732797140304, "grad_norm": 0.8658312559127808, "learning_rate": 4.723424473865029e-05, "loss": 0.567, "num_input_tokens_seen": 18390128, "step": 31695 }, { "epoch": 4.721477509681263, "grad_norm": 1.1763874292373657, "learning_rate": 4.7232758952533006e-05, "loss": 0.6978, "num_input_tokens_seen": 18393200, "step": 31700 }, { "epoch": 4.722222222222222, "grad_norm": 0.9044343829154968, "learning_rate": 4.723127279081586e-05, "loss": 0.5488, "num_input_tokens_seen": 18395888, "step": 31705 }, { "epoch": 4.722966934763181, "grad_norm": 1.2261651754379272, "learning_rate": 4.7229786253523956e-05, "loss": 0.663, "num_input_tokens_seen": 18398832, "step": 31710 }, { "epoch": 4.723711647304141, "grad_norm": 1.1319162845611572, "learning_rate": 4.7228299340682405e-05, "loss": 0.7051, "num_input_tokens_seen": 18401744, "step": 31715 }, { "epoch": 4.7244563598450995, "grad_norm": 1.0164331197738647, "learning_rate": 4.7226812052316325e-05, "loss": 0.7071, "num_input_tokens_seen": 18404720, "step": 31720 }, { "epoch": 4.725201072386059, "grad_norm": 1.12238347530365, "learning_rate": 4.722532438845085e-05, "loss": 0.5245, "num_input_tokens_seen": 18407536, "step": 31725 }, { "epoch": 4.725945784927018, "grad_norm": 1.2737493515014648, "learning_rate": 4.7223836349111106e-05, "loss": 0.5904, "num_input_tokens_seen": 18410544, "step": 31730 }, { "epoch": 4.7266904974679775, "grad_norm": 1.0781069993972778, "learning_rate": 4.722234793432224e-05, "loss": 0.6436, "num_input_tokens_seen": 18413424, "step": 31735 }, { "epoch": 4.727435210008936, "grad_norm": 2.3270466327667236, "learning_rate": 4.722085914410938e-05, "loss": 0.7408, "num_input_tokens_seen": 18416208, "step": 31740 }, { "epoch": 4.728179922549896, "grad_norm": 0.6767783164978027, "learning_rate": 4.7219369978497705e-05, "loss": 0.7033, "num_input_tokens_seen": 18419728, "step": 31745 }, { "epoch": 4.728924635090855, "grad_norm": 0.8534067869186401, "learning_rate": 4.7217880437512344e-05, "loss": 0.5065, "num_input_tokens_seen": 18422512, "step": 31750 }, { "epoch": 4.729669347631814, "grad_norm": 1.2120952606201172, "learning_rate": 4.7216390521178475e-05, "loss": 0.6021, "num_input_tokens_seen": 18425232, "step": 31755 }, { "epoch": 4.730414060172773, "grad_norm": 0.9727956652641296, "learning_rate": 4.721490022952128e-05, "loss": 0.5924, "num_input_tokens_seen": 18428016, "step": 31760 }, { "epoch": 4.731158772713733, "grad_norm": 1.2367690801620483, "learning_rate": 4.721340956256591e-05, "loss": 0.6608, "num_input_tokens_seen": 18431408, "step": 31765 }, { "epoch": 4.7319034852546915, "grad_norm": 1.2311967611312866, "learning_rate": 4.721191852033757e-05, "loss": 0.5534, "num_input_tokens_seen": 18434192, "step": 31770 }, { "epoch": 4.732648197795651, "grad_norm": 0.8108054399490356, "learning_rate": 4.7210427102861437e-05, "loss": 0.5995, "num_input_tokens_seen": 18437008, "step": 31775 }, { "epoch": 4.73339291033661, "grad_norm": 0.943384051322937, "learning_rate": 4.720893531016271e-05, "loss": 0.5801, "num_input_tokens_seen": 18439888, "step": 31780 }, { "epoch": 4.73413762287757, "grad_norm": 0.797512948513031, "learning_rate": 4.72074431422666e-05, "loss": 0.548, "num_input_tokens_seen": 18442704, "step": 31785 }, { "epoch": 4.734882335418528, "grad_norm": 0.8668508529663086, "learning_rate": 4.72059505991983e-05, "loss": 0.7073, "num_input_tokens_seen": 18445776, "step": 31790 }, { "epoch": 4.735627047959488, "grad_norm": 1.4869896173477173, "learning_rate": 4.720445768098304e-05, "loss": 0.7733, "num_input_tokens_seen": 18448720, "step": 31795 }, { "epoch": 4.736371760500447, "grad_norm": 1.1539586782455444, "learning_rate": 4.720296438764604e-05, "loss": 0.6277, "num_input_tokens_seen": 18451376, "step": 31800 }, { "epoch": 4.737116473041406, "grad_norm": 0.8172134160995483, "learning_rate": 4.7201470719212514e-05, "loss": 0.6826, "num_input_tokens_seen": 18454448, "step": 31805 }, { "epoch": 4.737861185582365, "grad_norm": 2.348203659057617, "learning_rate": 4.7199976675707716e-05, "loss": 0.5765, "num_input_tokens_seen": 18457456, "step": 31810 }, { "epoch": 4.738605898123325, "grad_norm": 1.8872275352478027, "learning_rate": 4.719848225715686e-05, "loss": 0.6996, "num_input_tokens_seen": 18460144, "step": 31815 }, { "epoch": 4.7393506106642835, "grad_norm": 1.2937955856323242, "learning_rate": 4.719698746358522e-05, "loss": 0.5968, "num_input_tokens_seen": 18463280, "step": 31820 }, { "epoch": 4.740095323205242, "grad_norm": 1.9418644905090332, "learning_rate": 4.719549229501803e-05, "loss": 0.7457, "num_input_tokens_seen": 18466000, "step": 31825 }, { "epoch": 4.740840035746202, "grad_norm": 0.858519434928894, "learning_rate": 4.7193996751480555e-05, "loss": 0.4705, "num_input_tokens_seen": 18468816, "step": 31830 }, { "epoch": 4.741584748287162, "grad_norm": 1.429315447807312, "learning_rate": 4.7192500832998065e-05, "loss": 0.8074, "num_input_tokens_seen": 18472048, "step": 31835 }, { "epoch": 4.74232946082812, "grad_norm": 0.9817267060279846, "learning_rate": 4.719100453959583e-05, "loss": 0.6072, "num_input_tokens_seen": 18474992, "step": 31840 }, { "epoch": 4.743074173369079, "grad_norm": 1.2041913270950317, "learning_rate": 4.718950787129912e-05, "loss": 0.6945, "num_input_tokens_seen": 18477616, "step": 31845 }, { "epoch": 4.743818885910039, "grad_norm": 1.5985363721847534, "learning_rate": 4.7188010828133233e-05, "loss": 0.5763, "num_input_tokens_seen": 18480560, "step": 31850 }, { "epoch": 4.744563598450998, "grad_norm": 1.1750831604003906, "learning_rate": 4.7186513410123455e-05, "loss": 0.6985, "num_input_tokens_seen": 18483152, "step": 31855 }, { "epoch": 4.745308310991957, "grad_norm": 0.9141858220100403, "learning_rate": 4.718501561729508e-05, "loss": 0.567, "num_input_tokens_seen": 18485968, "step": 31860 }, { "epoch": 4.746053023532916, "grad_norm": 0.933255136013031, "learning_rate": 4.718351744967342e-05, "loss": 0.6133, "num_input_tokens_seen": 18488944, "step": 31865 }, { "epoch": 4.746797736073876, "grad_norm": 0.8807607889175415, "learning_rate": 4.7182018907283776e-05, "loss": 0.6168, "num_input_tokens_seen": 18492048, "step": 31870 }, { "epoch": 4.747542448614834, "grad_norm": 2.5241644382476807, "learning_rate": 4.718051999015146e-05, "loss": 0.8584, "num_input_tokens_seen": 18494768, "step": 31875 }, { "epoch": 4.748287161155794, "grad_norm": 1.8691195249557495, "learning_rate": 4.7179020698301814e-05, "loss": 0.7851, "num_input_tokens_seen": 18497776, "step": 31880 }, { "epoch": 4.749031873696753, "grad_norm": 1.0750889778137207, "learning_rate": 4.717752103176014e-05, "loss": 0.5871, "num_input_tokens_seen": 18500496, "step": 31885 }, { "epoch": 4.749776586237712, "grad_norm": 1.3060431480407715, "learning_rate": 4.71760209905518e-05, "loss": 0.7619, "num_input_tokens_seen": 18503568, "step": 31890 }, { "epoch": 4.750521298778671, "grad_norm": 2.142836332321167, "learning_rate": 4.717452057470212e-05, "loss": 0.7723, "num_input_tokens_seen": 18506736, "step": 31895 }, { "epoch": 4.751266011319631, "grad_norm": 1.3685393333435059, "learning_rate": 4.7173019784236455e-05, "loss": 0.6045, "num_input_tokens_seen": 18509872, "step": 31900 }, { "epoch": 4.7520107238605895, "grad_norm": 1.139026165008545, "learning_rate": 4.717151861918015e-05, "loss": 0.7095, "num_input_tokens_seen": 18513200, "step": 31905 }, { "epoch": 4.752755436401549, "grad_norm": 0.9264312982559204, "learning_rate": 4.717001707955858e-05, "loss": 0.537, "num_input_tokens_seen": 18515856, "step": 31910 }, { "epoch": 4.753500148942508, "grad_norm": 1.1302886009216309, "learning_rate": 4.71685151653971e-05, "loss": 0.6178, "num_input_tokens_seen": 18518768, "step": 31915 }, { "epoch": 4.754244861483468, "grad_norm": 0.8963613510131836, "learning_rate": 4.716701287672109e-05, "loss": 0.5205, "num_input_tokens_seen": 18521776, "step": 31920 }, { "epoch": 4.754989574024426, "grad_norm": 1.7435822486877441, "learning_rate": 4.716551021355593e-05, "loss": 0.6899, "num_input_tokens_seen": 18524624, "step": 31925 }, { "epoch": 4.755734286565386, "grad_norm": 1.034565806388855, "learning_rate": 4.716400717592699e-05, "loss": 0.5332, "num_input_tokens_seen": 18527536, "step": 31930 }, { "epoch": 4.756478999106345, "grad_norm": 1.1420164108276367, "learning_rate": 4.716250376385969e-05, "loss": 0.6703, "num_input_tokens_seen": 18530288, "step": 31935 }, { "epoch": 4.757223711647304, "grad_norm": 1.1750754117965698, "learning_rate": 4.716099997737941e-05, "loss": 0.5979, "num_input_tokens_seen": 18533264, "step": 31940 }, { "epoch": 4.757968424188263, "grad_norm": 0.859747588634491, "learning_rate": 4.7159495816511546e-05, "loss": 0.6257, "num_input_tokens_seen": 18536144, "step": 31945 }, { "epoch": 4.758713136729223, "grad_norm": 1.3808958530426025, "learning_rate": 4.7157991281281536e-05, "loss": 0.8076, "num_input_tokens_seen": 18538800, "step": 31950 }, { "epoch": 4.759457849270182, "grad_norm": 1.040853500366211, "learning_rate": 4.715648637171478e-05, "loss": 0.6833, "num_input_tokens_seen": 18541456, "step": 31955 }, { "epoch": 4.760202561811141, "grad_norm": 0.9990437626838684, "learning_rate": 4.71549810878367e-05, "loss": 0.565, "num_input_tokens_seen": 18544176, "step": 31960 }, { "epoch": 4.7609472743521, "grad_norm": 1.314998984336853, "learning_rate": 4.7153475429672736e-05, "loss": 0.6557, "num_input_tokens_seen": 18548368, "step": 31965 }, { "epoch": 4.76169198689306, "grad_norm": 0.9701249003410339, "learning_rate": 4.715196939724832e-05, "loss": 0.7123, "num_input_tokens_seen": 18551216, "step": 31970 }, { "epoch": 4.762436699434018, "grad_norm": 1.7642937898635864, "learning_rate": 4.71504629905889e-05, "loss": 0.6793, "num_input_tokens_seen": 18553968, "step": 31975 }, { "epoch": 4.763181411974978, "grad_norm": 1.7533307075500488, "learning_rate": 4.714895620971992e-05, "loss": 0.6293, "num_input_tokens_seen": 18556816, "step": 31980 }, { "epoch": 4.763926124515937, "grad_norm": 1.3485276699066162, "learning_rate": 4.714744905466683e-05, "loss": 0.6083, "num_input_tokens_seen": 18559760, "step": 31985 }, { "epoch": 4.764670837056896, "grad_norm": 0.9537088871002197, "learning_rate": 4.71459415254551e-05, "loss": 0.6788, "num_input_tokens_seen": 18562544, "step": 31990 }, { "epoch": 4.765415549597855, "grad_norm": 0.8367472290992737, "learning_rate": 4.71444336221102e-05, "loss": 0.5096, "num_input_tokens_seen": 18565488, "step": 31995 }, { "epoch": 4.766160262138815, "grad_norm": 0.7924061417579651, "learning_rate": 4.71429253446576e-05, "loss": 0.7122, "num_input_tokens_seen": 18568624, "step": 32000 }, { "epoch": 4.766904974679774, "grad_norm": 1.6372394561767578, "learning_rate": 4.714141669312278e-05, "loss": 0.6799, "num_input_tokens_seen": 18571984, "step": 32005 }, { "epoch": 4.767649687220732, "grad_norm": 1.0555305480957031, "learning_rate": 4.713990766753123e-05, "loss": 0.7589, "num_input_tokens_seen": 18574800, "step": 32010 }, { "epoch": 4.768394399761692, "grad_norm": 2.001641273498535, "learning_rate": 4.7138398267908434e-05, "loss": 0.7842, "num_input_tokens_seen": 18577552, "step": 32015 }, { "epoch": 4.769139112302652, "grad_norm": 1.107722520828247, "learning_rate": 4.713688849427991e-05, "loss": 0.5964, "num_input_tokens_seen": 18580688, "step": 32020 }, { "epoch": 4.76988382484361, "grad_norm": 0.7288873195648193, "learning_rate": 4.713537834667115e-05, "loss": 0.6595, "num_input_tokens_seen": 18583600, "step": 32025 }, { "epoch": 4.770628537384569, "grad_norm": 0.9597337245941162, "learning_rate": 4.713386782510766e-05, "loss": 0.618, "num_input_tokens_seen": 18587024, "step": 32030 }, { "epoch": 4.771373249925529, "grad_norm": 0.8682212829589844, "learning_rate": 4.713235692961498e-05, "loss": 0.5524, "num_input_tokens_seen": 18589552, "step": 32035 }, { "epoch": 4.772117962466488, "grad_norm": 0.8941524028778076, "learning_rate": 4.713084566021863e-05, "loss": 0.6146, "num_input_tokens_seen": 18592496, "step": 32040 }, { "epoch": 4.772862675007447, "grad_norm": 1.4837781190872192, "learning_rate": 4.7129334016944124e-05, "loss": 0.637, "num_input_tokens_seen": 18595248, "step": 32045 }, { "epoch": 4.773607387548406, "grad_norm": 0.9784151911735535, "learning_rate": 4.7127821999817014e-05, "loss": 0.6876, "num_input_tokens_seen": 18598032, "step": 32050 }, { "epoch": 4.774352100089366, "grad_norm": 0.8666539192199707, "learning_rate": 4.712630960886284e-05, "loss": 0.5827, "num_input_tokens_seen": 18601200, "step": 32055 }, { "epoch": 4.775096812630324, "grad_norm": 1.041247010231018, "learning_rate": 4.7124796844107155e-05, "loss": 0.545, "num_input_tokens_seen": 18604176, "step": 32060 }, { "epoch": 4.775841525171284, "grad_norm": 0.9683874845504761, "learning_rate": 4.7123283705575514e-05, "loss": 0.7033, "num_input_tokens_seen": 18607024, "step": 32065 }, { "epoch": 4.776586237712243, "grad_norm": 0.8794212937355042, "learning_rate": 4.712177019329348e-05, "loss": 0.4838, "num_input_tokens_seen": 18609648, "step": 32070 }, { "epoch": 4.777330950253202, "grad_norm": 2.4060091972351074, "learning_rate": 4.712025630728662e-05, "loss": 0.669, "num_input_tokens_seen": 18612720, "step": 32075 }, { "epoch": 4.778075662794161, "grad_norm": 0.9796740412712097, "learning_rate": 4.711874204758051e-05, "loss": 0.5495, "num_input_tokens_seen": 18615408, "step": 32080 }, { "epoch": 4.778820375335121, "grad_norm": 1.345996618270874, "learning_rate": 4.7117227414200735e-05, "loss": 0.5531, "num_input_tokens_seen": 18618192, "step": 32085 }, { "epoch": 4.77956508787608, "grad_norm": 0.7052597403526306, "learning_rate": 4.711571240717289e-05, "loss": 0.4915, "num_input_tokens_seen": 18621104, "step": 32090 }, { "epoch": 4.780309800417039, "grad_norm": 0.9651089906692505, "learning_rate": 4.7114197026522555e-05, "loss": 0.8738, "num_input_tokens_seen": 18624112, "step": 32095 }, { "epoch": 4.781054512957998, "grad_norm": 1.173525094985962, "learning_rate": 4.711268127227534e-05, "loss": 0.5989, "num_input_tokens_seen": 18626864, "step": 32100 }, { "epoch": 4.781799225498958, "grad_norm": 1.5074561834335327, "learning_rate": 4.711116514445685e-05, "loss": 0.7922, "num_input_tokens_seen": 18629936, "step": 32105 }, { "epoch": 4.782543938039916, "grad_norm": 1.0436359643936157, "learning_rate": 4.710964864309269e-05, "loss": 0.7191, "num_input_tokens_seen": 18632784, "step": 32110 }, { "epoch": 4.783288650580876, "grad_norm": 0.8161416053771973, "learning_rate": 4.710813176820848e-05, "loss": 0.5637, "num_input_tokens_seen": 18635728, "step": 32115 }, { "epoch": 4.784033363121835, "grad_norm": 0.8590569496154785, "learning_rate": 4.710661451982987e-05, "loss": 0.6251, "num_input_tokens_seen": 18638448, "step": 32120 }, { "epoch": 4.7847780756627944, "grad_norm": 1.1077985763549805, "learning_rate": 4.7105096897982473e-05, "loss": 0.7597, "num_input_tokens_seen": 18641296, "step": 32125 }, { "epoch": 4.785522788203753, "grad_norm": 1.5848157405853271, "learning_rate": 4.710357890269193e-05, "loss": 0.6839, "num_input_tokens_seen": 18644528, "step": 32130 }, { "epoch": 4.786267500744713, "grad_norm": 0.9776115417480469, "learning_rate": 4.710206053398388e-05, "loss": 0.5784, "num_input_tokens_seen": 18647760, "step": 32135 }, { "epoch": 4.787012213285672, "grad_norm": 0.7160736918449402, "learning_rate": 4.710054179188399e-05, "loss": 0.648, "num_input_tokens_seen": 18650736, "step": 32140 }, { "epoch": 4.787756925826631, "grad_norm": 1.0798389911651611, "learning_rate": 4.7099022676417904e-05, "loss": 0.6579, "num_input_tokens_seen": 18653712, "step": 32145 }, { "epoch": 4.78850163836759, "grad_norm": 1.3382617235183716, "learning_rate": 4.709750318761129e-05, "loss": 0.7228, "num_input_tokens_seen": 18656752, "step": 32150 }, { "epoch": 4.78924635090855, "grad_norm": 0.7800323367118835, "learning_rate": 4.709598332548982e-05, "loss": 0.6607, "num_input_tokens_seen": 18659632, "step": 32155 }, { "epoch": 4.789991063449508, "grad_norm": 1.8379011154174805, "learning_rate": 4.709446309007917e-05, "loss": 0.6686, "num_input_tokens_seen": 18662480, "step": 32160 }, { "epoch": 4.790735775990468, "grad_norm": 1.1003130674362183, "learning_rate": 4.709294248140502e-05, "loss": 0.5459, "num_input_tokens_seen": 18665232, "step": 32165 }, { "epoch": 4.791480488531427, "grad_norm": 1.458228588104248, "learning_rate": 4.709142149949306e-05, "loss": 0.8171, "num_input_tokens_seen": 18668208, "step": 32170 }, { "epoch": 4.792225201072386, "grad_norm": 0.8770057559013367, "learning_rate": 4.708990014436899e-05, "loss": 0.7219, "num_input_tokens_seen": 18671280, "step": 32175 }, { "epoch": 4.792969913613345, "grad_norm": 1.0151714086532593, "learning_rate": 4.708837841605851e-05, "loss": 0.5662, "num_input_tokens_seen": 18674352, "step": 32180 }, { "epoch": 4.793714626154305, "grad_norm": 1.1815810203552246, "learning_rate": 4.7086856314587316e-05, "loss": 0.5952, "num_input_tokens_seen": 18677296, "step": 32185 }, { "epoch": 4.794459338695264, "grad_norm": 1.570272445678711, "learning_rate": 4.708533383998114e-05, "loss": 0.6175, "num_input_tokens_seen": 18681008, "step": 32190 }, { "epoch": 4.795204051236222, "grad_norm": 1.3415321111679077, "learning_rate": 4.7083810992265696e-05, "loss": 0.6729, "num_input_tokens_seen": 18684112, "step": 32195 }, { "epoch": 4.795948763777182, "grad_norm": 1.608786702156067, "learning_rate": 4.7082287771466705e-05, "loss": 0.601, "num_input_tokens_seen": 18686864, "step": 32200 }, { "epoch": 4.796693476318142, "grad_norm": 1.3593971729278564, "learning_rate": 4.7080764177609914e-05, "loss": 0.6866, "num_input_tokens_seen": 18689840, "step": 32205 }, { "epoch": 4.7974381888591004, "grad_norm": 0.7936270236968994, "learning_rate": 4.7079240210721046e-05, "loss": 0.6837, "num_input_tokens_seen": 18692752, "step": 32210 }, { "epoch": 4.798182901400059, "grad_norm": 1.3580232858657837, "learning_rate": 4.707771587082586e-05, "loss": 0.6373, "num_input_tokens_seen": 18695408, "step": 32215 }, { "epoch": 4.798927613941019, "grad_norm": 0.796360969543457, "learning_rate": 4.70761911579501e-05, "loss": 0.663, "num_input_tokens_seen": 18698224, "step": 32220 }, { "epoch": 4.799672326481978, "grad_norm": 1.7595628499984741, "learning_rate": 4.707466607211953e-05, "loss": 0.643, "num_input_tokens_seen": 18701072, "step": 32225 }, { "epoch": 4.800417039022937, "grad_norm": 0.8886547088623047, "learning_rate": 4.707314061335991e-05, "loss": 0.5569, "num_input_tokens_seen": 18703824, "step": 32230 }, { "epoch": 4.801161751563896, "grad_norm": 0.9570486545562744, "learning_rate": 4.707161478169702e-05, "loss": 0.5947, "num_input_tokens_seen": 18706992, "step": 32235 }, { "epoch": 4.801906464104856, "grad_norm": 1.7474982738494873, "learning_rate": 4.7070088577156625e-05, "loss": 0.6996, "num_input_tokens_seen": 18709968, "step": 32240 }, { "epoch": 4.802651176645814, "grad_norm": 1.1380447149276733, "learning_rate": 4.706856199976451e-05, "loss": 0.5621, "num_input_tokens_seen": 18712816, "step": 32245 }, { "epoch": 4.803395889186774, "grad_norm": 0.9339855909347534, "learning_rate": 4.706703504954647e-05, "loss": 0.627, "num_input_tokens_seen": 18716944, "step": 32250 }, { "epoch": 4.804140601727733, "grad_norm": 1.1798661947250366, "learning_rate": 4.706550772652831e-05, "loss": 0.5442, "num_input_tokens_seen": 18719792, "step": 32255 }, { "epoch": 4.8048853142686925, "grad_norm": 0.6964308619499207, "learning_rate": 4.7063980030735824e-05, "loss": 0.5128, "num_input_tokens_seen": 18722512, "step": 32260 }, { "epoch": 4.805630026809651, "grad_norm": 1.210168480873108, "learning_rate": 4.7062451962194806e-05, "loss": 0.6919, "num_input_tokens_seen": 18725424, "step": 32265 }, { "epoch": 4.806374739350611, "grad_norm": 0.6254410743713379, "learning_rate": 4.70609235209311e-05, "loss": 0.4838, "num_input_tokens_seen": 18728304, "step": 32270 }, { "epoch": 4.80711945189157, "grad_norm": 0.5277701020240784, "learning_rate": 4.705939470697051e-05, "loss": 0.5192, "num_input_tokens_seen": 18731152, "step": 32275 }, { "epoch": 4.807864164432529, "grad_norm": 0.8715654015541077, "learning_rate": 4.7057865520338865e-05, "loss": 0.5833, "num_input_tokens_seen": 18733968, "step": 32280 }, { "epoch": 4.808608876973488, "grad_norm": 1.3158564567565918, "learning_rate": 4.7056335961061994e-05, "loss": 0.6333, "num_input_tokens_seen": 18736816, "step": 32285 }, { "epoch": 4.809353589514448, "grad_norm": 1.0876312255859375, "learning_rate": 4.705480602916575e-05, "loss": 0.6218, "num_input_tokens_seen": 18739696, "step": 32290 }, { "epoch": 4.8100983020554064, "grad_norm": 1.5865373611450195, "learning_rate": 4.705327572467597e-05, "loss": 0.6805, "num_input_tokens_seen": 18742672, "step": 32295 }, { "epoch": 4.810843014596366, "grad_norm": 0.8241294026374817, "learning_rate": 4.705174504761851e-05, "loss": 0.7116, "num_input_tokens_seen": 18745808, "step": 32300 }, { "epoch": 4.811587727137325, "grad_norm": 0.9791670441627502, "learning_rate": 4.705021399801924e-05, "loss": 0.6898, "num_input_tokens_seen": 18748688, "step": 32305 }, { "epoch": 4.8123324396782845, "grad_norm": 1.1025068759918213, "learning_rate": 4.704868257590401e-05, "loss": 0.823, "num_input_tokens_seen": 18751664, "step": 32310 }, { "epoch": 4.813077152219243, "grad_norm": 1.1394709348678589, "learning_rate": 4.7047150781298693e-05, "loss": 0.6665, "num_input_tokens_seen": 18754384, "step": 32315 }, { "epoch": 4.813821864760203, "grad_norm": 0.5070401430130005, "learning_rate": 4.704561861422917e-05, "loss": 0.5301, "num_input_tokens_seen": 18757168, "step": 32320 }, { "epoch": 4.814566577301162, "grad_norm": 1.120245099067688, "learning_rate": 4.704408607472134e-05, "loss": 0.6144, "num_input_tokens_seen": 18760368, "step": 32325 }, { "epoch": 4.815311289842121, "grad_norm": 1.3195970058441162, "learning_rate": 4.704255316280106e-05, "loss": 0.7221, "num_input_tokens_seen": 18763472, "step": 32330 }, { "epoch": 4.81605600238308, "grad_norm": 0.8838299512863159, "learning_rate": 4.704101987849426e-05, "loss": 0.723, "num_input_tokens_seen": 18766352, "step": 32335 }, { "epoch": 4.816800714924039, "grad_norm": 1.0640504360198975, "learning_rate": 4.7039486221826834e-05, "loss": 0.6798, "num_input_tokens_seen": 18769264, "step": 32340 }, { "epoch": 4.8175454274649985, "grad_norm": 1.0445895195007324, "learning_rate": 4.703795219282469e-05, "loss": 0.6643, "num_input_tokens_seen": 18772272, "step": 32345 }, { "epoch": 4.818290140005958, "grad_norm": 0.8771000504493713, "learning_rate": 4.7036417791513735e-05, "loss": 0.636, "num_input_tokens_seen": 18775120, "step": 32350 }, { "epoch": 4.819034852546917, "grad_norm": 2.116168737411499, "learning_rate": 4.7034883017919896e-05, "loss": 0.6353, "num_input_tokens_seen": 18778032, "step": 32355 }, { "epoch": 4.819779565087876, "grad_norm": 0.8098379373550415, "learning_rate": 4.70333478720691e-05, "loss": 0.5419, "num_input_tokens_seen": 18780624, "step": 32360 }, { "epoch": 4.820524277628835, "grad_norm": 1.5451809167861938, "learning_rate": 4.703181235398729e-05, "loss": 0.4748, "num_input_tokens_seen": 18783344, "step": 32365 }, { "epoch": 4.821268990169795, "grad_norm": 0.8299695253372192, "learning_rate": 4.7030276463700405e-05, "loss": 0.5895, "num_input_tokens_seen": 18786128, "step": 32370 }, { "epoch": 4.822013702710754, "grad_norm": 0.8833634257316589, "learning_rate": 4.702874020123439e-05, "loss": 0.6591, "num_input_tokens_seen": 18789008, "step": 32375 }, { "epoch": 4.8227584152517124, "grad_norm": 0.7845536470413208, "learning_rate": 4.70272035666152e-05, "loss": 0.6576, "num_input_tokens_seen": 18792080, "step": 32380 }, { "epoch": 4.823503127792672, "grad_norm": 2.073716878890991, "learning_rate": 4.702566655986879e-05, "loss": 0.8789, "num_input_tokens_seen": 18795056, "step": 32385 }, { "epoch": 4.824247840333631, "grad_norm": 0.9604916572570801, "learning_rate": 4.702412918102113e-05, "loss": 0.7056, "num_input_tokens_seen": 18797648, "step": 32390 }, { "epoch": 4.8249925528745905, "grad_norm": 1.2021745443344116, "learning_rate": 4.702259143009819e-05, "loss": 0.5072, "num_input_tokens_seen": 18800528, "step": 32395 }, { "epoch": 4.825737265415549, "grad_norm": 2.0659561157226562, "learning_rate": 4.702105330712595e-05, "loss": 0.6471, "num_input_tokens_seen": 18803760, "step": 32400 }, { "epoch": 4.826481977956509, "grad_norm": 1.061882734298706, "learning_rate": 4.70195148121304e-05, "loss": 0.6648, "num_input_tokens_seen": 18806928, "step": 32405 }, { "epoch": 4.827226690497468, "grad_norm": 0.866793692111969, "learning_rate": 4.7017975945137524e-05, "loss": 0.8041, "num_input_tokens_seen": 18810096, "step": 32410 }, { "epoch": 4.827971403038427, "grad_norm": 2.8521058559417725, "learning_rate": 4.701643670617333e-05, "loss": 1.0317, "num_input_tokens_seen": 18813264, "step": 32415 }, { "epoch": 4.828716115579386, "grad_norm": 1.6626801490783691, "learning_rate": 4.70148970952638e-05, "loss": 0.6619, "num_input_tokens_seen": 18816240, "step": 32420 }, { "epoch": 4.829460828120346, "grad_norm": 1.1627497673034668, "learning_rate": 4.701335711243497e-05, "loss": 0.8082, "num_input_tokens_seen": 18819056, "step": 32425 }, { "epoch": 4.8302055406613045, "grad_norm": 1.505393385887146, "learning_rate": 4.7011816757712835e-05, "loss": 0.5907, "num_input_tokens_seen": 18822096, "step": 32430 }, { "epoch": 4.830950253202264, "grad_norm": 1.6121214628219604, "learning_rate": 4.701027603112343e-05, "loss": 0.8741, "num_input_tokens_seen": 18825008, "step": 32435 }, { "epoch": 4.831694965743223, "grad_norm": 1.3325698375701904, "learning_rate": 4.7008734932692795e-05, "loss": 0.6829, "num_input_tokens_seen": 18827664, "step": 32440 }, { "epoch": 4.8324396782841825, "grad_norm": 0.8786656260490417, "learning_rate": 4.700719346244694e-05, "loss": 0.5997, "num_input_tokens_seen": 18830832, "step": 32445 }, { "epoch": 4.833184390825141, "grad_norm": 0.8807623982429504, "learning_rate": 4.7005651620411914e-05, "loss": 0.5872, "num_input_tokens_seen": 18833680, "step": 32450 }, { "epoch": 4.833929103366101, "grad_norm": 2.92484974861145, "learning_rate": 4.7004109406613786e-05, "loss": 0.8104, "num_input_tokens_seen": 18836752, "step": 32455 }, { "epoch": 4.83467381590706, "grad_norm": 1.109075903892517, "learning_rate": 4.700256682107858e-05, "loss": 0.5053, "num_input_tokens_seen": 18839568, "step": 32460 }, { "epoch": 4.835418528448019, "grad_norm": 1.599335789680481, "learning_rate": 4.700102386383237e-05, "loss": 0.7152, "num_input_tokens_seen": 18842608, "step": 32465 }, { "epoch": 4.836163240988978, "grad_norm": 1.0645735263824463, "learning_rate": 4.699948053490123e-05, "loss": 0.5773, "num_input_tokens_seen": 18845904, "step": 32470 }, { "epoch": 4.836907953529938, "grad_norm": 0.9645915031433105, "learning_rate": 4.699793683431122e-05, "loss": 0.723, "num_input_tokens_seen": 18848880, "step": 32475 }, { "epoch": 4.8376526660708965, "grad_norm": 1.2782301902770996, "learning_rate": 4.699639276208843e-05, "loss": 0.6212, "num_input_tokens_seen": 18851824, "step": 32480 }, { "epoch": 4.838397378611856, "grad_norm": 1.1851712465286255, "learning_rate": 4.699484831825894e-05, "loss": 0.708, "num_input_tokens_seen": 18854672, "step": 32485 }, { "epoch": 4.839142091152815, "grad_norm": 0.971295177936554, "learning_rate": 4.699330350284884e-05, "loss": 0.6302, "num_input_tokens_seen": 18857712, "step": 32490 }, { "epoch": 4.8398868036937746, "grad_norm": 1.0332729816436768, "learning_rate": 4.6991758315884225e-05, "loss": 0.5087, "num_input_tokens_seen": 18860528, "step": 32495 }, { "epoch": 4.840631516234733, "grad_norm": 0.7364435195922852, "learning_rate": 4.699021275739121e-05, "loss": 0.632, "num_input_tokens_seen": 18863312, "step": 32500 }, { "epoch": 4.841376228775693, "grad_norm": 0.6610292792320251, "learning_rate": 4.69886668273959e-05, "loss": 0.5169, "num_input_tokens_seen": 18866192, "step": 32505 }, { "epoch": 4.842120941316652, "grad_norm": 2.076927900314331, "learning_rate": 4.698712052592441e-05, "loss": 0.7583, "num_input_tokens_seen": 18869072, "step": 32510 }, { "epoch": 4.842865653857611, "grad_norm": 1.0415081977844238, "learning_rate": 4.6985573853002875e-05, "loss": 0.5548, "num_input_tokens_seen": 18872176, "step": 32515 }, { "epoch": 4.84361036639857, "grad_norm": 1.5990445613861084, "learning_rate": 4.698402680865741e-05, "loss": 0.6701, "num_input_tokens_seen": 18874896, "step": 32520 }, { "epoch": 4.844355078939529, "grad_norm": 0.8993496298789978, "learning_rate": 4.6982479392914144e-05, "loss": 0.5819, "num_input_tokens_seen": 18877552, "step": 32525 }, { "epoch": 4.8450997914804885, "grad_norm": 1.9897701740264893, "learning_rate": 4.698093160579924e-05, "loss": 0.7839, "num_input_tokens_seen": 18880592, "step": 32530 }, { "epoch": 4.845844504021448, "grad_norm": 0.7565290331840515, "learning_rate": 4.697938344733884e-05, "loss": 0.6779, "num_input_tokens_seen": 18883344, "step": 32535 }, { "epoch": 4.846589216562407, "grad_norm": 1.0279371738433838, "learning_rate": 4.6977834917559095e-05, "loss": 0.694, "num_input_tokens_seen": 18886512, "step": 32540 }, { "epoch": 4.847333929103366, "grad_norm": 1.041277527809143, "learning_rate": 4.697628601648616e-05, "loss": 0.6633, "num_input_tokens_seen": 18889616, "step": 32545 }, { "epoch": 4.848078641644325, "grad_norm": 0.8599822521209717, "learning_rate": 4.697473674414621e-05, "loss": 0.5385, "num_input_tokens_seen": 18892400, "step": 32550 }, { "epoch": 4.848823354185284, "grad_norm": 0.8506945371627808, "learning_rate": 4.697318710056542e-05, "loss": 0.5086, "num_input_tokens_seen": 18895184, "step": 32555 }, { "epoch": 4.849568066726244, "grad_norm": 1.0353152751922607, "learning_rate": 4.697163708576997e-05, "loss": 0.7453, "num_input_tokens_seen": 18898000, "step": 32560 }, { "epoch": 4.8503127792672025, "grad_norm": 0.8247453570365906, "learning_rate": 4.697008669978603e-05, "loss": 0.5626, "num_input_tokens_seen": 18900944, "step": 32565 }, { "epoch": 4.851057491808162, "grad_norm": 0.9589558839797974, "learning_rate": 4.696853594263981e-05, "loss": 0.6257, "num_input_tokens_seen": 18904112, "step": 32570 }, { "epoch": 4.851802204349121, "grad_norm": 1.5811270475387573, "learning_rate": 4.6966984814357515e-05, "loss": 0.5997, "num_input_tokens_seen": 18907024, "step": 32575 }, { "epoch": 4.8525469168900806, "grad_norm": 0.9213332533836365, "learning_rate": 4.6965433314965325e-05, "loss": 0.5613, "num_input_tokens_seen": 18909968, "step": 32580 }, { "epoch": 4.853291629431039, "grad_norm": 0.9697911739349365, "learning_rate": 4.6963881444489464e-05, "loss": 0.5146, "num_input_tokens_seen": 18912848, "step": 32585 }, { "epoch": 4.854036341971999, "grad_norm": 1.1089802980422974, "learning_rate": 4.696232920295616e-05, "loss": 0.7253, "num_input_tokens_seen": 18915504, "step": 32590 }, { "epoch": 4.854781054512958, "grad_norm": 0.7415025234222412, "learning_rate": 4.696077659039161e-05, "loss": 0.6231, "num_input_tokens_seen": 18918064, "step": 32595 }, { "epoch": 4.855525767053917, "grad_norm": 1.2369087934494019, "learning_rate": 4.6959223606822066e-05, "loss": 0.6441, "num_input_tokens_seen": 18921072, "step": 32600 }, { "epoch": 4.856270479594876, "grad_norm": 1.6370470523834229, "learning_rate": 4.695767025227376e-05, "loss": 0.6686, "num_input_tokens_seen": 18924016, "step": 32605 }, { "epoch": 4.857015192135836, "grad_norm": 1.861082911491394, "learning_rate": 4.6956116526772934e-05, "loss": 0.6601, "num_input_tokens_seen": 18926768, "step": 32610 }, { "epoch": 4.8577599046767945, "grad_norm": 0.6531727313995361, "learning_rate": 4.6954562430345825e-05, "loss": 0.5912, "num_input_tokens_seen": 18929296, "step": 32615 }, { "epoch": 4.858504617217754, "grad_norm": 0.7202669382095337, "learning_rate": 4.695300796301871e-05, "loss": 0.7118, "num_input_tokens_seen": 18932624, "step": 32620 }, { "epoch": 4.859249329758713, "grad_norm": 1.7378551959991455, "learning_rate": 4.695145312481783e-05, "loss": 0.7433, "num_input_tokens_seen": 18935536, "step": 32625 }, { "epoch": 4.859994042299673, "grad_norm": 1.0003647804260254, "learning_rate": 4.694989791576946e-05, "loss": 0.6265, "num_input_tokens_seen": 18938480, "step": 32630 }, { "epoch": 4.860738754840631, "grad_norm": 1.8089966773986816, "learning_rate": 4.6948342335899874e-05, "loss": 0.6189, "num_input_tokens_seen": 18941328, "step": 32635 }, { "epoch": 4.861483467381591, "grad_norm": 2.106377363204956, "learning_rate": 4.694678638523535e-05, "loss": 0.7416, "num_input_tokens_seen": 18944432, "step": 32640 }, { "epoch": 4.86222817992255, "grad_norm": 1.4069205522537231, "learning_rate": 4.694523006380218e-05, "loss": 0.6784, "num_input_tokens_seen": 18947088, "step": 32645 }, { "epoch": 4.862972892463509, "grad_norm": 1.2941983938217163, "learning_rate": 4.694367337162665e-05, "loss": 0.7445, "num_input_tokens_seen": 18949968, "step": 32650 }, { "epoch": 4.863717605004468, "grad_norm": 0.759034276008606, "learning_rate": 4.694211630873506e-05, "loss": 0.7355, "num_input_tokens_seen": 18952880, "step": 32655 }, { "epoch": 4.864462317545428, "grad_norm": 0.898418664932251, "learning_rate": 4.694055887515372e-05, "loss": 0.5697, "num_input_tokens_seen": 18955824, "step": 32660 }, { "epoch": 4.8652070300863866, "grad_norm": 1.0078511238098145, "learning_rate": 4.6939001070908925e-05, "loss": 0.6377, "num_input_tokens_seen": 18959088, "step": 32665 }, { "epoch": 4.865951742627346, "grad_norm": 1.4333832263946533, "learning_rate": 4.693744289602702e-05, "loss": 0.7609, "num_input_tokens_seen": 18961808, "step": 32670 }, { "epoch": 4.866696455168305, "grad_norm": 1.2015801668167114, "learning_rate": 4.693588435053431e-05, "loss": 0.5513, "num_input_tokens_seen": 18964528, "step": 32675 }, { "epoch": 4.867441167709265, "grad_norm": 0.9745715260505676, "learning_rate": 4.693432543445712e-05, "loss": 0.6335, "num_input_tokens_seen": 18967280, "step": 32680 }, { "epoch": 4.868185880250223, "grad_norm": 1.4122880697250366, "learning_rate": 4.6932766147821804e-05, "loss": 0.466, "num_input_tokens_seen": 18969936, "step": 32685 }, { "epoch": 4.868930592791182, "grad_norm": 1.066638708114624, "learning_rate": 4.693120649065469e-05, "loss": 0.704, "num_input_tokens_seen": 18972752, "step": 32690 }, { "epoch": 4.869675305332142, "grad_norm": 1.7753970623016357, "learning_rate": 4.6929646462982135e-05, "loss": 0.7424, "num_input_tokens_seen": 18975536, "step": 32695 }, { "epoch": 4.870420017873101, "grad_norm": 0.9661994576454163, "learning_rate": 4.692808606483049e-05, "loss": 0.6971, "num_input_tokens_seen": 18978192, "step": 32700 }, { "epoch": 4.87116473041406, "grad_norm": 0.7835083603858948, "learning_rate": 4.692652529622612e-05, "loss": 0.5911, "num_input_tokens_seen": 18981136, "step": 32705 }, { "epoch": 4.871909442955019, "grad_norm": 1.1786599159240723, "learning_rate": 4.692496415719539e-05, "loss": 0.4983, "num_input_tokens_seen": 18984048, "step": 32710 }, { "epoch": 4.872654155495979, "grad_norm": 1.019026756286621, "learning_rate": 4.692340264776467e-05, "loss": 0.4743, "num_input_tokens_seen": 18986768, "step": 32715 }, { "epoch": 4.873398868036938, "grad_norm": 0.8571943044662476, "learning_rate": 4.6921840767960346e-05, "loss": 0.6373, "num_input_tokens_seen": 18989456, "step": 32720 }, { "epoch": 4.874143580577897, "grad_norm": 0.7091377377510071, "learning_rate": 4.69202785178088e-05, "loss": 0.5922, "num_input_tokens_seen": 18992464, "step": 32725 }, { "epoch": 4.874888293118856, "grad_norm": 1.5153443813323975, "learning_rate": 4.6918715897336434e-05, "loss": 0.6535, "num_input_tokens_seen": 18995312, "step": 32730 }, { "epoch": 4.875633005659815, "grad_norm": 0.8179867267608643, "learning_rate": 4.691715290656964e-05, "loss": 0.6076, "num_input_tokens_seen": 18998384, "step": 32735 }, { "epoch": 4.876377718200774, "grad_norm": 1.2691458463668823, "learning_rate": 4.6915589545534814e-05, "loss": 0.6357, "num_input_tokens_seen": 19001488, "step": 32740 }, { "epoch": 4.877122430741734, "grad_norm": 1.9884259700775146, "learning_rate": 4.691402581425839e-05, "loss": 0.6806, "num_input_tokens_seen": 19004560, "step": 32745 }, { "epoch": 4.8778671432826926, "grad_norm": 0.8524148464202881, "learning_rate": 4.691246171276676e-05, "loss": 0.6511, "num_input_tokens_seen": 19007504, "step": 32750 }, { "epoch": 4.878611855823652, "grad_norm": 1.0497605800628662, "learning_rate": 4.691089724108636e-05, "loss": 0.6209, "num_input_tokens_seen": 19010224, "step": 32755 }, { "epoch": 4.879356568364611, "grad_norm": 1.2090872526168823, "learning_rate": 4.6909332399243636e-05, "loss": 0.5858, "num_input_tokens_seen": 19012752, "step": 32760 }, { "epoch": 4.880101280905571, "grad_norm": 1.3004889488220215, "learning_rate": 4.690776718726499e-05, "loss": 0.5341, "num_input_tokens_seen": 19015952, "step": 32765 }, { "epoch": 4.880845993446529, "grad_norm": 1.5941874980926514, "learning_rate": 4.690620160517689e-05, "loss": 0.7731, "num_input_tokens_seen": 19018768, "step": 32770 }, { "epoch": 4.881590705987489, "grad_norm": 0.7530089020729065, "learning_rate": 4.690463565300579e-05, "loss": 0.6518, "num_input_tokens_seen": 19021360, "step": 32775 }, { "epoch": 4.882335418528448, "grad_norm": 1.1692008972167969, "learning_rate": 4.690306933077811e-05, "loss": 0.7385, "num_input_tokens_seen": 19024080, "step": 32780 }, { "epoch": 4.883080131069407, "grad_norm": 1.3383793830871582, "learning_rate": 4.6901502638520355e-05, "loss": 0.6537, "num_input_tokens_seen": 19026864, "step": 32785 }, { "epoch": 4.883824843610366, "grad_norm": 0.7511748671531677, "learning_rate": 4.689993557625897e-05, "loss": 0.6472, "num_input_tokens_seen": 19029648, "step": 32790 }, { "epoch": 4.884569556151326, "grad_norm": 1.4252935647964478, "learning_rate": 4.689836814402042e-05, "loss": 0.7917, "num_input_tokens_seen": 19032592, "step": 32795 }, { "epoch": 4.885314268692285, "grad_norm": 1.602649450302124, "learning_rate": 4.689680034183121e-05, "loss": 0.6374, "num_input_tokens_seen": 19035568, "step": 32800 }, { "epoch": 4.886058981233244, "grad_norm": 1.3096394538879395, "learning_rate": 4.689523216971781e-05, "loss": 0.7349, "num_input_tokens_seen": 19038384, "step": 32805 }, { "epoch": 4.886803693774203, "grad_norm": 1.0084974765777588, "learning_rate": 4.689366362770671e-05, "loss": 0.8338, "num_input_tokens_seen": 19041264, "step": 32810 }, { "epoch": 4.887548406315163, "grad_norm": 0.6644324660301208, "learning_rate": 4.689209471582442e-05, "loss": 0.6394, "num_input_tokens_seen": 19044208, "step": 32815 }, { "epoch": 4.888293118856121, "grad_norm": 1.290706753730774, "learning_rate": 4.689052543409743e-05, "loss": 0.7411, "num_input_tokens_seen": 19047408, "step": 32820 }, { "epoch": 4.889037831397081, "grad_norm": 0.6506419777870178, "learning_rate": 4.6888955782552274e-05, "loss": 0.5351, "num_input_tokens_seen": 19050032, "step": 32825 }, { "epoch": 4.88978254393804, "grad_norm": 1.2989283800125122, "learning_rate": 4.688738576121545e-05, "loss": 0.6399, "num_input_tokens_seen": 19053104, "step": 32830 }, { "epoch": 4.890527256478999, "grad_norm": 1.2428921461105347, "learning_rate": 4.68858153701135e-05, "loss": 0.9232, "num_input_tokens_seen": 19055792, "step": 32835 }, { "epoch": 4.891271969019958, "grad_norm": 0.8092052936553955, "learning_rate": 4.688424460927293e-05, "loss": 0.704, "num_input_tokens_seen": 19058800, "step": 32840 }, { "epoch": 4.892016681560918, "grad_norm": 2.111218214035034, "learning_rate": 4.688267347872029e-05, "loss": 0.8974, "num_input_tokens_seen": 19061488, "step": 32845 }, { "epoch": 4.892761394101877, "grad_norm": 0.9278725981712341, "learning_rate": 4.6881101978482124e-05, "loss": 0.6294, "num_input_tokens_seen": 19064656, "step": 32850 }, { "epoch": 4.893506106642836, "grad_norm": 0.7445558309555054, "learning_rate": 4.687953010858498e-05, "loss": 0.5516, "num_input_tokens_seen": 19067632, "step": 32855 }, { "epoch": 4.894250819183795, "grad_norm": 1.00894033908844, "learning_rate": 4.6877957869055414e-05, "loss": 0.5362, "num_input_tokens_seen": 19070320, "step": 32860 }, { "epoch": 4.894995531724755, "grad_norm": 1.5069115161895752, "learning_rate": 4.6876385259919984e-05, "loss": 0.7909, "num_input_tokens_seen": 19073424, "step": 32865 }, { "epoch": 4.895740244265713, "grad_norm": 1.214467167854309, "learning_rate": 4.687481228120526e-05, "loss": 0.6502, "num_input_tokens_seen": 19076400, "step": 32870 }, { "epoch": 4.896484956806672, "grad_norm": 1.3228604793548584, "learning_rate": 4.687323893293781e-05, "loss": 0.598, "num_input_tokens_seen": 19079152, "step": 32875 }, { "epoch": 4.897229669347632, "grad_norm": 1.5217719078063965, "learning_rate": 4.687166521514423e-05, "loss": 0.7135, "num_input_tokens_seen": 19081936, "step": 32880 }, { "epoch": 4.8979743818885915, "grad_norm": 1.019296407699585, "learning_rate": 4.687009112785109e-05, "loss": 0.6503, "num_input_tokens_seen": 19084656, "step": 32885 }, { "epoch": 4.89871909442955, "grad_norm": 2.4414281845092773, "learning_rate": 4.686851667108499e-05, "loss": 0.8263, "num_input_tokens_seen": 19087696, "step": 32890 }, { "epoch": 4.899463806970509, "grad_norm": 0.7456009984016418, "learning_rate": 4.686694184487253e-05, "loss": 0.5755, "num_input_tokens_seen": 19090672, "step": 32895 }, { "epoch": 4.900208519511469, "grad_norm": 0.7876120209693909, "learning_rate": 4.68653666492403e-05, "loss": 0.7016, "num_input_tokens_seen": 19093392, "step": 32900 }, { "epoch": 4.900953232052427, "grad_norm": 1.3072245121002197, "learning_rate": 4.686379108421493e-05, "loss": 0.7413, "num_input_tokens_seen": 19096112, "step": 32905 }, { "epoch": 4.901697944593387, "grad_norm": 0.9152738451957703, "learning_rate": 4.686221514982303e-05, "loss": 0.7839, "num_input_tokens_seen": 19098960, "step": 32910 }, { "epoch": 4.902442657134346, "grad_norm": 0.8386508822441101, "learning_rate": 4.686063884609122e-05, "loss": 0.5539, "num_input_tokens_seen": 19101680, "step": 32915 }, { "epoch": 4.903187369675305, "grad_norm": 1.3748455047607422, "learning_rate": 4.685906217304615e-05, "loss": 0.771, "num_input_tokens_seen": 19104272, "step": 32920 }, { "epoch": 4.903932082216264, "grad_norm": 1.8334441184997559, "learning_rate": 4.685748513071443e-05, "loss": 0.538, "num_input_tokens_seen": 19107216, "step": 32925 }, { "epoch": 4.904676794757224, "grad_norm": 0.9551135897636414, "learning_rate": 4.685590771912272e-05, "loss": 0.4852, "num_input_tokens_seen": 19110064, "step": 32930 }, { "epoch": 4.905421507298183, "grad_norm": 0.9624393582344055, "learning_rate": 4.685432993829765e-05, "loss": 0.5472, "num_input_tokens_seen": 19113072, "step": 32935 }, { "epoch": 4.906166219839142, "grad_norm": 0.6972488164901733, "learning_rate": 4.6852751788265895e-05, "loss": 0.6116, "num_input_tokens_seen": 19116048, "step": 32940 }, { "epoch": 4.906910932380101, "grad_norm": 0.9304494261741638, "learning_rate": 4.6851173269054116e-05, "loss": 0.6061, "num_input_tokens_seen": 19119152, "step": 32945 }, { "epoch": 4.907655644921061, "grad_norm": 0.8311294317245483, "learning_rate": 4.6849594380688966e-05, "loss": 0.5136, "num_input_tokens_seen": 19121968, "step": 32950 }, { "epoch": 4.908400357462019, "grad_norm": 1.024350881576538, "learning_rate": 4.684801512319712e-05, "loss": 0.662, "num_input_tokens_seen": 19124784, "step": 32955 }, { "epoch": 4.909145070002979, "grad_norm": 1.217550277709961, "learning_rate": 4.6846435496605275e-05, "loss": 0.5371, "num_input_tokens_seen": 19127888, "step": 32960 }, { "epoch": 4.909889782543938, "grad_norm": 1.3996400833129883, "learning_rate": 4.6844855500940096e-05, "loss": 0.6246, "num_input_tokens_seen": 19130960, "step": 32965 }, { "epoch": 4.9106344950848975, "grad_norm": 1.1829285621643066, "learning_rate": 4.684327513622829e-05, "loss": 0.7531, "num_input_tokens_seen": 19133872, "step": 32970 }, { "epoch": 4.911379207625856, "grad_norm": 0.8517496585845947, "learning_rate": 4.684169440249656e-05, "loss": 0.7082, "num_input_tokens_seen": 19136752, "step": 32975 }, { "epoch": 4.912123920166816, "grad_norm": 1.0066708326339722, "learning_rate": 4.684011329977159e-05, "loss": 0.7987, "num_input_tokens_seen": 19139440, "step": 32980 }, { "epoch": 4.912868632707775, "grad_norm": 0.6663056015968323, "learning_rate": 4.6838531828080104e-05, "loss": 0.6226, "num_input_tokens_seen": 19142544, "step": 32985 }, { "epoch": 4.913613345248734, "grad_norm": 1.048757553100586, "learning_rate": 4.6836949987448824e-05, "loss": 0.4372, "num_input_tokens_seen": 19145360, "step": 32990 }, { "epoch": 4.914358057789693, "grad_norm": 0.8104901909828186, "learning_rate": 4.6835367777904466e-05, "loss": 0.5718, "num_input_tokens_seen": 19149008, "step": 32995 }, { "epoch": 4.915102770330653, "grad_norm": 1.1991310119628906, "learning_rate": 4.6833785199473756e-05, "loss": 0.6948, "num_input_tokens_seen": 19151920, "step": 33000 }, { "epoch": 4.915847482871611, "grad_norm": 0.8108101487159729, "learning_rate": 4.683220225218344e-05, "loss": 0.706, "num_input_tokens_seen": 19154864, "step": 33005 }, { "epoch": 4.916592195412571, "grad_norm": 1.9491815567016602, "learning_rate": 4.683061893606026e-05, "loss": 0.7682, "num_input_tokens_seen": 19157744, "step": 33010 }, { "epoch": 4.91733690795353, "grad_norm": 4.01624870300293, "learning_rate": 4.682903525113096e-05, "loss": 0.7002, "num_input_tokens_seen": 19160752, "step": 33015 }, { "epoch": 4.9180816204944895, "grad_norm": 0.7771396040916443, "learning_rate": 4.682745119742229e-05, "loss": 0.6271, "num_input_tokens_seen": 19163824, "step": 33020 }, { "epoch": 4.918826333035448, "grad_norm": 1.1372863054275513, "learning_rate": 4.682586677496102e-05, "loss": 0.692, "num_input_tokens_seen": 19166672, "step": 33025 }, { "epoch": 4.919571045576408, "grad_norm": 1.268553614616394, "learning_rate": 4.6824281983773914e-05, "loss": 0.6753, "num_input_tokens_seen": 19169584, "step": 33030 }, { "epoch": 4.920315758117367, "grad_norm": 1.8256500959396362, "learning_rate": 4.682269682388775e-05, "loss": 0.7796, "num_input_tokens_seen": 19172720, "step": 33035 }, { "epoch": 4.921060470658325, "grad_norm": 0.9494181275367737, "learning_rate": 4.6821111295329294e-05, "loss": 0.7125, "num_input_tokens_seen": 19175568, "step": 33040 }, { "epoch": 4.921805183199285, "grad_norm": 1.068182110786438, "learning_rate": 4.681952539812534e-05, "loss": 0.837, "num_input_tokens_seen": 19178704, "step": 33045 }, { "epoch": 4.922549895740245, "grad_norm": 0.7756941914558411, "learning_rate": 4.681793913230269e-05, "loss": 0.5985, "num_input_tokens_seen": 19181776, "step": 33050 }, { "epoch": 4.9232946082812035, "grad_norm": 1.0683207511901855, "learning_rate": 4.6816352497888125e-05, "loss": 0.6101, "num_input_tokens_seen": 19184592, "step": 33055 }, { "epoch": 4.924039320822162, "grad_norm": 0.6978617310523987, "learning_rate": 4.6814765494908465e-05, "loss": 0.6685, "num_input_tokens_seen": 19187600, "step": 33060 }, { "epoch": 4.924784033363122, "grad_norm": 1.1771080493927002, "learning_rate": 4.681317812339051e-05, "loss": 0.7106, "num_input_tokens_seen": 19190224, "step": 33065 }, { "epoch": 4.9255287459040815, "grad_norm": 1.5636845827102661, "learning_rate": 4.681159038336108e-05, "loss": 0.5393, "num_input_tokens_seen": 19193392, "step": 33070 }, { "epoch": 4.92627345844504, "grad_norm": 1.2352356910705566, "learning_rate": 4.6810002274847e-05, "loss": 0.753, "num_input_tokens_seen": 19196336, "step": 33075 }, { "epoch": 4.927018170985999, "grad_norm": 0.9859482645988464, "learning_rate": 4.680841379787509e-05, "loss": 0.6555, "num_input_tokens_seen": 19199184, "step": 33080 }, { "epoch": 4.927762883526959, "grad_norm": 1.5993980169296265, "learning_rate": 4.6806824952472204e-05, "loss": 0.5708, "num_input_tokens_seen": 19202064, "step": 33085 }, { "epoch": 4.928507596067917, "grad_norm": 0.7443118691444397, "learning_rate": 4.6805235738665164e-05, "loss": 0.581, "num_input_tokens_seen": 19205072, "step": 33090 }, { "epoch": 4.929252308608877, "grad_norm": 1.3386874198913574, "learning_rate": 4.680364615648084e-05, "loss": 0.561, "num_input_tokens_seen": 19207792, "step": 33095 }, { "epoch": 4.929997021149836, "grad_norm": 1.2860509157180786, "learning_rate": 4.680205620594606e-05, "loss": 0.6262, "num_input_tokens_seen": 19210480, "step": 33100 }, { "epoch": 4.9307417336907955, "grad_norm": 1.2674927711486816, "learning_rate": 4.680046588708772e-05, "loss": 0.6758, "num_input_tokens_seen": 19213360, "step": 33105 }, { "epoch": 4.931486446231754, "grad_norm": 0.9813487529754639, "learning_rate": 4.679887519993265e-05, "loss": 0.7894, "num_input_tokens_seen": 19216112, "step": 33110 }, { "epoch": 4.932231158772714, "grad_norm": 1.074601173400879, "learning_rate": 4.679728414450774e-05, "loss": 0.5819, "num_input_tokens_seen": 19219024, "step": 33115 }, { "epoch": 4.932975871313673, "grad_norm": 1.141978144645691, "learning_rate": 4.679569272083987e-05, "loss": 0.6722, "num_input_tokens_seen": 19221680, "step": 33120 }, { "epoch": 4.933720583854632, "grad_norm": 1.409303903579712, "learning_rate": 4.6794100928955934e-05, "loss": 0.7549, "num_input_tokens_seen": 19224336, "step": 33125 }, { "epoch": 4.934465296395591, "grad_norm": 1.8155560493469238, "learning_rate": 4.67925087688828e-05, "loss": 0.7383, "num_input_tokens_seen": 19227120, "step": 33130 }, { "epoch": 4.935210008936551, "grad_norm": 1.32412850856781, "learning_rate": 4.679091624064738e-05, "loss": 0.632, "num_input_tokens_seen": 19230128, "step": 33135 }, { "epoch": 4.9359547214775095, "grad_norm": 1.5944105386734009, "learning_rate": 4.678932334427658e-05, "loss": 0.6494, "num_input_tokens_seen": 19232944, "step": 33140 }, { "epoch": 4.936699434018469, "grad_norm": 1.1048896312713623, "learning_rate": 4.678773007979731e-05, "loss": 0.7681, "num_input_tokens_seen": 19235664, "step": 33145 }, { "epoch": 4.937444146559428, "grad_norm": 1.1138445138931274, "learning_rate": 4.678613644723649e-05, "loss": 0.7529, "num_input_tokens_seen": 19238608, "step": 33150 }, { "epoch": 4.9381888591003875, "grad_norm": 0.7840583920478821, "learning_rate": 4.6784542446621026e-05, "loss": 0.6026, "num_input_tokens_seen": 19241584, "step": 33155 }, { "epoch": 4.938933571641346, "grad_norm": 0.9068744778633118, "learning_rate": 4.678294807797786e-05, "loss": 0.5952, "num_input_tokens_seen": 19244496, "step": 33160 }, { "epoch": 4.939678284182306, "grad_norm": 1.1448729038238525, "learning_rate": 4.6781353341333926e-05, "loss": 0.7598, "num_input_tokens_seen": 19247664, "step": 33165 }, { "epoch": 4.940422996723265, "grad_norm": 0.9813085198402405, "learning_rate": 4.6779758236716165e-05, "loss": 0.5896, "num_input_tokens_seen": 19250608, "step": 33170 }, { "epoch": 4.941167709264224, "grad_norm": 0.7867771983146667, "learning_rate": 4.677816276415153e-05, "loss": 0.5674, "num_input_tokens_seen": 19253776, "step": 33175 }, { "epoch": 4.941912421805183, "grad_norm": 0.7593666911125183, "learning_rate": 4.677656692366696e-05, "loss": 0.5726, "num_input_tokens_seen": 19256400, "step": 33180 }, { "epoch": 4.942657134346143, "grad_norm": 2.21781849861145, "learning_rate": 4.677497071528944e-05, "loss": 0.5928, "num_input_tokens_seen": 19259152, "step": 33185 }, { "epoch": 4.9434018468871015, "grad_norm": 1.9196778535842896, "learning_rate": 4.67733741390459e-05, "loss": 0.7699, "num_input_tokens_seen": 19261968, "step": 33190 }, { "epoch": 4.944146559428061, "grad_norm": 1.1220413446426392, "learning_rate": 4.677177719496335e-05, "loss": 0.5708, "num_input_tokens_seen": 19265136, "step": 33195 }, { "epoch": 4.94489127196902, "grad_norm": 1.669187307357788, "learning_rate": 4.677017988306874e-05, "loss": 0.6348, "num_input_tokens_seen": 19268080, "step": 33200 }, { "epoch": 4.945635984509979, "grad_norm": 1.3831310272216797, "learning_rate": 4.676858220338908e-05, "loss": 0.7021, "num_input_tokens_seen": 19270960, "step": 33205 }, { "epoch": 4.946380697050938, "grad_norm": 1.3898998498916626, "learning_rate": 4.676698415595134e-05, "loss": 0.721, "num_input_tokens_seen": 19273616, "step": 33210 }, { "epoch": 4.947125409591898, "grad_norm": 0.9202361106872559, "learning_rate": 4.676538574078253e-05, "loss": 0.6632, "num_input_tokens_seen": 19276400, "step": 33215 }, { "epoch": 4.947870122132857, "grad_norm": 1.0539013147354126, "learning_rate": 4.676378695790964e-05, "loss": 0.6587, "num_input_tokens_seen": 19279056, "step": 33220 }, { "epoch": 4.9486148346738155, "grad_norm": 1.4655349254608154, "learning_rate": 4.67621878073597e-05, "loss": 0.6056, "num_input_tokens_seen": 19281904, "step": 33225 }, { "epoch": 4.949359547214775, "grad_norm": 0.6279914975166321, "learning_rate": 4.676058828915971e-05, "loss": 0.559, "num_input_tokens_seen": 19284784, "step": 33230 }, { "epoch": 4.950104259755735, "grad_norm": 1.3338992595672607, "learning_rate": 4.67589884033367e-05, "loss": 0.7034, "num_input_tokens_seen": 19287952, "step": 33235 }, { "epoch": 4.9508489722966935, "grad_norm": 1.2029467821121216, "learning_rate": 4.675738814991769e-05, "loss": 0.751, "num_input_tokens_seen": 19290896, "step": 33240 }, { "epoch": 4.951593684837652, "grad_norm": 1.4435895681381226, "learning_rate": 4.6755787528929726e-05, "loss": 0.6228, "num_input_tokens_seen": 19293648, "step": 33245 }, { "epoch": 4.952338397378612, "grad_norm": 0.6410921812057495, "learning_rate": 4.675418654039984e-05, "loss": 0.5726, "num_input_tokens_seen": 19296592, "step": 33250 }, { "epoch": 4.953083109919571, "grad_norm": 1.2916697263717651, "learning_rate": 4.6752585184355084e-05, "loss": 0.6054, "num_input_tokens_seen": 19299504, "step": 33255 }, { "epoch": 4.95382782246053, "grad_norm": 1.3972159624099731, "learning_rate": 4.675098346082251e-05, "loss": 0.7206, "num_input_tokens_seen": 19302704, "step": 33260 }, { "epoch": 4.954572535001489, "grad_norm": 2.025561571121216, "learning_rate": 4.674938136982918e-05, "loss": 0.6272, "num_input_tokens_seen": 19305936, "step": 33265 }, { "epoch": 4.955317247542449, "grad_norm": 0.970517098903656, "learning_rate": 4.674777891140215e-05, "loss": 0.7214, "num_input_tokens_seen": 19308720, "step": 33270 }, { "epoch": 4.9560619600834075, "grad_norm": 1.2395551204681396, "learning_rate": 4.6746176085568506e-05, "loss": 0.5793, "num_input_tokens_seen": 19311440, "step": 33275 }, { "epoch": 4.956806672624367, "grad_norm": 1.1782214641571045, "learning_rate": 4.674457289235531e-05, "loss": 0.7343, "num_input_tokens_seen": 19314768, "step": 33280 }, { "epoch": 4.957551385165326, "grad_norm": 1.0461838245391846, "learning_rate": 4.674296933178967e-05, "loss": 0.6414, "num_input_tokens_seen": 19317584, "step": 33285 }, { "epoch": 4.9582960977062855, "grad_norm": 0.7113488912582397, "learning_rate": 4.674136540389864e-05, "loss": 0.4822, "num_input_tokens_seen": 19320496, "step": 33290 }, { "epoch": 4.959040810247244, "grad_norm": 0.9612667560577393, "learning_rate": 4.6739761108709356e-05, "loss": 0.7532, "num_input_tokens_seen": 19323760, "step": 33295 }, { "epoch": 4.959785522788204, "grad_norm": 1.498666524887085, "learning_rate": 4.673815644624889e-05, "loss": 0.6701, "num_input_tokens_seen": 19326512, "step": 33300 }, { "epoch": 4.960530235329163, "grad_norm": 1.0333902835845947, "learning_rate": 4.673655141654438e-05, "loss": 0.5587, "num_input_tokens_seen": 19329200, "step": 33305 }, { "epoch": 4.961274947870122, "grad_norm": 0.7524208426475525, "learning_rate": 4.673494601962292e-05, "loss": 0.5854, "num_input_tokens_seen": 19331920, "step": 33310 }, { "epoch": 4.962019660411081, "grad_norm": 1.7280852794647217, "learning_rate": 4.673334025551164e-05, "loss": 0.7208, "num_input_tokens_seen": 19334608, "step": 33315 }, { "epoch": 4.962764372952041, "grad_norm": 1.9561458826065063, "learning_rate": 4.6731734124237654e-05, "loss": 0.6058, "num_input_tokens_seen": 19337456, "step": 33320 }, { "epoch": 4.9635090854929995, "grad_norm": 1.1508893966674805, "learning_rate": 4.6730127625828113e-05, "loss": 0.6901, "num_input_tokens_seen": 19340528, "step": 33325 }, { "epoch": 4.964253798033959, "grad_norm": 1.139297604560852, "learning_rate": 4.672852076031015e-05, "loss": 0.7533, "num_input_tokens_seen": 19343248, "step": 33330 }, { "epoch": 4.964998510574918, "grad_norm": 1.416061282157898, "learning_rate": 4.6726913527710915e-05, "loss": 0.6559, "num_input_tokens_seen": 19345776, "step": 33335 }, { "epoch": 4.965743223115878, "grad_norm": 1.0533483028411865, "learning_rate": 4.672530592805756e-05, "loss": 0.702, "num_input_tokens_seen": 19348624, "step": 33340 }, { "epoch": 4.966487935656836, "grad_norm": 0.7607859373092651, "learning_rate": 4.672369796137724e-05, "loss": 0.6333, "num_input_tokens_seen": 19351664, "step": 33345 }, { "epoch": 4.967232648197796, "grad_norm": 0.7486453056335449, "learning_rate": 4.672208962769713e-05, "loss": 0.5479, "num_input_tokens_seen": 19354128, "step": 33350 }, { "epoch": 4.967977360738755, "grad_norm": 1.6687226295471191, "learning_rate": 4.672048092704438e-05, "loss": 0.6005, "num_input_tokens_seen": 19357008, "step": 33355 }, { "epoch": 4.968722073279714, "grad_norm": 0.8411224484443665, "learning_rate": 4.671887185944618e-05, "loss": 0.6815, "num_input_tokens_seen": 19359952, "step": 33360 }, { "epoch": 4.969466785820673, "grad_norm": 1.0152841806411743, "learning_rate": 4.671726242492972e-05, "loss": 0.5869, "num_input_tokens_seen": 19363088, "step": 33365 }, { "epoch": 4.970211498361633, "grad_norm": 1.4220126867294312, "learning_rate": 4.671565262352219e-05, "loss": 0.6371, "num_input_tokens_seen": 19365840, "step": 33370 }, { "epoch": 4.9709562109025915, "grad_norm": 0.9676434397697449, "learning_rate": 4.671404245525077e-05, "loss": 0.6039, "num_input_tokens_seen": 19368912, "step": 33375 }, { "epoch": 4.971700923443551, "grad_norm": 0.9774254560470581, "learning_rate": 4.671243192014267e-05, "loss": 0.7323, "num_input_tokens_seen": 19371792, "step": 33380 }, { "epoch": 4.97244563598451, "grad_norm": 0.9495405554771423, "learning_rate": 4.6710821018225104e-05, "loss": 0.778, "num_input_tokens_seen": 19374704, "step": 33385 }, { "epoch": 4.973190348525469, "grad_norm": 1.3559165000915527, "learning_rate": 4.670920974952529e-05, "loss": 0.5982, "num_input_tokens_seen": 19377456, "step": 33390 }, { "epoch": 4.973935061066428, "grad_norm": 0.9311854839324951, "learning_rate": 4.6707598114070436e-05, "loss": 0.5561, "num_input_tokens_seen": 19380496, "step": 33395 }, { "epoch": 4.974679773607388, "grad_norm": 1.2222323417663574, "learning_rate": 4.6705986111887765e-05, "loss": 0.7034, "num_input_tokens_seen": 19383504, "step": 33400 }, { "epoch": 4.975424486148347, "grad_norm": 2.212423086166382, "learning_rate": 4.6704373743004534e-05, "loss": 0.6284, "num_input_tokens_seen": 19386288, "step": 33405 }, { "epoch": 4.9761691986893055, "grad_norm": 1.0305765867233276, "learning_rate": 4.670276100744796e-05, "loss": 0.7658, "num_input_tokens_seen": 19388912, "step": 33410 }, { "epoch": 4.976913911230265, "grad_norm": 0.9926660060882568, "learning_rate": 4.67011479052453e-05, "loss": 0.689, "num_input_tokens_seen": 19392368, "step": 33415 }, { "epoch": 4.977658623771224, "grad_norm": 0.9314444661140442, "learning_rate": 4.66995344364238e-05, "loss": 0.68, "num_input_tokens_seen": 19395184, "step": 33420 }, { "epoch": 4.978403336312184, "grad_norm": 1.1502882242202759, "learning_rate": 4.6697920601010724e-05, "loss": 0.774, "num_input_tokens_seen": 19398032, "step": 33425 }, { "epoch": 4.979148048853142, "grad_norm": 0.7904874086380005, "learning_rate": 4.669630639903333e-05, "loss": 0.7415, "num_input_tokens_seen": 19400976, "step": 33430 }, { "epoch": 4.979892761394102, "grad_norm": 1.2564146518707275, "learning_rate": 4.669469183051889e-05, "loss": 0.6613, "num_input_tokens_seen": 19403760, "step": 33435 }, { "epoch": 4.980637473935061, "grad_norm": 1.7718594074249268, "learning_rate": 4.669307689549468e-05, "loss": 0.6871, "num_input_tokens_seen": 19406640, "step": 33440 }, { "epoch": 4.98138218647602, "grad_norm": 0.9011648297309875, "learning_rate": 4.6691461593987985e-05, "loss": 0.8169, "num_input_tokens_seen": 19410032, "step": 33445 }, { "epoch": 4.982126899016979, "grad_norm": 1.5302913188934326, "learning_rate": 4.668984592602609e-05, "loss": 0.764, "num_input_tokens_seen": 19412752, "step": 33450 }, { "epoch": 4.982871611557939, "grad_norm": 0.6004384160041809, "learning_rate": 4.66882298916363e-05, "loss": 0.6198, "num_input_tokens_seen": 19415728, "step": 33455 }, { "epoch": 4.9836163240988975, "grad_norm": 0.8308287858963013, "learning_rate": 4.66866134908459e-05, "loss": 0.6311, "num_input_tokens_seen": 19418992, "step": 33460 }, { "epoch": 4.984361036639857, "grad_norm": 1.1403661966323853, "learning_rate": 4.668499672368221e-05, "loss": 0.7601, "num_input_tokens_seen": 19422032, "step": 33465 }, { "epoch": 4.985105749180816, "grad_norm": 0.9159345626831055, "learning_rate": 4.668337959017254e-05, "loss": 0.5511, "num_input_tokens_seen": 19425328, "step": 33470 }, { "epoch": 4.985850461721776, "grad_norm": 1.5590705871582031, "learning_rate": 4.668176209034421e-05, "loss": 0.6189, "num_input_tokens_seen": 19428304, "step": 33475 }, { "epoch": 4.986595174262734, "grad_norm": 0.8087899684906006, "learning_rate": 4.668014422422455e-05, "loss": 0.5839, "num_input_tokens_seen": 19431312, "step": 33480 }, { "epoch": 4.987339886803694, "grad_norm": 1.382352352142334, "learning_rate": 4.6678525991840886e-05, "loss": 0.5961, "num_input_tokens_seen": 19433968, "step": 33485 }, { "epoch": 4.988084599344653, "grad_norm": 0.8864513039588928, "learning_rate": 4.667690739322055e-05, "loss": 0.5911, "num_input_tokens_seen": 19436528, "step": 33490 }, { "epoch": 4.988829311885612, "grad_norm": 0.9255064129829407, "learning_rate": 4.667528842839091e-05, "loss": 0.6395, "num_input_tokens_seen": 19439248, "step": 33495 }, { "epoch": 4.989574024426571, "grad_norm": 0.9824342131614685, "learning_rate": 4.6673669097379294e-05, "loss": 0.6592, "num_input_tokens_seen": 19442096, "step": 33500 }, { "epoch": 4.990318736967531, "grad_norm": 1.2570176124572754, "learning_rate": 4.6672049400213056e-05, "loss": 0.7635, "num_input_tokens_seen": 19445136, "step": 33505 }, { "epoch": 4.99106344950849, "grad_norm": 0.8688631057739258, "learning_rate": 4.6670429336919585e-05, "loss": 0.6048, "num_input_tokens_seen": 19448112, "step": 33510 }, { "epoch": 4.991808162049449, "grad_norm": 1.3152276277542114, "learning_rate": 4.666880890752623e-05, "loss": 0.8423, "num_input_tokens_seen": 19451120, "step": 33515 }, { "epoch": 4.992552874590408, "grad_norm": 0.9778397679328918, "learning_rate": 4.6667188112060365e-05, "loss": 0.7165, "num_input_tokens_seen": 19454000, "step": 33520 }, { "epoch": 4.993297587131368, "grad_norm": 1.738377571105957, "learning_rate": 4.666556695054939e-05, "loss": 0.7667, "num_input_tokens_seen": 19456848, "step": 33525 }, { "epoch": 4.994042299672326, "grad_norm": 1.3387444019317627, "learning_rate": 4.666394542302068e-05, "loss": 0.6837, "num_input_tokens_seen": 19459536, "step": 33530 }, { "epoch": 4.994787012213286, "grad_norm": 0.7495951056480408, "learning_rate": 4.6662323529501625e-05, "loss": 0.7093, "num_input_tokens_seen": 19462384, "step": 33535 }, { "epoch": 4.995531724754245, "grad_norm": 0.7972093224525452, "learning_rate": 4.666070127001963e-05, "loss": 0.5943, "num_input_tokens_seen": 19465296, "step": 33540 }, { "epoch": 4.996276437295204, "grad_norm": 0.8940253257751465, "learning_rate": 4.6659078644602103e-05, "loss": 0.6432, "num_input_tokens_seen": 19468336, "step": 33545 }, { "epoch": 4.997021149836163, "grad_norm": 2.27038836479187, "learning_rate": 4.665745565327646e-05, "loss": 0.7377, "num_input_tokens_seen": 19471216, "step": 33550 }, { "epoch": 4.997765862377122, "grad_norm": 0.5090866684913635, "learning_rate": 4.665583229607011e-05, "loss": 0.6954, "num_input_tokens_seen": 19474160, "step": 33555 }, { "epoch": 4.998510574918082, "grad_norm": 0.6274273991584778, "learning_rate": 4.6654208573010484e-05, "loss": 0.6039, "num_input_tokens_seen": 19476912, "step": 33560 }, { "epoch": 4.999255287459041, "grad_norm": 0.6556003093719482, "learning_rate": 4.665258448412502e-05, "loss": 0.6735, "num_input_tokens_seen": 19479824, "step": 33565 }, { "epoch": 5.0, "grad_norm": 1.8613089323043823, "learning_rate": 4.665096002944114e-05, "loss": 0.6469, "num_input_tokens_seen": 19482128, "step": 33570 }, { "epoch": 5.0, "eval_loss": 0.6594095826148987, "eval_runtime": 47.0207, "eval_samples_per_second": 63.461, "eval_steps_per_second": 15.865, "num_input_tokens_seen": 19482128, "step": 33570 }, { "epoch": 5.000744712540959, "grad_norm": 1.941075325012207, "learning_rate": 4.6649335208986294e-05, "loss": 0.6722, "num_input_tokens_seen": 19484976, "step": 33575 }, { "epoch": 5.001489425081918, "grad_norm": 0.6601496934890747, "learning_rate": 4.6647710022787935e-05, "loss": 0.548, "num_input_tokens_seen": 19487792, "step": 33580 }, { "epoch": 5.002234137622877, "grad_norm": 0.7774683237075806, "learning_rate": 4.664608447087352e-05, "loss": 0.4539, "num_input_tokens_seen": 19490448, "step": 33585 }, { "epoch": 5.002978850163837, "grad_norm": 2.5846235752105713, "learning_rate": 4.664445855327051e-05, "loss": 0.758, "num_input_tokens_seen": 19493136, "step": 33590 }, { "epoch": 5.003723562704796, "grad_norm": 1.030240535736084, "learning_rate": 4.664283227000636e-05, "loss": 0.5665, "num_input_tokens_seen": 19496144, "step": 33595 }, { "epoch": 5.004468275245755, "grad_norm": 0.7481265664100647, "learning_rate": 4.664120562110857e-05, "loss": 0.586, "num_input_tokens_seen": 19499056, "step": 33600 }, { "epoch": 5.005212987786714, "grad_norm": 0.5842487812042236, "learning_rate": 4.6639578606604596e-05, "loss": 0.72, "num_input_tokens_seen": 19502064, "step": 33605 }, { "epoch": 5.005957700327674, "grad_norm": 0.7416608333587646, "learning_rate": 4.6637951226521935e-05, "loss": 0.5749, "num_input_tokens_seen": 19504752, "step": 33610 }, { "epoch": 5.006702412868632, "grad_norm": 1.393593668937683, "learning_rate": 4.663632348088809e-05, "loss": 0.6647, "num_input_tokens_seen": 19507696, "step": 33615 }, { "epoch": 5.007447125409592, "grad_norm": 1.2960033416748047, "learning_rate": 4.663469536973054e-05, "loss": 0.7477, "num_input_tokens_seen": 19510672, "step": 33620 }, { "epoch": 5.008191837950551, "grad_norm": 0.7178297638893127, "learning_rate": 4.6633066893076804e-05, "loss": 0.6541, "num_input_tokens_seen": 19513424, "step": 33625 }, { "epoch": 5.00893655049151, "grad_norm": 1.3473622798919678, "learning_rate": 4.663143805095439e-05, "loss": 0.6201, "num_input_tokens_seen": 19516240, "step": 33630 }, { "epoch": 5.009681263032469, "grad_norm": 0.5776090025901794, "learning_rate": 4.662980884339081e-05, "loss": 0.7138, "num_input_tokens_seen": 19518896, "step": 33635 }, { "epoch": 5.010425975573429, "grad_norm": 1.835959553718567, "learning_rate": 4.66281792704136e-05, "loss": 0.7907, "num_input_tokens_seen": 19521584, "step": 33640 }, { "epoch": 5.011170688114388, "grad_norm": 1.3279361724853516, "learning_rate": 4.6626549332050284e-05, "loss": 0.565, "num_input_tokens_seen": 19524880, "step": 33645 }, { "epoch": 5.011915400655347, "grad_norm": 0.8227694630622864, "learning_rate": 4.6624919028328394e-05, "loss": 0.5935, "num_input_tokens_seen": 19527536, "step": 33650 }, { "epoch": 5.012660113196306, "grad_norm": 0.9704237580299377, "learning_rate": 4.6623288359275474e-05, "loss": 0.6539, "num_input_tokens_seen": 19530128, "step": 33655 }, { "epoch": 5.013404825737266, "grad_norm": 1.2900360822677612, "learning_rate": 4.662165732491907e-05, "loss": 0.5852, "num_input_tokens_seen": 19533328, "step": 33660 }, { "epoch": 5.014149538278224, "grad_norm": 1.486230492591858, "learning_rate": 4.662002592528675e-05, "loss": 0.7072, "num_input_tokens_seen": 19536272, "step": 33665 }, { "epoch": 5.014894250819184, "grad_norm": 1.0929737091064453, "learning_rate": 4.661839416040606e-05, "loss": 0.7567, "num_input_tokens_seen": 19539184, "step": 33670 }, { "epoch": 5.015638963360143, "grad_norm": 1.3106287717819214, "learning_rate": 4.6616762030304576e-05, "loss": 0.5023, "num_input_tokens_seen": 19541904, "step": 33675 }, { "epoch": 5.0163836759011025, "grad_norm": 1.8652191162109375, "learning_rate": 4.661512953500987e-05, "loss": 0.8172, "num_input_tokens_seen": 19545008, "step": 33680 }, { "epoch": 5.017128388442061, "grad_norm": 0.9471785426139832, "learning_rate": 4.661349667454951e-05, "loss": 0.6903, "num_input_tokens_seen": 19547600, "step": 33685 }, { "epoch": 5.017873100983021, "grad_norm": 1.316052794456482, "learning_rate": 4.6611863448951096e-05, "loss": 0.7509, "num_input_tokens_seen": 19550640, "step": 33690 }, { "epoch": 5.01861781352398, "grad_norm": 1.5785446166992188, "learning_rate": 4.661022985824222e-05, "loss": 0.7008, "num_input_tokens_seen": 19553744, "step": 33695 }, { "epoch": 5.019362526064939, "grad_norm": 2.3438680171966553, "learning_rate": 4.660859590245046e-05, "loss": 0.6661, "num_input_tokens_seen": 19556784, "step": 33700 }, { "epoch": 5.020107238605898, "grad_norm": 1.5353522300720215, "learning_rate": 4.6606961581603446e-05, "loss": 0.5837, "num_input_tokens_seen": 19559792, "step": 33705 }, { "epoch": 5.020851951146858, "grad_norm": 0.8101649880409241, "learning_rate": 4.6605326895728773e-05, "loss": 0.5968, "num_input_tokens_seen": 19562896, "step": 33710 }, { "epoch": 5.021596663687816, "grad_norm": 1.7542176246643066, "learning_rate": 4.6603691844854065e-05, "loss": 0.6825, "num_input_tokens_seen": 19565488, "step": 33715 }, { "epoch": 5.022341376228776, "grad_norm": 1.692948818206787, "learning_rate": 4.660205642900693e-05, "loss": 0.6301, "num_input_tokens_seen": 19568176, "step": 33720 }, { "epoch": 5.023086088769735, "grad_norm": 0.8381657004356384, "learning_rate": 4.660042064821501e-05, "loss": 0.4803, "num_input_tokens_seen": 19570960, "step": 33725 }, { "epoch": 5.0238308013106945, "grad_norm": 0.7323443293571472, "learning_rate": 4.659878450250595e-05, "loss": 0.4877, "num_input_tokens_seen": 19574320, "step": 33730 }, { "epoch": 5.024575513851653, "grad_norm": 0.8880890607833862, "learning_rate": 4.6597147991907365e-05, "loss": 0.7693, "num_input_tokens_seen": 19577040, "step": 33735 }, { "epoch": 5.025320226392613, "grad_norm": 1.1304582357406616, "learning_rate": 4.659551111644692e-05, "loss": 0.6684, "num_input_tokens_seen": 19580080, "step": 33740 }, { "epoch": 5.026064938933572, "grad_norm": 1.1534600257873535, "learning_rate": 4.659387387615226e-05, "loss": 0.6669, "num_input_tokens_seen": 19582960, "step": 33745 }, { "epoch": 5.02680965147453, "grad_norm": 0.706057608127594, "learning_rate": 4.659223627105105e-05, "loss": 0.4797, "num_input_tokens_seen": 19585552, "step": 33750 }, { "epoch": 5.02755436401549, "grad_norm": 1.292069435119629, "learning_rate": 4.659059830117095e-05, "loss": 0.6526, "num_input_tokens_seen": 19588560, "step": 33755 }, { "epoch": 5.028299076556449, "grad_norm": 1.0982379913330078, "learning_rate": 4.658895996653964e-05, "loss": 0.7403, "num_input_tokens_seen": 19591248, "step": 33760 }, { "epoch": 5.0290437890974085, "grad_norm": 2.5398244857788086, "learning_rate": 4.658732126718479e-05, "loss": 0.656, "num_input_tokens_seen": 19594160, "step": 33765 }, { "epoch": 5.029788501638367, "grad_norm": 0.7105979919433594, "learning_rate": 4.6585682203134094e-05, "loss": 0.5651, "num_input_tokens_seen": 19596912, "step": 33770 }, { "epoch": 5.030533214179327, "grad_norm": 1.1245728731155396, "learning_rate": 4.658404277441523e-05, "loss": 0.4905, "num_input_tokens_seen": 19599664, "step": 33775 }, { "epoch": 5.031277926720286, "grad_norm": 0.9220134019851685, "learning_rate": 4.65824029810559e-05, "loss": 0.6317, "num_input_tokens_seen": 19602416, "step": 33780 }, { "epoch": 5.032022639261245, "grad_norm": 0.7668527960777283, "learning_rate": 4.658076282308381e-05, "loss": 0.6585, "num_input_tokens_seen": 19605360, "step": 33785 }, { "epoch": 5.032767351802204, "grad_norm": 0.7701371908187866, "learning_rate": 4.657912230052667e-05, "loss": 0.428, "num_input_tokens_seen": 19608304, "step": 33790 }, { "epoch": 5.033512064343164, "grad_norm": 0.6866862773895264, "learning_rate": 4.657748141341218e-05, "loss": 0.5859, "num_input_tokens_seen": 19611184, "step": 33795 }, { "epoch": 5.034256776884122, "grad_norm": 1.6310100555419922, "learning_rate": 4.657584016176808e-05, "loss": 0.8068, "num_input_tokens_seen": 19614224, "step": 33800 }, { "epoch": 5.035001489425082, "grad_norm": 0.732025146484375, "learning_rate": 4.657419854562208e-05, "loss": 0.6083, "num_input_tokens_seen": 19617424, "step": 33805 }, { "epoch": 5.035746201966041, "grad_norm": 0.6523655652999878, "learning_rate": 4.657255656500193e-05, "loss": 0.5275, "num_input_tokens_seen": 19620016, "step": 33810 }, { "epoch": 5.0364909145070005, "grad_norm": 2.9453647136688232, "learning_rate": 4.657091421993536e-05, "loss": 0.836, "num_input_tokens_seen": 19622992, "step": 33815 }, { "epoch": 5.037235627047959, "grad_norm": 1.1374950408935547, "learning_rate": 4.656927151045012e-05, "loss": 0.6215, "num_input_tokens_seen": 19625872, "step": 33820 }, { "epoch": 5.037980339588919, "grad_norm": 1.0662978887557983, "learning_rate": 4.656762843657396e-05, "loss": 0.8419, "num_input_tokens_seen": 19628784, "step": 33825 }, { "epoch": 5.038725052129878, "grad_norm": 0.7186338901519775, "learning_rate": 4.656598499833463e-05, "loss": 0.6562, "num_input_tokens_seen": 19631600, "step": 33830 }, { "epoch": 5.039469764670837, "grad_norm": 1.4439146518707275, "learning_rate": 4.6564341195759915e-05, "loss": 0.6891, "num_input_tokens_seen": 19634576, "step": 33835 }, { "epoch": 5.040214477211796, "grad_norm": 1.0414410829544067, "learning_rate": 4.656269702887757e-05, "loss": 0.5838, "num_input_tokens_seen": 19637648, "step": 33840 }, { "epoch": 5.040959189752756, "grad_norm": 0.7830044031143188, "learning_rate": 4.656105249771536e-05, "loss": 0.6158, "num_input_tokens_seen": 19640464, "step": 33845 }, { "epoch": 5.0417039022937145, "grad_norm": 1.5551137924194336, "learning_rate": 4.65594076023011e-05, "loss": 0.7126, "num_input_tokens_seen": 19643056, "step": 33850 }, { "epoch": 5.042448614834674, "grad_norm": 2.4276485443115234, "learning_rate": 4.655776234266255e-05, "loss": 0.7178, "num_input_tokens_seen": 19645936, "step": 33855 }, { "epoch": 5.043193327375633, "grad_norm": 1.7408331632614136, "learning_rate": 4.655611671882752e-05, "loss": 0.7308, "num_input_tokens_seen": 19648976, "step": 33860 }, { "epoch": 5.0439380399165925, "grad_norm": 2.2137794494628906, "learning_rate": 4.655447073082381e-05, "loss": 0.6824, "num_input_tokens_seen": 19651984, "step": 33865 }, { "epoch": 5.044682752457551, "grad_norm": 2.4517855644226074, "learning_rate": 4.6552824378679216e-05, "loss": 0.8252, "num_input_tokens_seen": 19654864, "step": 33870 }, { "epoch": 5.045427464998511, "grad_norm": 1.4993332624435425, "learning_rate": 4.655117766242156e-05, "loss": 0.5518, "num_input_tokens_seen": 19657744, "step": 33875 }, { "epoch": 5.04617217753947, "grad_norm": 1.042261004447937, "learning_rate": 4.654953058207866e-05, "loss": 0.6429, "num_input_tokens_seen": 19660560, "step": 33880 }, { "epoch": 5.046916890080429, "grad_norm": 1.1869417428970337, "learning_rate": 4.654788313767835e-05, "loss": 0.6178, "num_input_tokens_seen": 19663152, "step": 33885 }, { "epoch": 5.047661602621388, "grad_norm": 1.4068942070007324, "learning_rate": 4.654623532924845e-05, "loss": 0.7309, "num_input_tokens_seen": 19665904, "step": 33890 }, { "epoch": 5.048406315162348, "grad_norm": 1.0720030069351196, "learning_rate": 4.6544587156816806e-05, "loss": 0.6212, "num_input_tokens_seen": 19668624, "step": 33895 }, { "epoch": 5.0491510277033065, "grad_norm": 2.477029323577881, "learning_rate": 4.6542938620411256e-05, "loss": 0.762, "num_input_tokens_seen": 19671504, "step": 33900 }, { "epoch": 5.049895740244266, "grad_norm": 1.4315077066421509, "learning_rate": 4.654128972005966e-05, "loss": 0.5914, "num_input_tokens_seen": 19674416, "step": 33905 }, { "epoch": 5.050640452785225, "grad_norm": 1.06773841381073, "learning_rate": 4.653964045578986e-05, "loss": 0.6698, "num_input_tokens_seen": 19677328, "step": 33910 }, { "epoch": 5.0513851653261845, "grad_norm": 0.8951758146286011, "learning_rate": 4.6537990827629726e-05, "loss": 0.5471, "num_input_tokens_seen": 19680240, "step": 33915 }, { "epoch": 5.052129877867143, "grad_norm": 1.261080265045166, "learning_rate": 4.653634083560713e-05, "loss": 0.7673, "num_input_tokens_seen": 19683344, "step": 33920 }, { "epoch": 5.052874590408102, "grad_norm": 1.0016525983810425, "learning_rate": 4.653469047974994e-05, "loss": 0.5596, "num_input_tokens_seen": 19686576, "step": 33925 }, { "epoch": 5.053619302949062, "grad_norm": 1.441031575202942, "learning_rate": 4.653303976008604e-05, "loss": 0.7458, "num_input_tokens_seen": 19689616, "step": 33930 }, { "epoch": 5.0543640154900205, "grad_norm": 2.2905869483947754, "learning_rate": 4.6531388676643325e-05, "loss": 0.7202, "num_input_tokens_seen": 19692432, "step": 33935 }, { "epoch": 5.05510872803098, "grad_norm": 0.8239604234695435, "learning_rate": 4.6529737229449676e-05, "loss": 0.643, "num_input_tokens_seen": 19695408, "step": 33940 }, { "epoch": 5.055853440571939, "grad_norm": 1.2592039108276367, "learning_rate": 4.6528085418533004e-05, "loss": 0.5232, "num_input_tokens_seen": 19698352, "step": 33945 }, { "epoch": 5.0565981531128985, "grad_norm": 0.8198142051696777, "learning_rate": 4.652643324392121e-05, "loss": 0.6847, "num_input_tokens_seen": 19701712, "step": 33950 }, { "epoch": 5.057342865653857, "grad_norm": 0.8527895212173462, "learning_rate": 4.65247807056422e-05, "loss": 0.6343, "num_input_tokens_seen": 19704592, "step": 33955 }, { "epoch": 5.058087578194817, "grad_norm": 1.175125241279602, "learning_rate": 4.65231278037239e-05, "loss": 0.7222, "num_input_tokens_seen": 19707664, "step": 33960 }, { "epoch": 5.058832290735776, "grad_norm": 0.6884059309959412, "learning_rate": 4.652147453819423e-05, "loss": 0.5539, "num_input_tokens_seen": 19710704, "step": 33965 }, { "epoch": 5.059577003276735, "grad_norm": 1.541831135749817, "learning_rate": 4.651982090908112e-05, "loss": 0.6053, "num_input_tokens_seen": 19713520, "step": 33970 }, { "epoch": 5.060321715817694, "grad_norm": 0.8938604593276978, "learning_rate": 4.6518166916412506e-05, "loss": 0.7686, "num_input_tokens_seen": 19716560, "step": 33975 }, { "epoch": 5.061066428358654, "grad_norm": 1.3614188432693481, "learning_rate": 4.651651256021634e-05, "loss": 0.6969, "num_input_tokens_seen": 19719504, "step": 33980 }, { "epoch": 5.0618111408996125, "grad_norm": 1.3846538066864014, "learning_rate": 4.651485784052055e-05, "loss": 0.6779, "num_input_tokens_seen": 19722352, "step": 33985 }, { "epoch": 5.062555853440572, "grad_norm": 0.9527895450592041, "learning_rate": 4.6513202757353116e-05, "loss": 0.6562, "num_input_tokens_seen": 19725104, "step": 33990 }, { "epoch": 5.063300565981531, "grad_norm": 0.9141375422477722, "learning_rate": 4.6511547310741984e-05, "loss": 0.5943, "num_input_tokens_seen": 19727952, "step": 33995 }, { "epoch": 5.0640452785224905, "grad_norm": 1.1860939264297485, "learning_rate": 4.650989150071512e-05, "loss": 0.5213, "num_input_tokens_seen": 19730992, "step": 34000 }, { "epoch": 5.064789991063449, "grad_norm": 0.9027901887893677, "learning_rate": 4.6508235327300496e-05, "loss": 0.729, "num_input_tokens_seen": 19734288, "step": 34005 }, { "epoch": 5.065534703604409, "grad_norm": 1.4266221523284912, "learning_rate": 4.65065787905261e-05, "loss": 0.4951, "num_input_tokens_seen": 19737264, "step": 34010 }, { "epoch": 5.066279416145368, "grad_norm": 1.2577887773513794, "learning_rate": 4.650492189041992e-05, "loss": 0.7068, "num_input_tokens_seen": 19740048, "step": 34015 }, { "epoch": 5.067024128686327, "grad_norm": 0.890890896320343, "learning_rate": 4.650326462700993e-05, "loss": 0.7059, "num_input_tokens_seen": 19742736, "step": 34020 }, { "epoch": 5.067768841227286, "grad_norm": 1.5068318843841553, "learning_rate": 4.650160700032416e-05, "loss": 0.7786, "num_input_tokens_seen": 19745424, "step": 34025 }, { "epoch": 5.068513553768246, "grad_norm": 1.5193674564361572, "learning_rate": 4.649994901039057e-05, "loss": 0.6806, "num_input_tokens_seen": 19748176, "step": 34030 }, { "epoch": 5.0692582663092045, "grad_norm": 2.0444164276123047, "learning_rate": 4.6498290657237205e-05, "loss": 0.6551, "num_input_tokens_seen": 19750928, "step": 34035 }, { "epoch": 5.070002978850164, "grad_norm": 1.0239676237106323, "learning_rate": 4.649663194089207e-05, "loss": 0.703, "num_input_tokens_seen": 19754000, "step": 34040 }, { "epoch": 5.070747691391123, "grad_norm": 0.9055261611938477, "learning_rate": 4.649497286138318e-05, "loss": 0.6183, "num_input_tokens_seen": 19756816, "step": 34045 }, { "epoch": 5.071492403932083, "grad_norm": 0.668676495552063, "learning_rate": 4.6493313418738564e-05, "loss": 0.7495, "num_input_tokens_seen": 19759888, "step": 34050 }, { "epoch": 5.072237116473041, "grad_norm": 1.3301560878753662, "learning_rate": 4.649165361298628e-05, "loss": 0.6395, "num_input_tokens_seen": 19762608, "step": 34055 }, { "epoch": 5.072981829014001, "grad_norm": 1.586277723312378, "learning_rate": 4.6489993444154334e-05, "loss": 0.7087, "num_input_tokens_seen": 19765392, "step": 34060 }, { "epoch": 5.07372654155496, "grad_norm": 0.883097767829895, "learning_rate": 4.64883329122708e-05, "loss": 0.8083, "num_input_tokens_seen": 19768336, "step": 34065 }, { "epoch": 5.074471254095919, "grad_norm": 0.7009603977203369, "learning_rate": 4.648667201736372e-05, "loss": 0.6227, "num_input_tokens_seen": 19771152, "step": 34070 }, { "epoch": 5.075215966636878, "grad_norm": 0.8556592464447021, "learning_rate": 4.648501075946116e-05, "loss": 0.6276, "num_input_tokens_seen": 19773712, "step": 34075 }, { "epoch": 5.075960679177838, "grad_norm": 0.8813208341598511, "learning_rate": 4.648334913859117e-05, "loss": 0.4875, "num_input_tokens_seen": 19776784, "step": 34080 }, { "epoch": 5.0767053917187965, "grad_norm": 1.508885145187378, "learning_rate": 4.648168715478183e-05, "loss": 0.6045, "num_input_tokens_seen": 19779568, "step": 34085 }, { "epoch": 5.077450104259755, "grad_norm": 1.7017215490341187, "learning_rate": 4.648002480806123e-05, "loss": 0.7057, "num_input_tokens_seen": 19782256, "step": 34090 }, { "epoch": 5.078194816800715, "grad_norm": 0.7363334894180298, "learning_rate": 4.647836209845744e-05, "loss": 0.5965, "num_input_tokens_seen": 19784976, "step": 34095 }, { "epoch": 5.078939529341674, "grad_norm": 1.5354281663894653, "learning_rate": 4.647669902599854e-05, "loss": 0.7889, "num_input_tokens_seen": 19787824, "step": 34100 }, { "epoch": 5.079684241882633, "grad_norm": 1.7621020078659058, "learning_rate": 4.6475035590712646e-05, "loss": 0.7098, "num_input_tokens_seen": 19790640, "step": 34105 }, { "epoch": 5.080428954423592, "grad_norm": 1.0551501512527466, "learning_rate": 4.6473371792627854e-05, "loss": 0.6165, "num_input_tokens_seen": 19793552, "step": 34110 }, { "epoch": 5.081173666964552, "grad_norm": 1.18185293674469, "learning_rate": 4.6471707631772267e-05, "loss": 0.7819, "num_input_tokens_seen": 19796528, "step": 34115 }, { "epoch": 5.0819183795055105, "grad_norm": 2.2290022373199463, "learning_rate": 4.6470043108174e-05, "loss": 0.756, "num_input_tokens_seen": 19799216, "step": 34120 }, { "epoch": 5.08266309204647, "grad_norm": 0.92330002784729, "learning_rate": 4.6468378221861175e-05, "loss": 0.6312, "num_input_tokens_seen": 19801904, "step": 34125 }, { "epoch": 5.083407804587429, "grad_norm": 1.272557258605957, "learning_rate": 4.646671297286193e-05, "loss": 0.6277, "num_input_tokens_seen": 19804784, "step": 34130 }, { "epoch": 5.084152517128389, "grad_norm": 1.7536181211471558, "learning_rate": 4.646504736120438e-05, "loss": 0.7869, "num_input_tokens_seen": 19807728, "step": 34135 }, { "epoch": 5.084897229669347, "grad_norm": 1.6705471277236938, "learning_rate": 4.646338138691667e-05, "loss": 0.6333, "num_input_tokens_seen": 19810384, "step": 34140 }, { "epoch": 5.085641942210307, "grad_norm": 0.737788736820221, "learning_rate": 4.646171505002694e-05, "loss": 0.6753, "num_input_tokens_seen": 19813232, "step": 34145 }, { "epoch": 5.086386654751266, "grad_norm": 1.5242443084716797, "learning_rate": 4.646004835056336e-05, "loss": 0.7485, "num_input_tokens_seen": 19815984, "step": 34150 }, { "epoch": 5.087131367292225, "grad_norm": 0.8875161409378052, "learning_rate": 4.645838128855406e-05, "loss": 0.6178, "num_input_tokens_seen": 19818992, "step": 34155 }, { "epoch": 5.087876079833184, "grad_norm": 0.7970091700553894, "learning_rate": 4.6456713864027234e-05, "loss": 0.5545, "num_input_tokens_seen": 19821968, "step": 34160 }, { "epoch": 5.088620792374144, "grad_norm": 0.9411143660545349, "learning_rate": 4.645504607701102e-05, "loss": 0.6492, "num_input_tokens_seen": 19825040, "step": 34165 }, { "epoch": 5.0893655049151025, "grad_norm": 1.2589191198349, "learning_rate": 4.645337792753362e-05, "loss": 0.8196, "num_input_tokens_seen": 19828080, "step": 34170 }, { "epoch": 5.090110217456062, "grad_norm": 0.6167510151863098, "learning_rate": 4.64517094156232e-05, "loss": 0.6562, "num_input_tokens_seen": 19830896, "step": 34175 }, { "epoch": 5.090854929997021, "grad_norm": 1.3863145112991333, "learning_rate": 4.645004054130795e-05, "loss": 0.6017, "num_input_tokens_seen": 19833712, "step": 34180 }, { "epoch": 5.091599642537981, "grad_norm": 0.9454824924468994, "learning_rate": 4.644837130461607e-05, "loss": 0.6473, "num_input_tokens_seen": 19836592, "step": 34185 }, { "epoch": 5.092344355078939, "grad_norm": 0.8572995066642761, "learning_rate": 4.644670170557575e-05, "loss": 0.5965, "num_input_tokens_seen": 19839856, "step": 34190 }, { "epoch": 5.093089067619899, "grad_norm": 0.8363500237464905, "learning_rate": 4.644503174421521e-05, "loss": 0.6802, "num_input_tokens_seen": 19842832, "step": 34195 }, { "epoch": 5.093833780160858, "grad_norm": 1.186739206314087, "learning_rate": 4.644336142056265e-05, "loss": 0.6608, "num_input_tokens_seen": 19845616, "step": 34200 }, { "epoch": 5.094578492701817, "grad_norm": 0.895918071269989, "learning_rate": 4.644169073464629e-05, "loss": 0.5675, "num_input_tokens_seen": 19848464, "step": 34205 }, { "epoch": 5.095323205242776, "grad_norm": 2.2704222202301025, "learning_rate": 4.644001968649436e-05, "loss": 0.7125, "num_input_tokens_seen": 19851344, "step": 34210 }, { "epoch": 5.096067917783736, "grad_norm": 0.5545040965080261, "learning_rate": 4.643834827613508e-05, "loss": 0.7044, "num_input_tokens_seen": 19854512, "step": 34215 }, { "epoch": 5.096812630324695, "grad_norm": 1.0577874183654785, "learning_rate": 4.643667650359671e-05, "loss": 0.8518, "num_input_tokens_seen": 19857424, "step": 34220 }, { "epoch": 5.097557342865654, "grad_norm": 1.0516126155853271, "learning_rate": 4.643500436890746e-05, "loss": 0.5763, "num_input_tokens_seen": 19860144, "step": 34225 }, { "epoch": 5.098302055406613, "grad_norm": 0.9753502607345581, "learning_rate": 4.6433331872095615e-05, "loss": 0.6301, "num_input_tokens_seen": 19863312, "step": 34230 }, { "epoch": 5.099046767947573, "grad_norm": 0.8760557174682617, "learning_rate": 4.643165901318941e-05, "loss": 0.5461, "num_input_tokens_seen": 19866352, "step": 34235 }, { "epoch": 5.099791480488531, "grad_norm": 1.131568431854248, "learning_rate": 4.6429985792217095e-05, "loss": 0.5812, "num_input_tokens_seen": 19869168, "step": 34240 }, { "epoch": 5.100536193029491, "grad_norm": 0.7589659690856934, "learning_rate": 4.642831220920696e-05, "loss": 0.7111, "num_input_tokens_seen": 19872528, "step": 34245 }, { "epoch": 5.10128090557045, "grad_norm": 1.0734976530075073, "learning_rate": 4.642663826418726e-05, "loss": 0.7249, "num_input_tokens_seen": 19875248, "step": 34250 }, { "epoch": 5.102025618111409, "grad_norm": 0.749255895614624, "learning_rate": 4.64249639571863e-05, "loss": 0.6719, "num_input_tokens_seen": 19877936, "step": 34255 }, { "epoch": 5.102770330652368, "grad_norm": 1.0709545612335205, "learning_rate": 4.642328928823234e-05, "loss": 0.5774, "num_input_tokens_seen": 19880912, "step": 34260 }, { "epoch": 5.103515043193327, "grad_norm": 0.7354466319084167, "learning_rate": 4.6421614257353676e-05, "loss": 0.5487, "num_input_tokens_seen": 19883856, "step": 34265 }, { "epoch": 5.104259755734287, "grad_norm": 1.064365267753601, "learning_rate": 4.6419938864578615e-05, "loss": 0.6441, "num_input_tokens_seen": 19886960, "step": 34270 }, { "epoch": 5.105004468275245, "grad_norm": 0.494718462228775, "learning_rate": 4.641826310993546e-05, "loss": 0.7067, "num_input_tokens_seen": 19889840, "step": 34275 }, { "epoch": 5.105749180816205, "grad_norm": 1.5754015445709229, "learning_rate": 4.641658699345251e-05, "loss": 0.7225, "num_input_tokens_seen": 19892688, "step": 34280 }, { "epoch": 5.106493893357164, "grad_norm": 1.3642545938491821, "learning_rate": 4.64149105151581e-05, "loss": 0.8302, "num_input_tokens_seen": 19895856, "step": 34285 }, { "epoch": 5.107238605898123, "grad_norm": 1.3805214166641235, "learning_rate": 4.641323367508054e-05, "loss": 0.648, "num_input_tokens_seen": 19898576, "step": 34290 }, { "epoch": 5.107983318439082, "grad_norm": 2.42535662651062, "learning_rate": 4.641155647324816e-05, "loss": 0.7929, "num_input_tokens_seen": 19901328, "step": 34295 }, { "epoch": 5.108728030980042, "grad_norm": 0.8975737690925598, "learning_rate": 4.6409878909689286e-05, "loss": 0.7535, "num_input_tokens_seen": 19903984, "step": 34300 }, { "epoch": 5.109472743521001, "grad_norm": 1.52736496925354, "learning_rate": 4.6408200984432276e-05, "loss": 0.5159, "num_input_tokens_seen": 19906864, "step": 34305 }, { "epoch": 5.11021745606196, "grad_norm": 0.8692038059234619, "learning_rate": 4.640652269750547e-05, "loss": 0.5377, "num_input_tokens_seen": 19909680, "step": 34310 }, { "epoch": 5.110962168602919, "grad_norm": 0.9060810804367065, "learning_rate": 4.640484404893722e-05, "loss": 0.4693, "num_input_tokens_seen": 19912464, "step": 34315 }, { "epoch": 5.111706881143879, "grad_norm": 1.363762378692627, "learning_rate": 4.640316503875588e-05, "loss": 0.5831, "num_input_tokens_seen": 19915280, "step": 34320 }, { "epoch": 5.112451593684837, "grad_norm": 0.8272868394851685, "learning_rate": 4.640148566698982e-05, "loss": 0.6008, "num_input_tokens_seen": 19917904, "step": 34325 }, { "epoch": 5.113196306225797, "grad_norm": 1.121535062789917, "learning_rate": 4.639980593366742e-05, "loss": 0.5419, "num_input_tokens_seen": 19920560, "step": 34330 }, { "epoch": 5.113941018766756, "grad_norm": 1.0381544828414917, "learning_rate": 4.639812583881704e-05, "loss": 0.5553, "num_input_tokens_seen": 19923248, "step": 34335 }, { "epoch": 5.114685731307715, "grad_norm": 1.9794520139694214, "learning_rate": 4.6396445382467067e-05, "loss": 0.7449, "num_input_tokens_seen": 19926064, "step": 34340 }, { "epoch": 5.115430443848674, "grad_norm": 1.2721556425094604, "learning_rate": 4.639476456464591e-05, "loss": 0.6925, "num_input_tokens_seen": 19929040, "step": 34345 }, { "epoch": 5.116175156389634, "grad_norm": 1.1485544443130493, "learning_rate": 4.639308338538194e-05, "loss": 0.5734, "num_input_tokens_seen": 19931888, "step": 34350 }, { "epoch": 5.116919868930593, "grad_norm": 0.9522345662117004, "learning_rate": 4.639140184470357e-05, "loss": 0.7678, "num_input_tokens_seen": 19934736, "step": 34355 }, { "epoch": 5.117664581471552, "grad_norm": 0.8363208174705505, "learning_rate": 4.638971994263921e-05, "loss": 0.7223, "num_input_tokens_seen": 19937392, "step": 34360 }, { "epoch": 5.118409294012511, "grad_norm": 0.7432029247283936, "learning_rate": 4.6388037679217274e-05, "loss": 0.7522, "num_input_tokens_seen": 19940112, "step": 34365 }, { "epoch": 5.119154006553471, "grad_norm": 1.322191834449768, "learning_rate": 4.638635505446617e-05, "loss": 0.6939, "num_input_tokens_seen": 19942992, "step": 34370 }, { "epoch": 5.119898719094429, "grad_norm": 0.731890082359314, "learning_rate": 4.638467206841434e-05, "loss": 0.5406, "num_input_tokens_seen": 19946256, "step": 34375 }, { "epoch": 5.120643431635389, "grad_norm": 0.6276252865791321, "learning_rate": 4.6382988721090214e-05, "loss": 0.5939, "num_input_tokens_seen": 19949232, "step": 34380 }, { "epoch": 5.121388144176348, "grad_norm": 0.9356642365455627, "learning_rate": 4.638130501252221e-05, "loss": 0.7619, "num_input_tokens_seen": 19952208, "step": 34385 }, { "epoch": 5.1221328567173074, "grad_norm": 1.4230875968933105, "learning_rate": 4.6379620942738814e-05, "loss": 0.7903, "num_input_tokens_seen": 19954960, "step": 34390 }, { "epoch": 5.122877569258266, "grad_norm": 1.549049735069275, "learning_rate": 4.637793651176843e-05, "loss": 0.5502, "num_input_tokens_seen": 19957904, "step": 34395 }, { "epoch": 5.123622281799226, "grad_norm": 1.4146665334701538, "learning_rate": 4.637625171963954e-05, "loss": 0.746, "num_input_tokens_seen": 19960720, "step": 34400 }, { "epoch": 5.124366994340185, "grad_norm": 1.119391679763794, "learning_rate": 4.63745665663806e-05, "loss": 0.5609, "num_input_tokens_seen": 19963312, "step": 34405 }, { "epoch": 5.125111706881144, "grad_norm": 1.5632075071334839, "learning_rate": 4.63728810520201e-05, "loss": 0.7163, "num_input_tokens_seen": 19966192, "step": 34410 }, { "epoch": 5.125856419422103, "grad_norm": 2.040402889251709, "learning_rate": 4.637119517658648e-05, "loss": 0.6941, "num_input_tokens_seen": 19969296, "step": 34415 }, { "epoch": 5.126601131963063, "grad_norm": 0.6821606159210205, "learning_rate": 4.636950894010825e-05, "loss": 0.4417, "num_input_tokens_seen": 19972208, "step": 34420 }, { "epoch": 5.127345844504021, "grad_norm": 0.6528443098068237, "learning_rate": 4.636782234261388e-05, "loss": 0.6498, "num_input_tokens_seen": 19975088, "step": 34425 }, { "epoch": 5.128090557044981, "grad_norm": 1.0237473249435425, "learning_rate": 4.6366135384131866e-05, "loss": 0.4559, "num_input_tokens_seen": 19977840, "step": 34430 }, { "epoch": 5.12883526958594, "grad_norm": 2.319746255874634, "learning_rate": 4.6364448064690716e-05, "loss": 0.6059, "num_input_tokens_seen": 19980496, "step": 34435 }, { "epoch": 5.129579982126899, "grad_norm": 1.2905478477478027, "learning_rate": 4.636276038431892e-05, "loss": 0.654, "num_input_tokens_seen": 19983696, "step": 34440 }, { "epoch": 5.130324694667858, "grad_norm": 1.02695894241333, "learning_rate": 4.636107234304501e-05, "loss": 0.4802, "num_input_tokens_seen": 19986576, "step": 34445 }, { "epoch": 5.131069407208817, "grad_norm": 0.9652453660964966, "learning_rate": 4.635938394089748e-05, "loss": 0.5889, "num_input_tokens_seen": 19989200, "step": 34450 }, { "epoch": 5.131814119749777, "grad_norm": 1.2235363721847534, "learning_rate": 4.635769517790488e-05, "loss": 0.6915, "num_input_tokens_seen": 19991920, "step": 34455 }, { "epoch": 5.132558832290735, "grad_norm": 0.8196308612823486, "learning_rate": 4.635600605409572e-05, "loss": 0.6049, "num_input_tokens_seen": 19994800, "step": 34460 }, { "epoch": 5.133303544831695, "grad_norm": 1.8097635507583618, "learning_rate": 4.6354316569498545e-05, "loss": 0.5936, "num_input_tokens_seen": 19997680, "step": 34465 }, { "epoch": 5.134048257372654, "grad_norm": 1.7522633075714111, "learning_rate": 4.63526267241419e-05, "loss": 0.664, "num_input_tokens_seen": 20000368, "step": 34470 }, { "epoch": 5.1347929699136134, "grad_norm": 2.5272841453552246, "learning_rate": 4.6350936518054325e-05, "loss": 0.5958, "num_input_tokens_seen": 20003152, "step": 34475 }, { "epoch": 5.135537682454572, "grad_norm": 1.2543648481369019, "learning_rate": 4.634924595126437e-05, "loss": 0.6858, "num_input_tokens_seen": 20006064, "step": 34480 }, { "epoch": 5.136282394995532, "grad_norm": 1.0091115236282349, "learning_rate": 4.6347555023800616e-05, "loss": 0.7186, "num_input_tokens_seen": 20008432, "step": 34485 }, { "epoch": 5.137027107536491, "grad_norm": 0.8710534572601318, "learning_rate": 4.634586373569161e-05, "loss": 0.7165, "num_input_tokens_seen": 20011440, "step": 34490 }, { "epoch": 5.13777182007745, "grad_norm": 0.8949601054191589, "learning_rate": 4.634417208696593e-05, "loss": 0.6393, "num_input_tokens_seen": 20014064, "step": 34495 }, { "epoch": 5.138516532618409, "grad_norm": 1.721434235572815, "learning_rate": 4.634248007765216e-05, "loss": 0.5862, "num_input_tokens_seen": 20016976, "step": 34500 }, { "epoch": 5.139261245159369, "grad_norm": 0.823065996170044, "learning_rate": 4.6340787707778874e-05, "loss": 0.7051, "num_input_tokens_seen": 20020016, "step": 34505 }, { "epoch": 5.140005957700327, "grad_norm": 1.0671961307525635, "learning_rate": 4.633909497737468e-05, "loss": 0.7139, "num_input_tokens_seen": 20022768, "step": 34510 }, { "epoch": 5.140750670241287, "grad_norm": 1.0915822982788086, "learning_rate": 4.6337401886468156e-05, "loss": 0.7337, "num_input_tokens_seen": 20025712, "step": 34515 }, { "epoch": 5.141495382782246, "grad_norm": 1.0214645862579346, "learning_rate": 4.633570843508792e-05, "loss": 0.721, "num_input_tokens_seen": 20028368, "step": 34520 }, { "epoch": 5.1422400953232055, "grad_norm": 1.1286752223968506, "learning_rate": 4.633401462326257e-05, "loss": 0.6607, "num_input_tokens_seen": 20031472, "step": 34525 }, { "epoch": 5.142984807864164, "grad_norm": 2.594846725463867, "learning_rate": 4.633232045102072e-05, "loss": 0.7887, "num_input_tokens_seen": 20034192, "step": 34530 }, { "epoch": 5.143729520405124, "grad_norm": 0.7512416839599609, "learning_rate": 4.6330625918391e-05, "loss": 0.6721, "num_input_tokens_seen": 20037264, "step": 34535 }, { "epoch": 5.144474232946083, "grad_norm": 0.925808310508728, "learning_rate": 4.6328931025402045e-05, "loss": 0.7614, "num_input_tokens_seen": 20040304, "step": 34540 }, { "epoch": 5.145218945487042, "grad_norm": 0.8313277363777161, "learning_rate": 4.6327235772082466e-05, "loss": 0.7588, "num_input_tokens_seen": 20043248, "step": 34545 }, { "epoch": 5.145963658028001, "grad_norm": 1.5481488704681396, "learning_rate": 4.632554015846092e-05, "loss": 0.596, "num_input_tokens_seen": 20046064, "step": 34550 }, { "epoch": 5.146708370568961, "grad_norm": 1.7631648778915405, "learning_rate": 4.6323844184566045e-05, "loss": 0.6137, "num_input_tokens_seen": 20049392, "step": 34555 }, { "epoch": 5.1474530831099194, "grad_norm": 1.5485889911651611, "learning_rate": 4.6322147850426504e-05, "loss": 0.6175, "num_input_tokens_seen": 20052304, "step": 34560 }, { "epoch": 5.148197795650879, "grad_norm": 0.8861560225486755, "learning_rate": 4.6320451156070934e-05, "loss": 0.7438, "num_input_tokens_seen": 20054960, "step": 34565 }, { "epoch": 5.148942508191838, "grad_norm": 0.7479998469352722, "learning_rate": 4.6318754101528014e-05, "loss": 0.7733, "num_input_tokens_seen": 20058000, "step": 34570 }, { "epoch": 5.1496872207327975, "grad_norm": 0.9367376565933228, "learning_rate": 4.631705668682641e-05, "loss": 0.7271, "num_input_tokens_seen": 20061072, "step": 34575 }, { "epoch": 5.150431933273756, "grad_norm": 1.101477026939392, "learning_rate": 4.631535891199481e-05, "loss": 0.7795, "num_input_tokens_seen": 20063824, "step": 34580 }, { "epoch": 5.151176645814716, "grad_norm": 1.2050701379776, "learning_rate": 4.6313660777061874e-05, "loss": 0.7744, "num_input_tokens_seen": 20066896, "step": 34585 }, { "epoch": 5.151921358355675, "grad_norm": 1.1629180908203125, "learning_rate": 4.63119622820563e-05, "loss": 0.6526, "num_input_tokens_seen": 20069648, "step": 34590 }, { "epoch": 5.152666070896634, "grad_norm": 0.6268573999404907, "learning_rate": 4.6310263427006786e-05, "loss": 0.6703, "num_input_tokens_seen": 20072528, "step": 34595 }, { "epoch": 5.153410783437593, "grad_norm": 1.367041826248169, "learning_rate": 4.6308564211942044e-05, "loss": 0.6573, "num_input_tokens_seen": 20075504, "step": 34600 }, { "epoch": 5.154155495978552, "grad_norm": 1.21519935131073, "learning_rate": 4.6306864636890745e-05, "loss": 0.6515, "num_input_tokens_seen": 20078672, "step": 34605 }, { "epoch": 5.1549002085195115, "grad_norm": 0.6233052611351013, "learning_rate": 4.6305164701881634e-05, "loss": 0.5593, "num_input_tokens_seen": 20081296, "step": 34610 }, { "epoch": 5.15564492106047, "grad_norm": 1.2662347555160522, "learning_rate": 4.6303464406943416e-05, "loss": 0.5496, "num_input_tokens_seen": 20084208, "step": 34615 }, { "epoch": 5.15638963360143, "grad_norm": 1.7668830156326294, "learning_rate": 4.630176375210482e-05, "loss": 0.6349, "num_input_tokens_seen": 20087088, "step": 34620 }, { "epoch": 5.157134346142389, "grad_norm": 1.386165976524353, "learning_rate": 4.630006273739458e-05, "loss": 0.7556, "num_input_tokens_seen": 20090032, "step": 34625 }, { "epoch": 5.157879058683348, "grad_norm": 1.0707162618637085, "learning_rate": 4.629836136284143e-05, "loss": 0.4713, "num_input_tokens_seen": 20092848, "step": 34630 }, { "epoch": 5.158623771224307, "grad_norm": 0.7049884796142578, "learning_rate": 4.62966596284741e-05, "loss": 0.6305, "num_input_tokens_seen": 20095792, "step": 34635 }, { "epoch": 5.159368483765267, "grad_norm": 1.5604193210601807, "learning_rate": 4.629495753432136e-05, "loss": 0.6425, "num_input_tokens_seen": 20099056, "step": 34640 }, { "epoch": 5.1601131963062254, "grad_norm": 1.144996166229248, "learning_rate": 4.629325508041195e-05, "loss": 0.5594, "num_input_tokens_seen": 20101712, "step": 34645 }, { "epoch": 5.160857908847185, "grad_norm": 0.9882920384407043, "learning_rate": 4.629155226677464e-05, "loss": 0.5852, "num_input_tokens_seen": 20104720, "step": 34650 }, { "epoch": 5.161602621388144, "grad_norm": 1.1404693126678467, "learning_rate": 4.628984909343819e-05, "loss": 0.6665, "num_input_tokens_seen": 20107632, "step": 34655 }, { "epoch": 5.1623473339291035, "grad_norm": 0.7660881280899048, "learning_rate": 4.6288145560431385e-05, "loss": 0.6428, "num_input_tokens_seen": 20110768, "step": 34660 }, { "epoch": 5.163092046470062, "grad_norm": 1.81410813331604, "learning_rate": 4.628644166778299e-05, "loss": 0.7501, "num_input_tokens_seen": 20113584, "step": 34665 }, { "epoch": 5.163836759011022, "grad_norm": 0.8287895321846008, "learning_rate": 4.6284737415521806e-05, "loss": 0.6286, "num_input_tokens_seen": 20116656, "step": 34670 }, { "epoch": 5.164581471551981, "grad_norm": 1.7568386793136597, "learning_rate": 4.628303280367661e-05, "loss": 0.53, "num_input_tokens_seen": 20119696, "step": 34675 }, { "epoch": 5.16532618409294, "grad_norm": 0.932927131652832, "learning_rate": 4.6281327832276204e-05, "loss": 0.586, "num_input_tokens_seen": 20122448, "step": 34680 }, { "epoch": 5.166070896633899, "grad_norm": 1.0930978059768677, "learning_rate": 4.627962250134939e-05, "loss": 0.6587, "num_input_tokens_seen": 20125328, "step": 34685 }, { "epoch": 5.166815609174859, "grad_norm": 2.1614573001861572, "learning_rate": 4.627791681092499e-05, "loss": 0.6872, "num_input_tokens_seen": 20128272, "step": 34690 }, { "epoch": 5.1675603217158175, "grad_norm": 0.9098652601242065, "learning_rate": 4.627621076103181e-05, "loss": 0.4789, "num_input_tokens_seen": 20131216, "step": 34695 }, { "epoch": 5.168305034256777, "grad_norm": 1.2296085357666016, "learning_rate": 4.627450435169868e-05, "loss": 0.6462, "num_input_tokens_seen": 20134384, "step": 34700 }, { "epoch": 5.169049746797736, "grad_norm": 1.8581088781356812, "learning_rate": 4.627279758295441e-05, "loss": 0.6861, "num_input_tokens_seen": 20137360, "step": 34705 }, { "epoch": 5.1697944593386955, "grad_norm": 0.8363774418830872, "learning_rate": 4.627109045482785e-05, "loss": 0.5713, "num_input_tokens_seen": 20140432, "step": 34710 }, { "epoch": 5.170539171879654, "grad_norm": 0.725006103515625, "learning_rate": 4.626938296734784e-05, "loss": 0.5693, "num_input_tokens_seen": 20143344, "step": 34715 }, { "epoch": 5.171283884420614, "grad_norm": 1.842197060585022, "learning_rate": 4.626767512054321e-05, "loss": 0.7413, "num_input_tokens_seen": 20146384, "step": 34720 }, { "epoch": 5.172028596961573, "grad_norm": 1.3186157941818237, "learning_rate": 4.626596691444284e-05, "loss": 0.6291, "num_input_tokens_seen": 20149136, "step": 34725 }, { "epoch": 5.172773309502532, "grad_norm": 1.1481807231903076, "learning_rate": 4.626425834907556e-05, "loss": 0.6618, "num_input_tokens_seen": 20151824, "step": 34730 }, { "epoch": 5.173518022043491, "grad_norm": 0.760738730430603, "learning_rate": 4.6262549424470253e-05, "loss": 0.6022, "num_input_tokens_seen": 20154544, "step": 34735 }, { "epoch": 5.174262734584451, "grad_norm": 0.9314637184143066, "learning_rate": 4.626084014065578e-05, "loss": 0.5818, "num_input_tokens_seen": 20157488, "step": 34740 }, { "epoch": 5.1750074471254095, "grad_norm": 1.0512055158615112, "learning_rate": 4.625913049766103e-05, "loss": 0.6531, "num_input_tokens_seen": 20159984, "step": 34745 }, { "epoch": 5.175752159666369, "grad_norm": 1.0222011804580688, "learning_rate": 4.625742049551487e-05, "loss": 0.6245, "num_input_tokens_seen": 20162704, "step": 34750 }, { "epoch": 5.176496872207328, "grad_norm": 2.0511322021484375, "learning_rate": 4.6255710134246197e-05, "loss": 0.7392, "num_input_tokens_seen": 20165616, "step": 34755 }, { "epoch": 5.1772415847482876, "grad_norm": 1.175645112991333, "learning_rate": 4.6253999413883905e-05, "loss": 0.672, "num_input_tokens_seen": 20168432, "step": 34760 }, { "epoch": 5.177986297289246, "grad_norm": 0.8907849788665771, "learning_rate": 4.6252288334456887e-05, "loss": 0.8125, "num_input_tokens_seen": 20171408, "step": 34765 }, { "epoch": 5.178731009830206, "grad_norm": 0.5859609842300415, "learning_rate": 4.625057689599407e-05, "loss": 0.5507, "num_input_tokens_seen": 20174224, "step": 34770 }, { "epoch": 5.179475722371165, "grad_norm": 1.490386962890625, "learning_rate": 4.6248865098524346e-05, "loss": 0.8078, "num_input_tokens_seen": 20177360, "step": 34775 }, { "epoch": 5.180220434912124, "grad_norm": 0.9126954674720764, "learning_rate": 4.6247152942076646e-05, "loss": 0.5755, "num_input_tokens_seen": 20180176, "step": 34780 }, { "epoch": 5.180965147453083, "grad_norm": 1.0447152853012085, "learning_rate": 4.624544042667989e-05, "loss": 0.617, "num_input_tokens_seen": 20183024, "step": 34785 }, { "epoch": 5.181709859994042, "grad_norm": 0.864642322063446, "learning_rate": 4.624372755236301e-05, "loss": 0.6859, "num_input_tokens_seen": 20185872, "step": 34790 }, { "epoch": 5.1824545725350015, "grad_norm": 1.0736372470855713, "learning_rate": 4.624201431915495e-05, "loss": 0.6001, "num_input_tokens_seen": 20188592, "step": 34795 }, { "epoch": 5.18319928507596, "grad_norm": 0.9731498956680298, "learning_rate": 4.624030072708464e-05, "loss": 0.7305, "num_input_tokens_seen": 20191504, "step": 34800 }, { "epoch": 5.18394399761692, "grad_norm": 0.9998155236244202, "learning_rate": 4.623858677618104e-05, "loss": 0.5604, "num_input_tokens_seen": 20194576, "step": 34805 }, { "epoch": 5.184688710157879, "grad_norm": 0.8952499032020569, "learning_rate": 4.62368724664731e-05, "loss": 0.7059, "num_input_tokens_seen": 20197456, "step": 34810 }, { "epoch": 5.185433422698838, "grad_norm": 1.1360944509506226, "learning_rate": 4.623515779798979e-05, "loss": 0.7228, "num_input_tokens_seen": 20200368, "step": 34815 }, { "epoch": 5.186178135239797, "grad_norm": 1.7538098096847534, "learning_rate": 4.623344277076007e-05, "loss": 0.6445, "num_input_tokens_seen": 20203152, "step": 34820 }, { "epoch": 5.186922847780757, "grad_norm": 1.1542778015136719, "learning_rate": 4.6231727384812916e-05, "loss": 0.5844, "num_input_tokens_seen": 20205840, "step": 34825 }, { "epoch": 5.1876675603217155, "grad_norm": 0.9072728753089905, "learning_rate": 4.62300116401773e-05, "loss": 0.5963, "num_input_tokens_seen": 20208656, "step": 34830 }, { "epoch": 5.188412272862675, "grad_norm": 1.3361515998840332, "learning_rate": 4.622829553688222e-05, "loss": 0.6535, "num_input_tokens_seen": 20211344, "step": 34835 }, { "epoch": 5.189156985403634, "grad_norm": 1.5134336948394775, "learning_rate": 4.622657907495667e-05, "loss": 0.6773, "num_input_tokens_seen": 20214288, "step": 34840 }, { "epoch": 5.1899016979445936, "grad_norm": 1.498673915863037, "learning_rate": 4.6224862254429623e-05, "loss": 0.6202, "num_input_tokens_seen": 20216912, "step": 34845 }, { "epoch": 5.190646410485552, "grad_norm": 1.0596556663513184, "learning_rate": 4.622314507533011e-05, "loss": 0.5657, "num_input_tokens_seen": 20219984, "step": 34850 }, { "epoch": 5.191391123026512, "grad_norm": 1.022755742073059, "learning_rate": 4.622142753768713e-05, "loss": 0.4541, "num_input_tokens_seen": 20222736, "step": 34855 }, { "epoch": 5.192135835567471, "grad_norm": 1.4700831174850464, "learning_rate": 4.6219709641529695e-05, "loss": 0.6594, "num_input_tokens_seen": 20225488, "step": 34860 }, { "epoch": 5.19288054810843, "grad_norm": 1.1970138549804688, "learning_rate": 4.621799138688684e-05, "loss": 0.7548, "num_input_tokens_seen": 20228336, "step": 34865 }, { "epoch": 5.193625260649389, "grad_norm": 1.191615343093872, "learning_rate": 4.6216272773787586e-05, "loss": 0.6551, "num_input_tokens_seen": 20231568, "step": 34870 }, { "epoch": 5.194369973190349, "grad_norm": 1.0753486156463623, "learning_rate": 4.621455380226096e-05, "loss": 0.7622, "num_input_tokens_seen": 20234512, "step": 34875 }, { "epoch": 5.1951146857313075, "grad_norm": 1.1770179271697998, "learning_rate": 4.6212834472336016e-05, "loss": 0.6086, "num_input_tokens_seen": 20237456, "step": 34880 }, { "epoch": 5.195859398272267, "grad_norm": 0.8370994329452515, "learning_rate": 4.6211114784041784e-05, "loss": 0.6657, "num_input_tokens_seen": 20240848, "step": 34885 }, { "epoch": 5.196604110813226, "grad_norm": 1.36841881275177, "learning_rate": 4.620939473740733e-05, "loss": 0.7682, "num_input_tokens_seen": 20243696, "step": 34890 }, { "epoch": 5.197348823354186, "grad_norm": 0.9242243766784668, "learning_rate": 4.6207674332461716e-05, "loss": 0.5482, "num_input_tokens_seen": 20246608, "step": 34895 }, { "epoch": 5.198093535895144, "grad_norm": 0.8941484093666077, "learning_rate": 4.620595356923399e-05, "loss": 0.5239, "num_input_tokens_seen": 20249360, "step": 34900 }, { "epoch": 5.198838248436104, "grad_norm": 0.9317057132720947, "learning_rate": 4.620423244775323e-05, "loss": 0.6851, "num_input_tokens_seen": 20252464, "step": 34905 }, { "epoch": 5.199582960977063, "grad_norm": 1.3147895336151123, "learning_rate": 4.6202510968048515e-05, "loss": 0.6668, "num_input_tokens_seen": 20255056, "step": 34910 }, { "epoch": 5.200327673518022, "grad_norm": 1.9372271299362183, "learning_rate": 4.620078913014893e-05, "loss": 0.6431, "num_input_tokens_seen": 20258160, "step": 34915 }, { "epoch": 5.201072386058981, "grad_norm": 1.551796555519104, "learning_rate": 4.619906693408357e-05, "loss": 0.6729, "num_input_tokens_seen": 20261136, "step": 34920 }, { "epoch": 5.201817098599941, "grad_norm": 0.9119646549224854, "learning_rate": 4.61973443798815e-05, "loss": 0.6274, "num_input_tokens_seen": 20264176, "step": 34925 }, { "epoch": 5.2025618111408996, "grad_norm": 1.1198762655258179, "learning_rate": 4.6195621467571856e-05, "loss": 0.6399, "num_input_tokens_seen": 20266960, "step": 34930 }, { "epoch": 5.203306523681859, "grad_norm": 1.1243510246276855, "learning_rate": 4.619389819718371e-05, "loss": 0.6781, "num_input_tokens_seen": 20269776, "step": 34935 }, { "epoch": 5.204051236222818, "grad_norm": 1.2226324081420898, "learning_rate": 4.619217456874622e-05, "loss": 0.6843, "num_input_tokens_seen": 20272272, "step": 34940 }, { "epoch": 5.204795948763778, "grad_norm": 0.894498348236084, "learning_rate": 4.619045058228847e-05, "loss": 0.5975, "num_input_tokens_seen": 20275088, "step": 34945 }, { "epoch": 5.205540661304736, "grad_norm": 1.4039422273635864, "learning_rate": 4.6188726237839586e-05, "loss": 0.6363, "num_input_tokens_seen": 20277808, "step": 34950 }, { "epoch": 5.206285373845695, "grad_norm": 0.641571044921875, "learning_rate": 4.6187001535428716e-05, "loss": 0.5718, "num_input_tokens_seen": 20280496, "step": 34955 }, { "epoch": 5.207030086386655, "grad_norm": 0.6255486011505127, "learning_rate": 4.618527647508498e-05, "loss": 0.5553, "num_input_tokens_seen": 20283824, "step": 34960 }, { "epoch": 5.2077747989276135, "grad_norm": 1.0280005931854248, "learning_rate": 4.618355105683754e-05, "loss": 0.5682, "num_input_tokens_seen": 20286864, "step": 34965 }, { "epoch": 5.208519511468573, "grad_norm": 0.8539465069770813, "learning_rate": 4.618182528071553e-05, "loss": 0.5724, "num_input_tokens_seen": 20289648, "step": 34970 }, { "epoch": 5.209264224009532, "grad_norm": 1.5849038362503052, "learning_rate": 4.618009914674811e-05, "loss": 0.8143, "num_input_tokens_seen": 20293072, "step": 34975 }, { "epoch": 5.210008936550492, "grad_norm": 0.8046510815620422, "learning_rate": 4.617837265496444e-05, "loss": 0.6277, "num_input_tokens_seen": 20295920, "step": 34980 }, { "epoch": 5.21075364909145, "grad_norm": 0.8392783999443054, "learning_rate": 4.617664580539369e-05, "loss": 0.604, "num_input_tokens_seen": 20298768, "step": 34985 }, { "epoch": 5.21149836163241, "grad_norm": 0.7874752879142761, "learning_rate": 4.6174918598065034e-05, "loss": 0.4982, "num_input_tokens_seen": 20301584, "step": 34990 }, { "epoch": 5.212243074173369, "grad_norm": 1.4524343013763428, "learning_rate": 4.617319103300764e-05, "loss": 0.6635, "num_input_tokens_seen": 20304624, "step": 34995 }, { "epoch": 5.212987786714328, "grad_norm": 1.2104957103729248, "learning_rate": 4.6171463110250725e-05, "loss": 0.6081, "num_input_tokens_seen": 20307664, "step": 35000 }, { "epoch": 5.213732499255287, "grad_norm": 0.8043115735054016, "learning_rate": 4.616973482982344e-05, "loss": 0.7613, "num_input_tokens_seen": 20310640, "step": 35005 }, { "epoch": 5.214477211796247, "grad_norm": 1.097558856010437, "learning_rate": 4.616800619175501e-05, "loss": 0.7459, "num_input_tokens_seen": 20313328, "step": 35010 }, { "epoch": 5.2152219243372056, "grad_norm": 1.1157865524291992, "learning_rate": 4.616627719607462e-05, "loss": 0.6774, "num_input_tokens_seen": 20316080, "step": 35015 }, { "epoch": 5.215966636878165, "grad_norm": 0.9533094763755798, "learning_rate": 4.6164547842811494e-05, "loss": 0.7019, "num_input_tokens_seen": 20318960, "step": 35020 }, { "epoch": 5.216711349419124, "grad_norm": 0.7444694638252258, "learning_rate": 4.6162818131994845e-05, "loss": 0.5436, "num_input_tokens_seen": 20321936, "step": 35025 }, { "epoch": 5.217456061960084, "grad_norm": 1.3804377317428589, "learning_rate": 4.616108806365389e-05, "loss": 0.7252, "num_input_tokens_seen": 20325072, "step": 35030 }, { "epoch": 5.218200774501042, "grad_norm": 1.3967747688293457, "learning_rate": 4.6159357637817855e-05, "loss": 0.8678, "num_input_tokens_seen": 20328272, "step": 35035 }, { "epoch": 5.218945487042002, "grad_norm": 0.9534544944763184, "learning_rate": 4.6157626854515986e-05, "loss": 0.7167, "num_input_tokens_seen": 20331152, "step": 35040 }, { "epoch": 5.219690199582961, "grad_norm": 0.8430191278457642, "learning_rate": 4.615589571377752e-05, "loss": 0.5769, "num_input_tokens_seen": 20334000, "step": 35045 }, { "epoch": 5.22043491212392, "grad_norm": 1.0452700853347778, "learning_rate": 4.6154164215631685e-05, "loss": 0.7958, "num_input_tokens_seen": 20336976, "step": 35050 }, { "epoch": 5.221179624664879, "grad_norm": 1.3725706338882446, "learning_rate": 4.615243236010775e-05, "loss": 0.6319, "num_input_tokens_seen": 20339952, "step": 35055 }, { "epoch": 5.221924337205839, "grad_norm": 1.7370814085006714, "learning_rate": 4.615070014723497e-05, "loss": 0.6182, "num_input_tokens_seen": 20342832, "step": 35060 }, { "epoch": 5.222669049746798, "grad_norm": 1.5809282064437866, "learning_rate": 4.614896757704261e-05, "loss": 0.5722, "num_input_tokens_seen": 20345584, "step": 35065 }, { "epoch": 5.223413762287757, "grad_norm": 0.6165457963943481, "learning_rate": 4.614723464955993e-05, "loss": 0.5324, "num_input_tokens_seen": 20348432, "step": 35070 }, { "epoch": 5.224158474828716, "grad_norm": 1.214214563369751, "learning_rate": 4.6145501364816226e-05, "loss": 0.6374, "num_input_tokens_seen": 20351632, "step": 35075 }, { "epoch": 5.224903187369676, "grad_norm": 0.9867839813232422, "learning_rate": 4.614376772284075e-05, "loss": 0.6923, "num_input_tokens_seen": 20354608, "step": 35080 }, { "epoch": 5.225647899910634, "grad_norm": 1.5182476043701172, "learning_rate": 4.6142033723662825e-05, "loss": 0.8905, "num_input_tokens_seen": 20357456, "step": 35085 }, { "epoch": 5.226392612451594, "grad_norm": 1.0870457887649536, "learning_rate": 4.614029936731172e-05, "loss": 0.7044, "num_input_tokens_seen": 20360784, "step": 35090 }, { "epoch": 5.227137324992553, "grad_norm": 1.6208932399749756, "learning_rate": 4.613856465381674e-05, "loss": 0.5327, "num_input_tokens_seen": 20363600, "step": 35095 }, { "epoch": 5.227882037533512, "grad_norm": 1.3570375442504883, "learning_rate": 4.6136829583207197e-05, "loss": 0.7734, "num_input_tokens_seen": 20366608, "step": 35100 }, { "epoch": 5.228626750074471, "grad_norm": 1.0380178689956665, "learning_rate": 4.6135094155512405e-05, "loss": 0.6385, "num_input_tokens_seen": 20369712, "step": 35105 }, { "epoch": 5.229371462615431, "grad_norm": 1.1599782705307007, "learning_rate": 4.613335837076168e-05, "loss": 0.7257, "num_input_tokens_seen": 20372880, "step": 35110 }, { "epoch": 5.23011617515639, "grad_norm": 1.2451436519622803, "learning_rate": 4.6131622228984336e-05, "loss": 0.4955, "num_input_tokens_seen": 20375760, "step": 35115 }, { "epoch": 5.230860887697349, "grad_norm": 1.2387967109680176, "learning_rate": 4.6129885730209715e-05, "loss": 0.6285, "num_input_tokens_seen": 20378704, "step": 35120 }, { "epoch": 5.231605600238308, "grad_norm": 0.9260090589523315, "learning_rate": 4.612814887446715e-05, "loss": 0.5882, "num_input_tokens_seen": 20381360, "step": 35125 }, { "epoch": 5.232350312779268, "grad_norm": 1.2143882513046265, "learning_rate": 4.6126411661785984e-05, "loss": 0.6989, "num_input_tokens_seen": 20384240, "step": 35130 }, { "epoch": 5.233095025320226, "grad_norm": 1.6647900342941284, "learning_rate": 4.612467409219556e-05, "loss": 0.5872, "num_input_tokens_seen": 20387024, "step": 35135 }, { "epoch": 5.233839737861185, "grad_norm": 1.4411979913711548, "learning_rate": 4.612293616572525e-05, "loss": 0.6437, "num_input_tokens_seen": 20390192, "step": 35140 }, { "epoch": 5.234584450402145, "grad_norm": 1.6187280416488647, "learning_rate": 4.612119788240439e-05, "loss": 0.6633, "num_input_tokens_seen": 20393232, "step": 35145 }, { "epoch": 5.235329162943104, "grad_norm": 2.0548717975616455, "learning_rate": 4.6119459242262366e-05, "loss": 0.6731, "num_input_tokens_seen": 20396368, "step": 35150 }, { "epoch": 5.236073875484063, "grad_norm": 1.2665175199508667, "learning_rate": 4.611772024532854e-05, "loss": 0.7198, "num_input_tokens_seen": 20399280, "step": 35155 }, { "epoch": 5.236818588025022, "grad_norm": 1.3461850881576538, "learning_rate": 4.611598089163229e-05, "loss": 0.7659, "num_input_tokens_seen": 20402256, "step": 35160 }, { "epoch": 5.237563300565982, "grad_norm": 1.082144021987915, "learning_rate": 4.611424118120301e-05, "loss": 0.5447, "num_input_tokens_seen": 20404880, "step": 35165 }, { "epoch": 5.23830801310694, "grad_norm": 0.6783885955810547, "learning_rate": 4.611250111407008e-05, "loss": 0.5988, "num_input_tokens_seen": 20408016, "step": 35170 }, { "epoch": 5.2390527256479, "grad_norm": 0.9982669949531555, "learning_rate": 4.6110760690262907e-05, "loss": 0.709, "num_input_tokens_seen": 20410832, "step": 35175 }, { "epoch": 5.239797438188859, "grad_norm": 1.7246558666229248, "learning_rate": 4.610901990981088e-05, "loss": 0.7473, "num_input_tokens_seen": 20413520, "step": 35180 }, { "epoch": 5.240542150729818, "grad_norm": 0.7537409663200378, "learning_rate": 4.6107278772743426e-05, "loss": 0.5274, "num_input_tokens_seen": 20416528, "step": 35185 }, { "epoch": 5.241286863270777, "grad_norm": 1.4384667873382568, "learning_rate": 4.610553727908994e-05, "loss": 0.5714, "num_input_tokens_seen": 20419504, "step": 35190 }, { "epoch": 5.242031575811737, "grad_norm": 0.8494195342063904, "learning_rate": 4.6103795428879856e-05, "loss": 0.5714, "num_input_tokens_seen": 20422352, "step": 35195 }, { "epoch": 5.242776288352696, "grad_norm": 0.8382068872451782, "learning_rate": 4.6102053222142595e-05, "loss": 0.6676, "num_input_tokens_seen": 20425328, "step": 35200 }, { "epoch": 5.243521000893655, "grad_norm": 0.8994199633598328, "learning_rate": 4.61003106589076e-05, "loss": 0.6808, "num_input_tokens_seen": 20428272, "step": 35205 }, { "epoch": 5.244265713434614, "grad_norm": 2.1956534385681152, "learning_rate": 4.6098567739204294e-05, "loss": 0.6481, "num_input_tokens_seen": 20431280, "step": 35210 }, { "epoch": 5.245010425975574, "grad_norm": 0.8493280410766602, "learning_rate": 4.609682446306213e-05, "loss": 0.645, "num_input_tokens_seen": 20433936, "step": 35215 }, { "epoch": 5.245755138516532, "grad_norm": 1.0841196775436401, "learning_rate": 4.6095080830510564e-05, "loss": 0.8278, "num_input_tokens_seen": 20437008, "step": 35220 }, { "epoch": 5.246499851057492, "grad_norm": 1.0133799314498901, "learning_rate": 4.6093336841579044e-05, "loss": 0.6469, "num_input_tokens_seen": 20439824, "step": 35225 }, { "epoch": 5.247244563598451, "grad_norm": 1.3787106275558472, "learning_rate": 4.609159249629704e-05, "loss": 0.6317, "num_input_tokens_seen": 20442640, "step": 35230 }, { "epoch": 5.2479892761394105, "grad_norm": 1.216461420059204, "learning_rate": 4.6089847794694005e-05, "loss": 0.6575, "num_input_tokens_seen": 20445488, "step": 35235 }, { "epoch": 5.248733988680369, "grad_norm": 1.0352237224578857, "learning_rate": 4.6088102736799445e-05, "loss": 0.5621, "num_input_tokens_seen": 20448304, "step": 35240 }, { "epoch": 5.249478701221329, "grad_norm": 0.8161165714263916, "learning_rate": 4.60863573226428e-05, "loss": 0.6159, "num_input_tokens_seen": 20451440, "step": 35245 }, { "epoch": 5.250223413762288, "grad_norm": 1.6236940622329712, "learning_rate": 4.6084611552253595e-05, "loss": 0.6933, "num_input_tokens_seen": 20454480, "step": 35250 }, { "epoch": 5.250968126303247, "grad_norm": 0.9047327637672424, "learning_rate": 4.6082865425661307e-05, "loss": 0.5996, "num_input_tokens_seen": 20457648, "step": 35255 }, { "epoch": 5.251712838844206, "grad_norm": 1.1752551794052124, "learning_rate": 4.608111894289543e-05, "loss": 0.5503, "num_input_tokens_seen": 20460496, "step": 35260 }, { "epoch": 5.252457551385166, "grad_norm": 1.2775295972824097, "learning_rate": 4.607937210398548e-05, "loss": 0.7782, "num_input_tokens_seen": 20463408, "step": 35265 }, { "epoch": 5.253202263926124, "grad_norm": 1.15651535987854, "learning_rate": 4.607762490896096e-05, "loss": 0.6392, "num_input_tokens_seen": 20466384, "step": 35270 }, { "epoch": 5.253946976467084, "grad_norm": 0.573189377784729, "learning_rate": 4.6075877357851384e-05, "loss": 0.7004, "num_input_tokens_seen": 20470064, "step": 35275 }, { "epoch": 5.254691689008043, "grad_norm": 0.9636988043785095, "learning_rate": 4.607412945068629e-05, "loss": 0.762, "num_input_tokens_seen": 20473072, "step": 35280 }, { "epoch": 5.2554364015490025, "grad_norm": 1.0234946012496948, "learning_rate": 4.607238118749519e-05, "loss": 0.6375, "num_input_tokens_seen": 20476176, "step": 35285 }, { "epoch": 5.256181114089961, "grad_norm": 1.253599762916565, "learning_rate": 4.607063256830763e-05, "loss": 0.6059, "num_input_tokens_seen": 20479056, "step": 35290 }, { "epoch": 5.256925826630921, "grad_norm": 1.013322114944458, "learning_rate": 4.6068883593153147e-05, "loss": 0.5804, "num_input_tokens_seen": 20481904, "step": 35295 }, { "epoch": 5.25767053917188, "grad_norm": 1.8799097537994385, "learning_rate": 4.606713426206129e-05, "loss": 0.6616, "num_input_tokens_seen": 20484656, "step": 35300 }, { "epoch": 5.258415251712838, "grad_norm": 1.0216422080993652, "learning_rate": 4.606538457506161e-05, "loss": 0.5778, "num_input_tokens_seen": 20487632, "step": 35305 }, { "epoch": 5.259159964253798, "grad_norm": 0.800462543964386, "learning_rate": 4.606363453218367e-05, "loss": 0.6015, "num_input_tokens_seen": 20490448, "step": 35310 }, { "epoch": 5.259904676794757, "grad_norm": 1.216198205947876, "learning_rate": 4.606188413345704e-05, "loss": 0.5254, "num_input_tokens_seen": 20493168, "step": 35315 }, { "epoch": 5.2606493893357165, "grad_norm": 1.0259227752685547, "learning_rate": 4.6060133378911265e-05, "loss": 0.6702, "num_input_tokens_seen": 20496016, "step": 35320 }, { "epoch": 5.261394101876675, "grad_norm": 1.3969401121139526, "learning_rate": 4.605838226857595e-05, "loss": 0.5797, "num_input_tokens_seen": 20498992, "step": 35325 }, { "epoch": 5.262138814417635, "grad_norm": 1.393334984779358, "learning_rate": 4.605663080248067e-05, "loss": 0.7182, "num_input_tokens_seen": 20502256, "step": 35330 }, { "epoch": 5.262883526958594, "grad_norm": 1.725060224533081, "learning_rate": 4.6054878980655015e-05, "loss": 0.6924, "num_input_tokens_seen": 20505328, "step": 35335 }, { "epoch": 5.263628239499553, "grad_norm": 0.8622992038726807, "learning_rate": 4.605312680312858e-05, "loss": 0.5576, "num_input_tokens_seen": 20508272, "step": 35340 }, { "epoch": 5.264372952040512, "grad_norm": 1.0179808139801025, "learning_rate": 4.605137426993096e-05, "loss": 0.6661, "num_input_tokens_seen": 20511280, "step": 35345 }, { "epoch": 5.265117664581472, "grad_norm": 0.9883893728256226, "learning_rate": 4.6049621381091776e-05, "loss": 0.4952, "num_input_tokens_seen": 20514256, "step": 35350 }, { "epoch": 5.26586237712243, "grad_norm": 0.7998796701431274, "learning_rate": 4.604786813664063e-05, "loss": 0.7115, "num_input_tokens_seen": 20517296, "step": 35355 }, { "epoch": 5.26660708966339, "grad_norm": 1.8259915113449097, "learning_rate": 4.6046114536607133e-05, "loss": 0.5951, "num_input_tokens_seen": 20520304, "step": 35360 }, { "epoch": 5.267351802204349, "grad_norm": 1.1151765584945679, "learning_rate": 4.604436058102093e-05, "loss": 0.5488, "num_input_tokens_seen": 20523248, "step": 35365 }, { "epoch": 5.2680965147453085, "grad_norm": 1.261250376701355, "learning_rate": 4.6042606269911645e-05, "loss": 0.6194, "num_input_tokens_seen": 20526000, "step": 35370 }, { "epoch": 5.268841227286267, "grad_norm": 0.7082167863845825, "learning_rate": 4.604085160330891e-05, "loss": 0.5295, "num_input_tokens_seen": 20528816, "step": 35375 }, { "epoch": 5.269585939827227, "grad_norm": 0.6426918506622314, "learning_rate": 4.603909658124238e-05, "loss": 0.6936, "num_input_tokens_seen": 20531440, "step": 35380 }, { "epoch": 5.270330652368186, "grad_norm": 0.9437152743339539, "learning_rate": 4.6037341203741686e-05, "loss": 0.6843, "num_input_tokens_seen": 20534576, "step": 35385 }, { "epoch": 5.271075364909145, "grad_norm": 1.34348726272583, "learning_rate": 4.6035585470836494e-05, "loss": 0.73, "num_input_tokens_seen": 20537328, "step": 35390 }, { "epoch": 5.271820077450104, "grad_norm": 0.7015059590339661, "learning_rate": 4.603382938255647e-05, "loss": 0.515, "num_input_tokens_seen": 20540272, "step": 35395 }, { "epoch": 5.272564789991064, "grad_norm": 0.9555190205574036, "learning_rate": 4.603207293893128e-05, "loss": 0.6681, "num_input_tokens_seen": 20543088, "step": 35400 }, { "epoch": 5.2733095025320225, "grad_norm": 1.0370099544525146, "learning_rate": 4.6030316139990595e-05, "loss": 0.7367, "num_input_tokens_seen": 20546032, "step": 35405 }, { "epoch": 5.274054215072982, "grad_norm": 0.9267684817314148, "learning_rate": 4.602855898576408e-05, "loss": 0.6255, "num_input_tokens_seen": 20548784, "step": 35410 }, { "epoch": 5.274798927613941, "grad_norm": 0.9263507127761841, "learning_rate": 4.6026801476281436e-05, "loss": 0.5597, "num_input_tokens_seen": 20551856, "step": 35415 }, { "epoch": 5.2755436401549005, "grad_norm": 1.8372154235839844, "learning_rate": 4.602504361157236e-05, "loss": 0.6144, "num_input_tokens_seen": 20554384, "step": 35420 }, { "epoch": 5.276288352695859, "grad_norm": 0.8790621757507324, "learning_rate": 4.602328539166654e-05, "loss": 0.5729, "num_input_tokens_seen": 20557264, "step": 35425 }, { "epoch": 5.277033065236819, "grad_norm": 2.0548813343048096, "learning_rate": 4.602152681659368e-05, "loss": 0.6346, "num_input_tokens_seen": 20560016, "step": 35430 }, { "epoch": 5.277777777777778, "grad_norm": 0.8517191410064697, "learning_rate": 4.601976788638349e-05, "loss": 0.6297, "num_input_tokens_seen": 20563216, "step": 35435 }, { "epoch": 5.278522490318737, "grad_norm": 0.9360923767089844, "learning_rate": 4.601800860106568e-05, "loss": 0.6829, "num_input_tokens_seen": 20566000, "step": 35440 }, { "epoch": 5.279267202859696, "grad_norm": 0.7094508409500122, "learning_rate": 4.601624896066998e-05, "loss": 0.6647, "num_input_tokens_seen": 20568752, "step": 35445 }, { "epoch": 5.280011915400656, "grad_norm": 0.907315731048584, "learning_rate": 4.601448896522611e-05, "loss": 0.6104, "num_input_tokens_seen": 20571664, "step": 35450 }, { "epoch": 5.2807566279416145, "grad_norm": 1.6220135688781738, "learning_rate": 4.601272861476381e-05, "loss": 0.6786, "num_input_tokens_seen": 20574448, "step": 35455 }, { "epoch": 5.281501340482574, "grad_norm": 1.398934245109558, "learning_rate": 4.601096790931282e-05, "loss": 0.6458, "num_input_tokens_seen": 20577616, "step": 35460 }, { "epoch": 5.282246053023533, "grad_norm": 0.8701167702674866, "learning_rate": 4.6009206848902874e-05, "loss": 0.5362, "num_input_tokens_seen": 20580848, "step": 35465 }, { "epoch": 5.282990765564492, "grad_norm": 0.9729161858558655, "learning_rate": 4.6007445433563734e-05, "loss": 0.5587, "num_input_tokens_seen": 20583760, "step": 35470 }, { "epoch": 5.283735478105451, "grad_norm": 2.085768461227417, "learning_rate": 4.600568366332516e-05, "loss": 0.6942, "num_input_tokens_seen": 20586768, "step": 35475 }, { "epoch": 5.284480190646411, "grad_norm": 0.9340671896934509, "learning_rate": 4.60039215382169e-05, "loss": 0.6052, "num_input_tokens_seen": 20589712, "step": 35480 }, { "epoch": 5.28522490318737, "grad_norm": 1.8950965404510498, "learning_rate": 4.6002159058268744e-05, "loss": 0.6106, "num_input_tokens_seen": 20592368, "step": 35485 }, { "epoch": 5.2859696157283285, "grad_norm": 2.6774981021881104, "learning_rate": 4.600039622351045e-05, "loss": 0.8388, "num_input_tokens_seen": 20595344, "step": 35490 }, { "epoch": 5.286714328269288, "grad_norm": 2.1558644771575928, "learning_rate": 4.59986330339718e-05, "loss": 0.6443, "num_input_tokens_seen": 20598384, "step": 35495 }, { "epoch": 5.287459040810247, "grad_norm": 0.8753038048744202, "learning_rate": 4.59968694896826e-05, "loss": 0.5361, "num_input_tokens_seen": 20601136, "step": 35500 }, { "epoch": 5.2882037533512065, "grad_norm": 1.3499301671981812, "learning_rate": 4.599510559067263e-05, "loss": 0.5929, "num_input_tokens_seen": 20604176, "step": 35505 }, { "epoch": 5.288948465892165, "grad_norm": 1.7371511459350586, "learning_rate": 4.599334133697167e-05, "loss": 0.7041, "num_input_tokens_seen": 20607312, "step": 35510 }, { "epoch": 5.289693178433125, "grad_norm": 0.9927018880844116, "learning_rate": 4.5991576728609565e-05, "loss": 0.6028, "num_input_tokens_seen": 20610000, "step": 35515 }, { "epoch": 5.290437890974084, "grad_norm": 1.0937690734863281, "learning_rate": 4.5989811765616094e-05, "loss": 0.6837, "num_input_tokens_seen": 20612880, "step": 35520 }, { "epoch": 5.291182603515043, "grad_norm": 0.8404620885848999, "learning_rate": 4.5988046448021096e-05, "loss": 0.5433, "num_input_tokens_seen": 20616048, "step": 35525 }, { "epoch": 5.291927316056002, "grad_norm": 0.9777063727378845, "learning_rate": 4.598628077585438e-05, "loss": 0.4946, "num_input_tokens_seen": 20618832, "step": 35530 }, { "epoch": 5.292672028596962, "grad_norm": 0.9500380754470825, "learning_rate": 4.598451474914578e-05, "loss": 0.742, "num_input_tokens_seen": 20621552, "step": 35535 }, { "epoch": 5.2934167411379205, "grad_norm": 1.090910792350769, "learning_rate": 4.598274836792513e-05, "loss": 0.8033, "num_input_tokens_seen": 20624624, "step": 35540 }, { "epoch": 5.29416145367888, "grad_norm": 0.8676635026931763, "learning_rate": 4.5980981632222275e-05, "loss": 0.5019, "num_input_tokens_seen": 20627696, "step": 35545 }, { "epoch": 5.294906166219839, "grad_norm": 0.9510501623153687, "learning_rate": 4.5979214542067056e-05, "loss": 0.5727, "num_input_tokens_seen": 20630704, "step": 35550 }, { "epoch": 5.2956508787607985, "grad_norm": 1.3015934228897095, "learning_rate": 4.597744709748933e-05, "loss": 0.6803, "num_input_tokens_seen": 20633520, "step": 35555 }, { "epoch": 5.296395591301757, "grad_norm": 1.1417620182037354, "learning_rate": 4.597567929851896e-05, "loss": 0.78, "num_input_tokens_seen": 20636336, "step": 35560 }, { "epoch": 5.297140303842717, "grad_norm": 1.1116387844085693, "learning_rate": 4.59739111451858e-05, "loss": 0.6422, "num_input_tokens_seen": 20639120, "step": 35565 }, { "epoch": 5.297885016383676, "grad_norm": 1.0516067743301392, "learning_rate": 4.5972142637519735e-05, "loss": 0.5643, "num_input_tokens_seen": 20641904, "step": 35570 }, { "epoch": 5.298629728924635, "grad_norm": 0.6952174305915833, "learning_rate": 4.597037377555063e-05, "loss": 0.7443, "num_input_tokens_seen": 20645008, "step": 35575 }, { "epoch": 5.299374441465594, "grad_norm": 0.8876860737800598, "learning_rate": 4.5968604559308374e-05, "loss": 0.7683, "num_input_tokens_seen": 20647984, "step": 35580 }, { "epoch": 5.300119154006554, "grad_norm": 1.6670823097229004, "learning_rate": 4.596683498882286e-05, "loss": 0.6537, "num_input_tokens_seen": 20650928, "step": 35585 }, { "epoch": 5.3008638665475125, "grad_norm": 1.1869549751281738, "learning_rate": 4.596506506412398e-05, "loss": 0.8196, "num_input_tokens_seen": 20653712, "step": 35590 }, { "epoch": 5.301608579088472, "grad_norm": 1.046509861946106, "learning_rate": 4.596329478524163e-05, "loss": 0.7666, "num_input_tokens_seen": 20656560, "step": 35595 }, { "epoch": 5.302353291629431, "grad_norm": 0.9690435528755188, "learning_rate": 4.596152415220572e-05, "loss": 0.5909, "num_input_tokens_seen": 20659344, "step": 35600 }, { "epoch": 5.303098004170391, "grad_norm": 1.3600393533706665, "learning_rate": 4.595975316504616e-05, "loss": 0.6813, "num_input_tokens_seen": 20662416, "step": 35605 }, { "epoch": 5.303842716711349, "grad_norm": 1.1504024267196655, "learning_rate": 4.595798182379288e-05, "loss": 0.6763, "num_input_tokens_seen": 20665296, "step": 35610 }, { "epoch": 5.304587429252309, "grad_norm": 2.2950336933135986, "learning_rate": 4.595621012847579e-05, "loss": 0.8196, "num_input_tokens_seen": 20667920, "step": 35615 }, { "epoch": 5.305332141793268, "grad_norm": 1.8460192680358887, "learning_rate": 4.5954438079124836e-05, "loss": 0.8468, "num_input_tokens_seen": 20671120, "step": 35620 }, { "epoch": 5.306076854334227, "grad_norm": 1.1639809608459473, "learning_rate": 4.595266567576995e-05, "loss": 0.6098, "num_input_tokens_seen": 20674096, "step": 35625 }, { "epoch": 5.306821566875186, "grad_norm": 0.6853513121604919, "learning_rate": 4.595089291844106e-05, "loss": 0.7671, "num_input_tokens_seen": 20676784, "step": 35630 }, { "epoch": 5.307566279416146, "grad_norm": 1.0140025615692139, "learning_rate": 4.594911980716814e-05, "loss": 0.6254, "num_input_tokens_seen": 20679824, "step": 35635 }, { "epoch": 5.3083109919571045, "grad_norm": 1.4934102296829224, "learning_rate": 4.594734634198112e-05, "loss": 0.6417, "num_input_tokens_seen": 20682544, "step": 35640 }, { "epoch": 5.309055704498064, "grad_norm": 1.2752753496170044, "learning_rate": 4.594557252290998e-05, "loss": 0.5915, "num_input_tokens_seen": 20685488, "step": 35645 }, { "epoch": 5.309800417039023, "grad_norm": 1.3541632890701294, "learning_rate": 4.594379834998469e-05, "loss": 0.6479, "num_input_tokens_seen": 20688304, "step": 35650 }, { "epoch": 5.310545129579982, "grad_norm": 1.051841139793396, "learning_rate": 4.594202382323521e-05, "loss": 0.6982, "num_input_tokens_seen": 20691024, "step": 35655 }, { "epoch": 5.311289842120941, "grad_norm": 0.9964194297790527, "learning_rate": 4.594024894269151e-05, "loss": 0.6527, "num_input_tokens_seen": 20693840, "step": 35660 }, { "epoch": 5.3120345546619, "grad_norm": 1.0664546489715576, "learning_rate": 4.59384737083836e-05, "loss": 0.6807, "num_input_tokens_seen": 20696816, "step": 35665 }, { "epoch": 5.31277926720286, "grad_norm": 1.1155273914337158, "learning_rate": 4.5936698120341445e-05, "loss": 0.5098, "num_input_tokens_seen": 20699856, "step": 35670 }, { "epoch": 5.3135239797438185, "grad_norm": 1.7944456338882446, "learning_rate": 4.593492217859506e-05, "loss": 0.5871, "num_input_tokens_seen": 20702544, "step": 35675 }, { "epoch": 5.314268692284778, "grad_norm": 0.9234346747398376, "learning_rate": 4.593314588317445e-05, "loss": 0.5739, "num_input_tokens_seen": 20705264, "step": 35680 }, { "epoch": 5.315013404825737, "grad_norm": 1.3190691471099854, "learning_rate": 4.5931369234109614e-05, "loss": 0.6819, "num_input_tokens_seen": 20708080, "step": 35685 }, { "epoch": 5.315758117366697, "grad_norm": 1.076220989227295, "learning_rate": 4.592959223143056e-05, "loss": 0.6145, "num_input_tokens_seen": 20710864, "step": 35690 }, { "epoch": 5.316502829907655, "grad_norm": 0.9630262851715088, "learning_rate": 4.592781487516732e-05, "loss": 0.7317, "num_input_tokens_seen": 20713840, "step": 35695 }, { "epoch": 5.317247542448615, "grad_norm": 1.3622487783432007, "learning_rate": 4.592603716534992e-05, "loss": 0.6876, "num_input_tokens_seen": 20716528, "step": 35700 }, { "epoch": 5.317992254989574, "grad_norm": 1.0074892044067383, "learning_rate": 4.5924259102008386e-05, "loss": 0.5743, "num_input_tokens_seen": 20719216, "step": 35705 }, { "epoch": 5.318736967530533, "grad_norm": 1.6539503335952759, "learning_rate": 4.592248068517276e-05, "loss": 0.7183, "num_input_tokens_seen": 20721904, "step": 35710 }, { "epoch": 5.319481680071492, "grad_norm": 2.0260822772979736, "learning_rate": 4.59207019148731e-05, "loss": 0.8687, "num_input_tokens_seen": 20725072, "step": 35715 }, { "epoch": 5.320226392612452, "grad_norm": 2.46465802192688, "learning_rate": 4.591892279113943e-05, "loss": 0.7661, "num_input_tokens_seen": 20727760, "step": 35720 }, { "epoch": 5.3209711051534105, "grad_norm": 0.8483741879463196, "learning_rate": 4.591714331400183e-05, "loss": 0.7221, "num_input_tokens_seen": 20730992, "step": 35725 }, { "epoch": 5.32171581769437, "grad_norm": 1.6249299049377441, "learning_rate": 4.5915363483490346e-05, "loss": 0.6336, "num_input_tokens_seen": 20733584, "step": 35730 }, { "epoch": 5.322460530235329, "grad_norm": 1.4244718551635742, "learning_rate": 4.591358329963505e-05, "loss": 0.696, "num_input_tokens_seen": 20736784, "step": 35735 }, { "epoch": 5.323205242776289, "grad_norm": 0.7797162532806396, "learning_rate": 4.5911802762466034e-05, "loss": 0.6684, "num_input_tokens_seen": 20739856, "step": 35740 }, { "epoch": 5.323949955317247, "grad_norm": 0.9379225969314575, "learning_rate": 4.5910021872013355e-05, "loss": 0.6295, "num_input_tokens_seen": 20742928, "step": 35745 }, { "epoch": 5.324694667858207, "grad_norm": 0.922128438949585, "learning_rate": 4.590824062830711e-05, "loss": 0.7194, "num_input_tokens_seen": 20746096, "step": 35750 }, { "epoch": 5.325439380399166, "grad_norm": 0.9505957961082458, "learning_rate": 4.590645903137739e-05, "loss": 0.6108, "num_input_tokens_seen": 20749008, "step": 35755 }, { "epoch": 5.326184092940125, "grad_norm": 0.8854063749313354, "learning_rate": 4.590467708125429e-05, "loss": 0.6151, "num_input_tokens_seen": 20752016, "step": 35760 }, { "epoch": 5.326928805481084, "grad_norm": 0.8921523094177246, "learning_rate": 4.590289477796792e-05, "loss": 0.7902, "num_input_tokens_seen": 20755248, "step": 35765 }, { "epoch": 5.327673518022044, "grad_norm": 1.379398226737976, "learning_rate": 4.590111212154839e-05, "loss": 0.7511, "num_input_tokens_seen": 20758224, "step": 35770 }, { "epoch": 5.328418230563003, "grad_norm": 0.7767008543014526, "learning_rate": 4.589932911202581e-05, "loss": 0.6132, "num_input_tokens_seen": 20761232, "step": 35775 }, { "epoch": 5.329162943103962, "grad_norm": 1.0766643285751343, "learning_rate": 4.5897545749430305e-05, "loss": 0.6414, "num_input_tokens_seen": 20764272, "step": 35780 }, { "epoch": 5.329907655644921, "grad_norm": 1.7520593404769897, "learning_rate": 4.5895762033792e-05, "loss": 0.7952, "num_input_tokens_seen": 20767024, "step": 35785 }, { "epoch": 5.330652368185881, "grad_norm": 1.063795566558838, "learning_rate": 4.589397796514104e-05, "loss": 0.6514, "num_input_tokens_seen": 20769968, "step": 35790 }, { "epoch": 5.331397080726839, "grad_norm": 0.923823893070221, "learning_rate": 4.5892193543507556e-05, "loss": 0.7169, "num_input_tokens_seen": 20772816, "step": 35795 }, { "epoch": 5.332141793267799, "grad_norm": 0.8380663394927979, "learning_rate": 4.58904087689217e-05, "loss": 0.6191, "num_input_tokens_seen": 20775568, "step": 35800 }, { "epoch": 5.332886505808758, "grad_norm": 0.9721935391426086, "learning_rate": 4.5888623641413615e-05, "loss": 0.6142, "num_input_tokens_seen": 20778384, "step": 35805 }, { "epoch": 5.333631218349717, "grad_norm": 1.0589897632598877, "learning_rate": 4.588683816101347e-05, "loss": 0.5701, "num_input_tokens_seen": 20780944, "step": 35810 }, { "epoch": 5.334375930890676, "grad_norm": 0.9462103843688965, "learning_rate": 4.588505232775141e-05, "loss": 0.6462, "num_input_tokens_seen": 20783792, "step": 35815 }, { "epoch": 5.335120643431635, "grad_norm": 1.0467499494552612, "learning_rate": 4.588326614165763e-05, "loss": 0.5694, "num_input_tokens_seen": 20786704, "step": 35820 }, { "epoch": 5.335865355972595, "grad_norm": 1.0544413328170776, "learning_rate": 4.5881479602762286e-05, "loss": 0.5364, "num_input_tokens_seen": 20789904, "step": 35825 }, { "epoch": 5.336610068513554, "grad_norm": 0.646080732345581, "learning_rate": 4.587969271109557e-05, "loss": 0.6515, "num_input_tokens_seen": 20792912, "step": 35830 }, { "epoch": 5.337354781054513, "grad_norm": 1.5636909008026123, "learning_rate": 4.5877905466687666e-05, "loss": 0.5533, "num_input_tokens_seen": 20795728, "step": 35835 }, { "epoch": 5.338099493595472, "grad_norm": 0.9980746507644653, "learning_rate": 4.5876117869568766e-05, "loss": 0.5133, "num_input_tokens_seen": 20798480, "step": 35840 }, { "epoch": 5.338844206136431, "grad_norm": 0.9289918541908264, "learning_rate": 4.587432991976908e-05, "loss": 0.6749, "num_input_tokens_seen": 20801552, "step": 35845 }, { "epoch": 5.33958891867739, "grad_norm": 1.249320149421692, "learning_rate": 4.58725416173188e-05, "loss": 0.618, "num_input_tokens_seen": 20804496, "step": 35850 }, { "epoch": 5.34033363121835, "grad_norm": 0.8938997387886047, "learning_rate": 4.587075296224814e-05, "loss": 0.5091, "num_input_tokens_seen": 20807472, "step": 35855 }, { "epoch": 5.341078343759309, "grad_norm": 0.7812583446502686, "learning_rate": 4.586896395458733e-05, "loss": 0.8164, "num_input_tokens_seen": 20810448, "step": 35860 }, { "epoch": 5.341823056300268, "grad_norm": 1.2554415464401245, "learning_rate": 4.586717459436658e-05, "loss": 0.7063, "num_input_tokens_seen": 20813488, "step": 35865 }, { "epoch": 5.342567768841227, "grad_norm": 0.9949275255203247, "learning_rate": 4.586538488161612e-05, "loss": 0.6601, "num_input_tokens_seen": 20816208, "step": 35870 }, { "epoch": 5.343312481382187, "grad_norm": 0.570570170879364, "learning_rate": 4.58635948163662e-05, "loss": 0.5333, "num_input_tokens_seen": 20819120, "step": 35875 }, { "epoch": 5.344057193923145, "grad_norm": 0.8153759241104126, "learning_rate": 4.586180439864704e-05, "loss": 0.4086, "num_input_tokens_seen": 20822128, "step": 35880 }, { "epoch": 5.344801906464105, "grad_norm": 0.7909853458404541, "learning_rate": 4.586001362848889e-05, "loss": 0.636, "num_input_tokens_seen": 20825168, "step": 35885 }, { "epoch": 5.345546619005064, "grad_norm": 0.511161208152771, "learning_rate": 4.5858222505922026e-05, "loss": 0.4856, "num_input_tokens_seen": 20827824, "step": 35890 }, { "epoch": 5.346291331546023, "grad_norm": 1.569208025932312, "learning_rate": 4.585643103097669e-05, "loss": 0.717, "num_input_tokens_seen": 20830576, "step": 35895 }, { "epoch": 5.347036044086982, "grad_norm": 0.7769020199775696, "learning_rate": 4.5854639203683146e-05, "loss": 0.516, "num_input_tokens_seen": 20833328, "step": 35900 }, { "epoch": 5.347780756627942, "grad_norm": 1.132956862449646, "learning_rate": 4.5852847024071664e-05, "loss": 0.5626, "num_input_tokens_seen": 20836208, "step": 35905 }, { "epoch": 5.348525469168901, "grad_norm": 1.423773169517517, "learning_rate": 4.585105449217253e-05, "loss": 0.7554, "num_input_tokens_seen": 20838896, "step": 35910 }, { "epoch": 5.34927018170986, "grad_norm": 1.1771572828292847, "learning_rate": 4.5849261608016026e-05, "loss": 0.7209, "num_input_tokens_seen": 20842128, "step": 35915 }, { "epoch": 5.350014894250819, "grad_norm": 0.666174054145813, "learning_rate": 4.584746837163243e-05, "loss": 0.5476, "num_input_tokens_seen": 20845104, "step": 35920 }, { "epoch": 5.350759606791779, "grad_norm": 1.086572527885437, "learning_rate": 4.584567478305205e-05, "loss": 0.787, "num_input_tokens_seen": 20848144, "step": 35925 }, { "epoch": 5.351504319332737, "grad_norm": 0.9162296652793884, "learning_rate": 4.584388084230518e-05, "loss": 0.6211, "num_input_tokens_seen": 20851088, "step": 35930 }, { "epoch": 5.352249031873697, "grad_norm": 0.8924786448478699, "learning_rate": 4.584208654942212e-05, "loss": 0.6936, "num_input_tokens_seen": 20853968, "step": 35935 }, { "epoch": 5.352993744414656, "grad_norm": 0.8036784529685974, "learning_rate": 4.584029190443321e-05, "loss": 0.8866, "num_input_tokens_seen": 20856944, "step": 35940 }, { "epoch": 5.3537384569556155, "grad_norm": 1.4610140323638916, "learning_rate": 4.583849690736873e-05, "loss": 0.7061, "num_input_tokens_seen": 20859856, "step": 35945 }, { "epoch": 5.354483169496574, "grad_norm": 1.0092533826828003, "learning_rate": 4.583670155825903e-05, "loss": 0.5892, "num_input_tokens_seen": 20862608, "step": 35950 }, { "epoch": 5.355227882037534, "grad_norm": 1.0776528120040894, "learning_rate": 4.5834905857134436e-05, "loss": 0.682, "num_input_tokens_seen": 20865616, "step": 35955 }, { "epoch": 5.355972594578493, "grad_norm": 1.265114188194275, "learning_rate": 4.583310980402529e-05, "loss": 0.5329, "num_input_tokens_seen": 20868368, "step": 35960 }, { "epoch": 5.356717307119452, "grad_norm": 1.6922008991241455, "learning_rate": 4.5831313398961915e-05, "loss": 0.6521, "num_input_tokens_seen": 20871568, "step": 35965 }, { "epoch": 5.357462019660411, "grad_norm": 1.3480305671691895, "learning_rate": 4.5829516641974676e-05, "loss": 0.6994, "num_input_tokens_seen": 20874320, "step": 35970 }, { "epoch": 5.358206732201371, "grad_norm": 1.0738990306854248, "learning_rate": 4.582771953309393e-05, "loss": 0.637, "num_input_tokens_seen": 20877296, "step": 35975 }, { "epoch": 5.358951444742329, "grad_norm": 0.6464168429374695, "learning_rate": 4.582592207235002e-05, "loss": 0.6783, "num_input_tokens_seen": 20879984, "step": 35980 }, { "epoch": 5.359696157283288, "grad_norm": 1.0995397567749023, "learning_rate": 4.5824124259773336e-05, "loss": 0.625, "num_input_tokens_seen": 20882896, "step": 35985 }, { "epoch": 5.360440869824248, "grad_norm": 1.2951027154922485, "learning_rate": 4.582232609539423e-05, "loss": 0.5806, "num_input_tokens_seen": 20885968, "step": 35990 }, { "epoch": 5.3611855823652075, "grad_norm": 0.9928922653198242, "learning_rate": 4.582052757924309e-05, "loss": 0.6237, "num_input_tokens_seen": 20888784, "step": 35995 }, { "epoch": 5.361930294906166, "grad_norm": 1.0295482873916626, "learning_rate": 4.5818728711350296e-05, "loss": 0.6416, "num_input_tokens_seen": 20891888, "step": 36000 }, { "epoch": 5.362675007447125, "grad_norm": 1.1740673780441284, "learning_rate": 4.581692949174624e-05, "loss": 0.8367, "num_input_tokens_seen": 20894672, "step": 36005 }, { "epoch": 5.363419719988085, "grad_norm": 1.5542677640914917, "learning_rate": 4.581512992046132e-05, "loss": 0.6236, "num_input_tokens_seen": 20897712, "step": 36010 }, { "epoch": 5.364164432529043, "grad_norm": 1.1814944744110107, "learning_rate": 4.5813329997525925e-05, "loss": 0.5944, "num_input_tokens_seen": 20900688, "step": 36015 }, { "epoch": 5.364909145070003, "grad_norm": 0.9000009894371033, "learning_rate": 4.5811529722970484e-05, "loss": 0.6803, "num_input_tokens_seen": 20903632, "step": 36020 }, { "epoch": 5.365653857610962, "grad_norm": 0.9394601583480835, "learning_rate": 4.5809729096825396e-05, "loss": 0.584, "num_input_tokens_seen": 20906448, "step": 36025 }, { "epoch": 5.3663985701519215, "grad_norm": 0.9746211171150208, "learning_rate": 4.580792811912109e-05, "loss": 0.538, "num_input_tokens_seen": 20909648, "step": 36030 }, { "epoch": 5.36714328269288, "grad_norm": 1.0236520767211914, "learning_rate": 4.5806126789887984e-05, "loss": 0.7353, "num_input_tokens_seen": 20912688, "step": 36035 }, { "epoch": 5.36788799523384, "grad_norm": 0.8448100090026855, "learning_rate": 4.580432510915651e-05, "loss": 0.5913, "num_input_tokens_seen": 20915536, "step": 36040 }, { "epoch": 5.368632707774799, "grad_norm": 1.705469012260437, "learning_rate": 4.580252307695711e-05, "loss": 0.7007, "num_input_tokens_seen": 20918416, "step": 36045 }, { "epoch": 5.369377420315758, "grad_norm": 2.0692484378814697, "learning_rate": 4.580072069332022e-05, "loss": 0.67, "num_input_tokens_seen": 20921104, "step": 36050 }, { "epoch": 5.370122132856717, "grad_norm": 0.8929986953735352, "learning_rate": 4.57989179582763e-05, "loss": 0.6868, "num_input_tokens_seen": 20924400, "step": 36055 }, { "epoch": 5.370866845397677, "grad_norm": 0.6427026391029358, "learning_rate": 4.57971148718558e-05, "loss": 0.5272, "num_input_tokens_seen": 20927440, "step": 36060 }, { "epoch": 5.371611557938635, "grad_norm": 1.055265188217163, "learning_rate": 4.579531143408918e-05, "loss": 0.6229, "num_input_tokens_seen": 20930192, "step": 36065 }, { "epoch": 5.372356270479595, "grad_norm": 0.7575196027755737, "learning_rate": 4.579350764500691e-05, "loss": 0.542, "num_input_tokens_seen": 20933136, "step": 36070 }, { "epoch": 5.373100983020554, "grad_norm": 1.0058019161224365, "learning_rate": 4.579170350463946e-05, "loss": 0.6976, "num_input_tokens_seen": 20936048, "step": 36075 }, { "epoch": 5.3738456955615135, "grad_norm": 0.7214799523353577, "learning_rate": 4.5789899013017315e-05, "loss": 0.6346, "num_input_tokens_seen": 20939120, "step": 36080 }, { "epoch": 5.374590408102472, "grad_norm": 1.0504318475723267, "learning_rate": 4.578809417017095e-05, "loss": 0.677, "num_input_tokens_seen": 20941808, "step": 36085 }, { "epoch": 5.375335120643432, "grad_norm": 0.8639521598815918, "learning_rate": 4.578628897613087e-05, "loss": 0.7366, "num_input_tokens_seen": 20944624, "step": 36090 }, { "epoch": 5.376079833184391, "grad_norm": 0.943500280380249, "learning_rate": 4.578448343092756e-05, "loss": 0.8739, "num_input_tokens_seen": 20947568, "step": 36095 }, { "epoch": 5.37682454572535, "grad_norm": 1.617395281791687, "learning_rate": 4.5782677534591524e-05, "loss": 0.6198, "num_input_tokens_seen": 20950416, "step": 36100 }, { "epoch": 5.377569258266309, "grad_norm": 1.0374085903167725, "learning_rate": 4.578087128715328e-05, "loss": 0.8087, "num_input_tokens_seen": 20953360, "step": 36105 }, { "epoch": 5.378313970807269, "grad_norm": 1.3126089572906494, "learning_rate": 4.577906468864333e-05, "loss": 0.5289, "num_input_tokens_seen": 20956144, "step": 36110 }, { "epoch": 5.3790586833482275, "grad_norm": 0.9787261486053467, "learning_rate": 4.577725773909221e-05, "loss": 0.6228, "num_input_tokens_seen": 20958928, "step": 36115 }, { "epoch": 5.379803395889187, "grad_norm": 0.9424620866775513, "learning_rate": 4.577545043853042e-05, "loss": 0.6807, "num_input_tokens_seen": 20961904, "step": 36120 }, { "epoch": 5.380548108430146, "grad_norm": 1.3424077033996582, "learning_rate": 4.577364278698852e-05, "loss": 0.6771, "num_input_tokens_seen": 20964592, "step": 36125 }, { "epoch": 5.3812928209711055, "grad_norm": 1.264898657798767, "learning_rate": 4.577183478449705e-05, "loss": 0.5406, "num_input_tokens_seen": 20967376, "step": 36130 }, { "epoch": 5.382037533512064, "grad_norm": 1.0224015712738037, "learning_rate": 4.5770026431086524e-05, "loss": 0.55, "num_input_tokens_seen": 20970352, "step": 36135 }, { "epoch": 5.382782246053024, "grad_norm": 0.8326187133789062, "learning_rate": 4.576821772678752e-05, "loss": 0.5816, "num_input_tokens_seen": 20973168, "step": 36140 }, { "epoch": 5.383526958593983, "grad_norm": 0.9730901122093201, "learning_rate": 4.576640867163059e-05, "loss": 0.6177, "num_input_tokens_seen": 20976304, "step": 36145 }, { "epoch": 5.384271671134942, "grad_norm": 1.3380976915359497, "learning_rate": 4.5764599265646286e-05, "loss": 0.6937, "num_input_tokens_seen": 20979376, "step": 36150 }, { "epoch": 5.385016383675901, "grad_norm": 0.875387966632843, "learning_rate": 4.576278950886518e-05, "loss": 0.7282, "num_input_tokens_seen": 20982160, "step": 36155 }, { "epoch": 5.385761096216861, "grad_norm": 0.9682324528694153, "learning_rate": 4.576097940131785e-05, "loss": 0.631, "num_input_tokens_seen": 20984880, "step": 36160 }, { "epoch": 5.3865058087578195, "grad_norm": 1.3844560384750366, "learning_rate": 4.5759168943034875e-05, "loss": 0.5579, "num_input_tokens_seen": 20987696, "step": 36165 }, { "epoch": 5.387250521298778, "grad_norm": 1.6386975049972534, "learning_rate": 4.5757358134046835e-05, "loss": 0.5458, "num_input_tokens_seen": 20990704, "step": 36170 }, { "epoch": 5.387995233839738, "grad_norm": 1.0814706087112427, "learning_rate": 4.5755546974384336e-05, "loss": 0.7013, "num_input_tokens_seen": 20993680, "step": 36175 }, { "epoch": 5.388739946380697, "grad_norm": 1.1374026536941528, "learning_rate": 4.575373546407795e-05, "loss": 0.6623, "num_input_tokens_seen": 20996816, "step": 36180 }, { "epoch": 5.389484658921656, "grad_norm": 0.9709941744804382, "learning_rate": 4.5751923603158305e-05, "loss": 0.5428, "num_input_tokens_seen": 20999568, "step": 36185 }, { "epoch": 5.390229371462615, "grad_norm": 1.0162218809127808, "learning_rate": 4.5750111391656005e-05, "loss": 0.7986, "num_input_tokens_seen": 21002256, "step": 36190 }, { "epoch": 5.390974084003575, "grad_norm": 1.2285938262939453, "learning_rate": 4.574829882960166e-05, "loss": 0.6973, "num_input_tokens_seen": 21005040, "step": 36195 }, { "epoch": 5.3917187965445335, "grad_norm": 0.9801589250564575, "learning_rate": 4.5746485917025894e-05, "loss": 0.586, "num_input_tokens_seen": 21008208, "step": 36200 }, { "epoch": 5.392463509085493, "grad_norm": 0.8796279430389404, "learning_rate": 4.574467265395933e-05, "loss": 0.6926, "num_input_tokens_seen": 21011312, "step": 36205 }, { "epoch": 5.393208221626452, "grad_norm": 1.0918325185775757, "learning_rate": 4.574285904043261e-05, "loss": 0.6357, "num_input_tokens_seen": 21014192, "step": 36210 }, { "epoch": 5.3939529341674115, "grad_norm": 0.9065203070640564, "learning_rate": 4.574104507647637e-05, "loss": 0.6662, "num_input_tokens_seen": 21017200, "step": 36215 }, { "epoch": 5.39469764670837, "grad_norm": 1.9088636636734009, "learning_rate": 4.5739230762121255e-05, "loss": 0.6153, "num_input_tokens_seen": 21019856, "step": 36220 }, { "epoch": 5.39544235924933, "grad_norm": 1.0902925729751587, "learning_rate": 4.573741609739791e-05, "loss": 0.5045, "num_input_tokens_seen": 21022800, "step": 36225 }, { "epoch": 5.396187071790289, "grad_norm": 0.8307470083236694, "learning_rate": 4.5735601082336995e-05, "loss": 0.6705, "num_input_tokens_seen": 21025840, "step": 36230 }, { "epoch": 5.396931784331248, "grad_norm": 1.331996202468872, "learning_rate": 4.573378571696918e-05, "loss": 0.5807, "num_input_tokens_seen": 21028752, "step": 36235 }, { "epoch": 5.397676496872207, "grad_norm": 1.5661368370056152, "learning_rate": 4.573197000132512e-05, "loss": 0.6002, "num_input_tokens_seen": 21031760, "step": 36240 }, { "epoch": 5.398421209413167, "grad_norm": 0.9781122207641602, "learning_rate": 4.57301539354355e-05, "loss": 0.7588, "num_input_tokens_seen": 21034800, "step": 36245 }, { "epoch": 5.3991659219541255, "grad_norm": 1.851960301399231, "learning_rate": 4.572833751933101e-05, "loss": 0.8311, "num_input_tokens_seen": 21037360, "step": 36250 }, { "epoch": 5.399910634495085, "grad_norm": 1.1664094924926758, "learning_rate": 4.5726520753042314e-05, "loss": 0.6694, "num_input_tokens_seen": 21040336, "step": 36255 }, { "epoch": 5.400655347036044, "grad_norm": 1.117274284362793, "learning_rate": 4.572470363660012e-05, "loss": 0.6786, "num_input_tokens_seen": 21043184, "step": 36260 }, { "epoch": 5.4014000595770035, "grad_norm": 1.4506003856658936, "learning_rate": 4.572288617003512e-05, "loss": 0.6329, "num_input_tokens_seen": 21046096, "step": 36265 }, { "epoch": 5.402144772117962, "grad_norm": 1.1007182598114014, "learning_rate": 4.5721068353378016e-05, "loss": 0.6352, "num_input_tokens_seen": 21048784, "step": 36270 }, { "epoch": 5.402889484658922, "grad_norm": 1.0984936952590942, "learning_rate": 4.571925018665953e-05, "loss": 0.5305, "num_input_tokens_seen": 21051376, "step": 36275 }, { "epoch": 5.403634197199881, "grad_norm": 1.5727248191833496, "learning_rate": 4.5717431669910364e-05, "loss": 0.6925, "num_input_tokens_seen": 21054128, "step": 36280 }, { "epoch": 5.40437890974084, "grad_norm": 1.216405987739563, "learning_rate": 4.571561280316125e-05, "loss": 0.6051, "num_input_tokens_seen": 21056944, "step": 36285 }, { "epoch": 5.405123622281799, "grad_norm": 1.2629913091659546, "learning_rate": 4.571379358644291e-05, "loss": 0.6624, "num_input_tokens_seen": 21059696, "step": 36290 }, { "epoch": 5.405868334822759, "grad_norm": 0.972988486289978, "learning_rate": 4.571197401978608e-05, "loss": 0.6313, "num_input_tokens_seen": 21062736, "step": 36295 }, { "epoch": 5.4066130473637175, "grad_norm": 0.868367612361908, "learning_rate": 4.5710154103221504e-05, "loss": 0.7073, "num_input_tokens_seen": 21066160, "step": 36300 }, { "epoch": 5.407357759904677, "grad_norm": 1.2276010513305664, "learning_rate": 4.570833383677991e-05, "loss": 0.64, "num_input_tokens_seen": 21068944, "step": 36305 }, { "epoch": 5.408102472445636, "grad_norm": 0.9968491196632385, "learning_rate": 4.570651322049208e-05, "loss": 0.7289, "num_input_tokens_seen": 21071856, "step": 36310 }, { "epoch": 5.408847184986596, "grad_norm": 1.4079781770706177, "learning_rate": 4.570469225438875e-05, "loss": 0.6652, "num_input_tokens_seen": 21074992, "step": 36315 }, { "epoch": 5.409591897527554, "grad_norm": 0.8282140493392944, "learning_rate": 4.570287093850068e-05, "loss": 0.6383, "num_input_tokens_seen": 21077744, "step": 36320 }, { "epoch": 5.410336610068514, "grad_norm": 0.8303356766700745, "learning_rate": 4.570104927285865e-05, "loss": 0.5241, "num_input_tokens_seen": 21080272, "step": 36325 }, { "epoch": 5.411081322609473, "grad_norm": 0.9049105048179626, "learning_rate": 4.5699227257493434e-05, "loss": 0.6777, "num_input_tokens_seen": 21083376, "step": 36330 }, { "epoch": 5.4118260351504315, "grad_norm": 1.5806126594543457, "learning_rate": 4.5697404892435816e-05, "loss": 0.7196, "num_input_tokens_seen": 21086096, "step": 36335 }, { "epoch": 5.412570747691391, "grad_norm": 0.8102875351905823, "learning_rate": 4.5695582177716566e-05, "loss": 0.6641, "num_input_tokens_seen": 21089072, "step": 36340 }, { "epoch": 5.413315460232351, "grad_norm": 0.9386937618255615, "learning_rate": 4.56937591133665e-05, "loss": 0.6683, "num_input_tokens_seen": 21092112, "step": 36345 }, { "epoch": 5.4140601727733095, "grad_norm": 0.9299460053443909, "learning_rate": 4.56919356994164e-05, "loss": 0.4569, "num_input_tokens_seen": 21094768, "step": 36350 }, { "epoch": 5.414804885314268, "grad_norm": 0.8865261077880859, "learning_rate": 4.569011193589707e-05, "loss": 0.4979, "num_input_tokens_seen": 21097744, "step": 36355 }, { "epoch": 5.415549597855228, "grad_norm": 0.8834642171859741, "learning_rate": 4.568828782283934e-05, "loss": 0.6958, "num_input_tokens_seen": 21100848, "step": 36360 }, { "epoch": 5.416294310396187, "grad_norm": 1.0302129983901978, "learning_rate": 4.5686463360274015e-05, "loss": 0.7633, "num_input_tokens_seen": 21103952, "step": 36365 }, { "epoch": 5.417039022937146, "grad_norm": 1.0079247951507568, "learning_rate": 4.568463854823191e-05, "loss": 0.6161, "num_input_tokens_seen": 21107088, "step": 36370 }, { "epoch": 5.417783735478105, "grad_norm": 0.899075448513031, "learning_rate": 4.5682813386743864e-05, "loss": 0.5312, "num_input_tokens_seen": 21109680, "step": 36375 }, { "epoch": 5.418528448019065, "grad_norm": 1.0455018281936646, "learning_rate": 4.56809878758407e-05, "loss": 0.6059, "num_input_tokens_seen": 21112432, "step": 36380 }, { "epoch": 5.4192731605600235, "grad_norm": 1.0303627252578735, "learning_rate": 4.567916201555327e-05, "loss": 0.6234, "num_input_tokens_seen": 21115408, "step": 36385 }, { "epoch": 5.420017873100983, "grad_norm": 0.8885098099708557, "learning_rate": 4.567733580591241e-05, "loss": 0.9132, "num_input_tokens_seen": 21118416, "step": 36390 }, { "epoch": 5.420762585641942, "grad_norm": 1.0590828657150269, "learning_rate": 4.567550924694898e-05, "loss": 0.6902, "num_input_tokens_seen": 21121328, "step": 36395 }, { "epoch": 5.421507298182902, "grad_norm": 1.0294808149337769, "learning_rate": 4.5673682338693836e-05, "loss": 0.6271, "num_input_tokens_seen": 21124144, "step": 36400 }, { "epoch": 5.42225201072386, "grad_norm": 0.9642481207847595, "learning_rate": 4.567185508117784e-05, "loss": 0.6303, "num_input_tokens_seen": 21126928, "step": 36405 }, { "epoch": 5.42299672326482, "grad_norm": 1.0524890422821045, "learning_rate": 4.567002747443186e-05, "loss": 0.6535, "num_input_tokens_seen": 21129360, "step": 36410 }, { "epoch": 5.423741435805779, "grad_norm": 0.9744668006896973, "learning_rate": 4.5668199518486785e-05, "loss": 0.4656, "num_input_tokens_seen": 21132144, "step": 36415 }, { "epoch": 5.424486148346738, "grad_norm": 2.7617688179016113, "learning_rate": 4.566637121337347e-05, "loss": 0.5068, "num_input_tokens_seen": 21135088, "step": 36420 }, { "epoch": 5.425230860887697, "grad_norm": 0.827218770980835, "learning_rate": 4.566454255912283e-05, "loss": 0.7186, "num_input_tokens_seen": 21137840, "step": 36425 }, { "epoch": 5.425975573428657, "grad_norm": 1.3373708724975586, "learning_rate": 4.5662713555765735e-05, "loss": 0.6543, "num_input_tokens_seen": 21140528, "step": 36430 }, { "epoch": 5.4267202859696155, "grad_norm": 1.3274604082107544, "learning_rate": 4.56608842033331e-05, "loss": 0.7232, "num_input_tokens_seen": 21143248, "step": 36435 }, { "epoch": 5.427464998510575, "grad_norm": 1.1244972944259644, "learning_rate": 4.565905450185583e-05, "loss": 0.6141, "num_input_tokens_seen": 21146320, "step": 36440 }, { "epoch": 5.428209711051534, "grad_norm": 0.6472039818763733, "learning_rate": 4.565722445136483e-05, "loss": 0.5479, "num_input_tokens_seen": 21148880, "step": 36445 }, { "epoch": 5.428954423592494, "grad_norm": 0.9504593014717102, "learning_rate": 4.565539405189101e-05, "loss": 0.7182, "num_input_tokens_seen": 21151792, "step": 36450 }, { "epoch": 5.429699136133452, "grad_norm": 0.9588003754615784, "learning_rate": 4.5653563303465306e-05, "loss": 0.6132, "num_input_tokens_seen": 21154672, "step": 36455 }, { "epoch": 5.430443848674412, "grad_norm": 1.0960707664489746, "learning_rate": 4.565173220611864e-05, "loss": 0.5923, "num_input_tokens_seen": 21157552, "step": 36460 }, { "epoch": 5.431188561215371, "grad_norm": 1.4507921934127808, "learning_rate": 4.5649900759881956e-05, "loss": 0.5936, "num_input_tokens_seen": 21160400, "step": 36465 }, { "epoch": 5.43193327375633, "grad_norm": 0.9100282192230225, "learning_rate": 4.564806896478617e-05, "loss": 0.5881, "num_input_tokens_seen": 21163312, "step": 36470 }, { "epoch": 5.432677986297289, "grad_norm": 1.5152370929718018, "learning_rate": 4.564623682086226e-05, "loss": 0.6885, "num_input_tokens_seen": 21166064, "step": 36475 }, { "epoch": 5.433422698838249, "grad_norm": 0.6019901633262634, "learning_rate": 4.564440432814116e-05, "loss": 0.6381, "num_input_tokens_seen": 21168784, "step": 36480 }, { "epoch": 5.434167411379208, "grad_norm": 2.6341426372528076, "learning_rate": 4.5642571486653825e-05, "loss": 0.6363, "num_input_tokens_seen": 21171824, "step": 36485 }, { "epoch": 5.434912123920167, "grad_norm": 0.927919328212738, "learning_rate": 4.5640738296431224e-05, "loss": 0.6399, "num_input_tokens_seen": 21175440, "step": 36490 }, { "epoch": 5.435656836461126, "grad_norm": 1.0750411748886108, "learning_rate": 4.563890475750433e-05, "loss": 0.6665, "num_input_tokens_seen": 21178320, "step": 36495 }, { "epoch": 5.436401549002086, "grad_norm": 1.795361042022705, "learning_rate": 4.563707086990412e-05, "loss": 0.8427, "num_input_tokens_seen": 21181136, "step": 36500 }, { "epoch": 5.437146261543044, "grad_norm": 2.0714173316955566, "learning_rate": 4.563523663366157e-05, "loss": 0.7546, "num_input_tokens_seen": 21183920, "step": 36505 }, { "epoch": 5.437890974084004, "grad_norm": 1.294299602508545, "learning_rate": 4.563340204880767e-05, "loss": 0.6896, "num_input_tokens_seen": 21186704, "step": 36510 }, { "epoch": 5.438635686624963, "grad_norm": 0.7061731815338135, "learning_rate": 4.563156711537341e-05, "loss": 0.676, "num_input_tokens_seen": 21189552, "step": 36515 }, { "epoch": 5.4393803991659215, "grad_norm": 1.2261481285095215, "learning_rate": 4.56297318333898e-05, "loss": 0.5586, "num_input_tokens_seen": 21192336, "step": 36520 }, { "epoch": 5.440125111706881, "grad_norm": 1.42775297164917, "learning_rate": 4.562789620288783e-05, "loss": 0.7806, "num_input_tokens_seen": 21195376, "step": 36525 }, { "epoch": 5.44086982424784, "grad_norm": 0.7856045961380005, "learning_rate": 4.562606022389853e-05, "loss": 0.5876, "num_input_tokens_seen": 21198192, "step": 36530 }, { "epoch": 5.4416145367888, "grad_norm": 1.8972493410110474, "learning_rate": 4.5624223896452894e-05, "loss": 0.7747, "num_input_tokens_seen": 21201168, "step": 36535 }, { "epoch": 5.442359249329758, "grad_norm": 0.7513054013252258, "learning_rate": 4.5622387220581965e-05, "loss": 0.6092, "num_input_tokens_seen": 21203984, "step": 36540 }, { "epoch": 5.443103961870718, "grad_norm": 0.7661198377609253, "learning_rate": 4.5620550196316757e-05, "loss": 0.662, "num_input_tokens_seen": 21206608, "step": 36545 }, { "epoch": 5.443848674411677, "grad_norm": 0.927281379699707, "learning_rate": 4.5618712823688316e-05, "loss": 0.5787, "num_input_tokens_seen": 21209680, "step": 36550 }, { "epoch": 5.444593386952636, "grad_norm": 0.8145666718482971, "learning_rate": 4.561687510272767e-05, "loss": 0.6608, "num_input_tokens_seen": 21212752, "step": 36555 }, { "epoch": 5.445338099493595, "grad_norm": 0.7974180579185486, "learning_rate": 4.5615037033465876e-05, "loss": 0.6608, "num_input_tokens_seen": 21215536, "step": 36560 }, { "epoch": 5.446082812034555, "grad_norm": 0.8821688890457153, "learning_rate": 4.5613198615933994e-05, "loss": 0.6493, "num_input_tokens_seen": 21218224, "step": 36565 }, { "epoch": 5.446827524575514, "grad_norm": 0.9658627510070801, "learning_rate": 4.561135985016306e-05, "loss": 0.5321, "num_input_tokens_seen": 21221168, "step": 36570 }, { "epoch": 5.447572237116473, "grad_norm": 0.7478209733963013, "learning_rate": 4.560952073618415e-05, "loss": 0.6002, "num_input_tokens_seen": 21223984, "step": 36575 }, { "epoch": 5.448316949657432, "grad_norm": 1.5732545852661133, "learning_rate": 4.560768127402834e-05, "loss": 0.7589, "num_input_tokens_seen": 21227088, "step": 36580 }, { "epoch": 5.449061662198392, "grad_norm": 1.3011226654052734, "learning_rate": 4.5605841463726695e-05, "loss": 0.7252, "num_input_tokens_seen": 21230032, "step": 36585 }, { "epoch": 5.44980637473935, "grad_norm": 1.5216447114944458, "learning_rate": 4.5604001305310304e-05, "loss": 0.7087, "num_input_tokens_seen": 21233072, "step": 36590 }, { "epoch": 5.45055108728031, "grad_norm": 1.391411542892456, "learning_rate": 4.5602160798810256e-05, "loss": 0.5424, "num_input_tokens_seen": 21236080, "step": 36595 }, { "epoch": 5.451295799821269, "grad_norm": 1.676160454750061, "learning_rate": 4.5600319944257635e-05, "loss": 0.8636, "num_input_tokens_seen": 21239056, "step": 36600 }, { "epoch": 5.452040512362228, "grad_norm": 2.0685977935791016, "learning_rate": 4.559847874168355e-05, "loss": 0.6268, "num_input_tokens_seen": 21241872, "step": 36605 }, { "epoch": 5.452785224903187, "grad_norm": 1.3253684043884277, "learning_rate": 4.55966371911191e-05, "loss": 0.6114, "num_input_tokens_seen": 21244752, "step": 36610 }, { "epoch": 5.453529937444147, "grad_norm": 0.5970005393028259, "learning_rate": 4.5594795292595394e-05, "loss": 0.5648, "num_input_tokens_seen": 21247408, "step": 36615 }, { "epoch": 5.454274649985106, "grad_norm": 1.4649598598480225, "learning_rate": 4.559295304614355e-05, "loss": 0.7778, "num_input_tokens_seen": 21250032, "step": 36620 }, { "epoch": 5.455019362526065, "grad_norm": 1.2018091678619385, "learning_rate": 4.559111045179471e-05, "loss": 0.6369, "num_input_tokens_seen": 21253136, "step": 36625 }, { "epoch": 5.455764075067024, "grad_norm": 1.3415151834487915, "learning_rate": 4.558926750957997e-05, "loss": 0.7735, "num_input_tokens_seen": 21256016, "step": 36630 }, { "epoch": 5.456508787607984, "grad_norm": 1.6006180047988892, "learning_rate": 4.558742421953049e-05, "loss": 0.7737, "num_input_tokens_seen": 21259248, "step": 36635 }, { "epoch": 5.457253500148942, "grad_norm": 1.101951241493225, "learning_rate": 4.55855805816774e-05, "loss": 0.6586, "num_input_tokens_seen": 21262192, "step": 36640 }, { "epoch": 5.457998212689902, "grad_norm": 1.0504192113876343, "learning_rate": 4.558373659605185e-05, "loss": 0.6708, "num_input_tokens_seen": 21264912, "step": 36645 }, { "epoch": 5.458742925230861, "grad_norm": 0.9141395688056946, "learning_rate": 4.5581892262684984e-05, "loss": 0.6229, "num_input_tokens_seen": 21267824, "step": 36650 }, { "epoch": 5.4594876377718204, "grad_norm": 1.882929801940918, "learning_rate": 4.558004758160798e-05, "loss": 0.7305, "num_input_tokens_seen": 21270672, "step": 36655 }, { "epoch": 5.460232350312779, "grad_norm": 1.5311179161071777, "learning_rate": 4.5578202552851976e-05, "loss": 0.5569, "num_input_tokens_seen": 21273488, "step": 36660 }, { "epoch": 5.460977062853739, "grad_norm": 1.0997358560562134, "learning_rate": 4.557635717644816e-05, "loss": 0.8302, "num_input_tokens_seen": 21276688, "step": 36665 }, { "epoch": 5.461721775394698, "grad_norm": 1.6084147691726685, "learning_rate": 4.557451145242769e-05, "loss": 0.6222, "num_input_tokens_seen": 21279696, "step": 36670 }, { "epoch": 5.462466487935657, "grad_norm": 1.2227935791015625, "learning_rate": 4.557266538082178e-05, "loss": 0.9917, "num_input_tokens_seen": 21282704, "step": 36675 }, { "epoch": 5.463211200476616, "grad_norm": 1.6025727987289429, "learning_rate": 4.557081896166159e-05, "loss": 0.5881, "num_input_tokens_seen": 21285648, "step": 36680 }, { "epoch": 5.463955913017575, "grad_norm": 1.3770068883895874, "learning_rate": 4.556897219497832e-05, "loss": 0.5378, "num_input_tokens_seen": 21288880, "step": 36685 }, { "epoch": 5.464700625558534, "grad_norm": 1.0671275854110718, "learning_rate": 4.556712508080316e-05, "loss": 0.7667, "num_input_tokens_seen": 21291888, "step": 36690 }, { "epoch": 5.465445338099494, "grad_norm": 0.691713273525238, "learning_rate": 4.556527761916735e-05, "loss": 0.5659, "num_input_tokens_seen": 21294832, "step": 36695 }, { "epoch": 5.466190050640453, "grad_norm": 1.4742003679275513, "learning_rate": 4.556342981010205e-05, "loss": 0.6762, "num_input_tokens_seen": 21297392, "step": 36700 }, { "epoch": 5.466934763181412, "grad_norm": 0.8692266345024109, "learning_rate": 4.5561581653638516e-05, "loss": 0.8156, "num_input_tokens_seen": 21300496, "step": 36705 }, { "epoch": 5.467679475722371, "grad_norm": 0.7042061686515808, "learning_rate": 4.555973314980796e-05, "loss": 0.6318, "num_input_tokens_seen": 21303376, "step": 36710 }, { "epoch": 5.46842418826333, "grad_norm": 0.9675210118293762, "learning_rate": 4.555788429864161e-05, "loss": 0.6779, "num_input_tokens_seen": 21306288, "step": 36715 }, { "epoch": 5.46916890080429, "grad_norm": 1.6843619346618652, "learning_rate": 4.5556035100170683e-05, "loss": 0.683, "num_input_tokens_seen": 21309232, "step": 36720 }, { "epoch": 5.469913613345248, "grad_norm": 1.3299511671066284, "learning_rate": 4.555418555442645e-05, "loss": 0.6314, "num_input_tokens_seen": 21312176, "step": 36725 }, { "epoch": 5.470658325886208, "grad_norm": 0.932969868183136, "learning_rate": 4.555233566144014e-05, "loss": 0.519, "num_input_tokens_seen": 21315184, "step": 36730 }, { "epoch": 5.471403038427167, "grad_norm": 1.1205676794052124, "learning_rate": 4.5550485421243006e-05, "loss": 0.7376, "num_input_tokens_seen": 21318352, "step": 36735 }, { "epoch": 5.4721477509681264, "grad_norm": 1.0969946384429932, "learning_rate": 4.554863483386631e-05, "loss": 0.804, "num_input_tokens_seen": 21321584, "step": 36740 }, { "epoch": 5.472892463509085, "grad_norm": 1.014762043952942, "learning_rate": 4.554678389934131e-05, "loss": 0.639, "num_input_tokens_seen": 21324400, "step": 36745 }, { "epoch": 5.473637176050045, "grad_norm": 1.284213662147522, "learning_rate": 4.554493261769928e-05, "loss": 0.7006, "num_input_tokens_seen": 21327312, "step": 36750 }, { "epoch": 5.474381888591004, "grad_norm": 0.8580458164215088, "learning_rate": 4.5543080988971484e-05, "loss": 0.7185, "num_input_tokens_seen": 21330160, "step": 36755 }, { "epoch": 5.475126601131963, "grad_norm": 0.6355860829353333, "learning_rate": 4.554122901318922e-05, "loss": 0.7467, "num_input_tokens_seen": 21332944, "step": 36760 }, { "epoch": 5.475871313672922, "grad_norm": 0.9597384929656982, "learning_rate": 4.553937669038378e-05, "loss": 0.4211, "num_input_tokens_seen": 21335920, "step": 36765 }, { "epoch": 5.476616026213882, "grad_norm": 0.8403477668762207, "learning_rate": 4.553752402058644e-05, "loss": 0.6628, "num_input_tokens_seen": 21338864, "step": 36770 }, { "epoch": 5.47736073875484, "grad_norm": 1.1305488348007202, "learning_rate": 4.55356710038285e-05, "loss": 0.6376, "num_input_tokens_seen": 21341872, "step": 36775 }, { "epoch": 5.4781054512958, "grad_norm": 1.2687575817108154, "learning_rate": 4.5533817640141275e-05, "loss": 0.7585, "num_input_tokens_seen": 21344432, "step": 36780 }, { "epoch": 5.478850163836759, "grad_norm": 0.9670076966285706, "learning_rate": 4.553196392955606e-05, "loss": 0.5213, "num_input_tokens_seen": 21347248, "step": 36785 }, { "epoch": 5.4795948763777185, "grad_norm": 0.9692125916481018, "learning_rate": 4.55301098721042e-05, "loss": 0.6709, "num_input_tokens_seen": 21349968, "step": 36790 }, { "epoch": 5.480339588918677, "grad_norm": 1.8200477361679077, "learning_rate": 4.5528255467816994e-05, "loss": 0.6989, "num_input_tokens_seen": 21352784, "step": 36795 }, { "epoch": 5.481084301459637, "grad_norm": 0.8488114476203918, "learning_rate": 4.552640071672577e-05, "loss": 0.6535, "num_input_tokens_seen": 21355632, "step": 36800 }, { "epoch": 5.481829014000596, "grad_norm": 1.0271196365356445, "learning_rate": 4.552454561886187e-05, "loss": 0.6778, "num_input_tokens_seen": 21358288, "step": 36805 }, { "epoch": 5.482573726541555, "grad_norm": 1.0010234117507935, "learning_rate": 4.5522690174256635e-05, "loss": 0.6167, "num_input_tokens_seen": 21361232, "step": 36810 }, { "epoch": 5.483318439082514, "grad_norm": 1.289287805557251, "learning_rate": 4.55208343829414e-05, "loss": 0.6454, "num_input_tokens_seen": 21364080, "step": 36815 }, { "epoch": 5.484063151623474, "grad_norm": 0.8442219495773315, "learning_rate": 4.551897824494753e-05, "loss": 0.7164, "num_input_tokens_seen": 21366928, "step": 36820 }, { "epoch": 5.4848078641644324, "grad_norm": 1.3680616617202759, "learning_rate": 4.551712176030638e-05, "loss": 0.5824, "num_input_tokens_seen": 21369840, "step": 36825 }, { "epoch": 5.485552576705392, "grad_norm": 0.8118137121200562, "learning_rate": 4.551526492904931e-05, "loss": 0.7285, "num_input_tokens_seen": 21372624, "step": 36830 }, { "epoch": 5.486297289246351, "grad_norm": 0.6093922853469849, "learning_rate": 4.551340775120768e-05, "loss": 0.6626, "num_input_tokens_seen": 21375536, "step": 36835 }, { "epoch": 5.4870420017873105, "grad_norm": 0.832200288772583, "learning_rate": 4.551155022681288e-05, "loss": 0.5225, "num_input_tokens_seen": 21378544, "step": 36840 }, { "epoch": 5.487786714328269, "grad_norm": 2.843201160430908, "learning_rate": 4.5509692355896296e-05, "loss": 0.7876, "num_input_tokens_seen": 21381200, "step": 36845 }, { "epoch": 5.488531426869228, "grad_norm": 0.7142884135246277, "learning_rate": 4.550783413848929e-05, "loss": 0.6878, "num_input_tokens_seen": 21384656, "step": 36850 }, { "epoch": 5.489276139410188, "grad_norm": 1.2152223587036133, "learning_rate": 4.550597557462328e-05, "loss": 0.5683, "num_input_tokens_seen": 21387440, "step": 36855 }, { "epoch": 5.490020851951147, "grad_norm": 1.5387464761734009, "learning_rate": 4.5504116664329656e-05, "loss": 0.6778, "num_input_tokens_seen": 21390128, "step": 36860 }, { "epoch": 5.490765564492106, "grad_norm": 1.4025332927703857, "learning_rate": 4.550225740763981e-05, "loss": 0.6739, "num_input_tokens_seen": 21392912, "step": 36865 }, { "epoch": 5.491510277033065, "grad_norm": 0.8123483061790466, "learning_rate": 4.5500397804585166e-05, "loss": 0.6629, "num_input_tokens_seen": 21396048, "step": 36870 }, { "epoch": 5.4922549895740245, "grad_norm": 0.9598180651664734, "learning_rate": 4.5498537855197145e-05, "loss": 0.6779, "num_input_tokens_seen": 21398832, "step": 36875 }, { "epoch": 5.492999702114983, "grad_norm": 1.6570338010787964, "learning_rate": 4.549667755950715e-05, "loss": 0.7224, "num_input_tokens_seen": 21401616, "step": 36880 }, { "epoch": 5.493744414655943, "grad_norm": 0.7152214646339417, "learning_rate": 4.5494816917546625e-05, "loss": 0.6891, "num_input_tokens_seen": 21404464, "step": 36885 }, { "epoch": 5.494489127196902, "grad_norm": 1.0606969594955444, "learning_rate": 4.549295592934699e-05, "loss": 0.5745, "num_input_tokens_seen": 21407536, "step": 36890 }, { "epoch": 5.495233839737861, "grad_norm": 0.9556853771209717, "learning_rate": 4.5491094594939705e-05, "loss": 0.5727, "num_input_tokens_seen": 21410448, "step": 36895 }, { "epoch": 5.49597855227882, "grad_norm": 1.386792778968811, "learning_rate": 4.5489232914356196e-05, "loss": 0.588, "num_input_tokens_seen": 21413200, "step": 36900 }, { "epoch": 5.49672326481978, "grad_norm": 1.0876520872116089, "learning_rate": 4.548737088762792e-05, "loss": 0.5548, "num_input_tokens_seen": 21415952, "step": 36905 }, { "epoch": 5.4974679773607384, "grad_norm": 1.4263765811920166, "learning_rate": 4.548550851478634e-05, "loss": 0.7994, "num_input_tokens_seen": 21419312, "step": 36910 }, { "epoch": 5.498212689901698, "grad_norm": 1.5795843601226807, "learning_rate": 4.548364579586291e-05, "loss": 0.7966, "num_input_tokens_seen": 21422320, "step": 36915 }, { "epoch": 5.498957402442657, "grad_norm": 1.1407703161239624, "learning_rate": 4.548178273088911e-05, "loss": 0.7889, "num_input_tokens_seen": 21425392, "step": 36920 }, { "epoch": 5.4997021149836165, "grad_norm": 0.7376894950866699, "learning_rate": 4.54799193198964e-05, "loss": 0.6323, "num_input_tokens_seen": 21428176, "step": 36925 }, { "epoch": 5.500446827524575, "grad_norm": 0.9445262551307678, "learning_rate": 4.547805556291627e-05, "loss": 0.6917, "num_input_tokens_seen": 21431344, "step": 36930 }, { "epoch": 5.501191540065535, "grad_norm": 0.9508808851242065, "learning_rate": 4.54761914599802e-05, "loss": 0.6106, "num_input_tokens_seen": 21434288, "step": 36935 }, { "epoch": 5.501936252606494, "grad_norm": 0.7821341156959534, "learning_rate": 4.54743270111197e-05, "loss": 0.5424, "num_input_tokens_seen": 21437104, "step": 36940 }, { "epoch": 5.502680965147453, "grad_norm": 1.3292725086212158, "learning_rate": 4.547246221636624e-05, "loss": 0.6314, "num_input_tokens_seen": 21440272, "step": 36945 }, { "epoch": 5.503425677688412, "grad_norm": 0.7516741156578064, "learning_rate": 4.5470597075751345e-05, "loss": 0.6171, "num_input_tokens_seen": 21442896, "step": 36950 }, { "epoch": 5.504170390229372, "grad_norm": 1.5590099096298218, "learning_rate": 4.5468731589306516e-05, "loss": 0.5789, "num_input_tokens_seen": 21445584, "step": 36955 }, { "epoch": 5.5049151027703305, "grad_norm": 0.8338817954063416, "learning_rate": 4.546686575706327e-05, "loss": 0.7573, "num_input_tokens_seen": 21448400, "step": 36960 }, { "epoch": 5.50565981531129, "grad_norm": 0.7220488786697388, "learning_rate": 4.546499957905313e-05, "loss": 0.4502, "num_input_tokens_seen": 21451088, "step": 36965 }, { "epoch": 5.506404527852249, "grad_norm": 1.0217194557189941, "learning_rate": 4.546313305530762e-05, "loss": 0.5479, "num_input_tokens_seen": 21454160, "step": 36970 }, { "epoch": 5.5071492403932085, "grad_norm": 0.6813873052597046, "learning_rate": 4.546126618585828e-05, "loss": 0.7004, "num_input_tokens_seen": 21456752, "step": 36975 }, { "epoch": 5.507893952934167, "grad_norm": 1.2170770168304443, "learning_rate": 4.5459398970736636e-05, "loss": 0.6261, "num_input_tokens_seen": 21459920, "step": 36980 }, { "epoch": 5.508638665475127, "grad_norm": 1.1888313293457031, "learning_rate": 4.545753140997424e-05, "loss": 0.6087, "num_input_tokens_seen": 21462864, "step": 36985 }, { "epoch": 5.509383378016086, "grad_norm": 0.9852754473686218, "learning_rate": 4.545566350360265e-05, "loss": 0.6662, "num_input_tokens_seen": 21465616, "step": 36990 }, { "epoch": 5.510128090557045, "grad_norm": 0.8594586253166199, "learning_rate": 4.5453795251653416e-05, "loss": 0.6578, "num_input_tokens_seen": 21468784, "step": 36995 }, { "epoch": 5.510872803098004, "grad_norm": 1.1231887340545654, "learning_rate": 4.545192665415809e-05, "loss": 0.5801, "num_input_tokens_seen": 21471600, "step": 37000 }, { "epoch": 5.511617515638964, "grad_norm": 0.5723852515220642, "learning_rate": 4.545005771114826e-05, "loss": 0.5553, "num_input_tokens_seen": 21474192, "step": 37005 }, { "epoch": 5.5123622281799225, "grad_norm": 1.8204466104507446, "learning_rate": 4.544818842265548e-05, "loss": 0.7426, "num_input_tokens_seen": 21476752, "step": 37010 }, { "epoch": 5.513106940720881, "grad_norm": 1.7642078399658203, "learning_rate": 4.544631878871135e-05, "loss": 0.6278, "num_input_tokens_seen": 21479856, "step": 37015 }, { "epoch": 5.513851653261841, "grad_norm": 1.3409054279327393, "learning_rate": 4.544444880934744e-05, "loss": 0.6888, "num_input_tokens_seen": 21482640, "step": 37020 }, { "epoch": 5.5145963658028005, "grad_norm": 1.0534476041793823, "learning_rate": 4.5442578484595346e-05, "loss": 0.666, "num_input_tokens_seen": 21485424, "step": 37025 }, { "epoch": 5.515341078343759, "grad_norm": 1.3645267486572266, "learning_rate": 4.544070781448666e-05, "loss": 0.6691, "num_input_tokens_seen": 21488368, "step": 37030 }, { "epoch": 5.516085790884718, "grad_norm": 0.963650643825531, "learning_rate": 4.5438836799053e-05, "loss": 0.6689, "num_input_tokens_seen": 21491088, "step": 37035 }, { "epoch": 5.516830503425678, "grad_norm": 1.152929425239563, "learning_rate": 4.5436965438325953e-05, "loss": 0.6697, "num_input_tokens_seen": 21494288, "step": 37040 }, { "epoch": 5.517575215966637, "grad_norm": 0.6869667172431946, "learning_rate": 4.543509373233715e-05, "loss": 0.5873, "num_input_tokens_seen": 21497328, "step": 37045 }, { "epoch": 5.518319928507596, "grad_norm": 1.013123869895935, "learning_rate": 4.5433221681118215e-05, "loss": 0.6263, "num_input_tokens_seen": 21500240, "step": 37050 }, { "epoch": 5.519064641048555, "grad_norm": 0.6511259078979492, "learning_rate": 4.5431349284700764e-05, "loss": 0.6383, "num_input_tokens_seen": 21503632, "step": 37055 }, { "epoch": 5.5198093535895145, "grad_norm": 0.7946273684501648, "learning_rate": 4.542947654311643e-05, "loss": 0.596, "num_input_tokens_seen": 21506640, "step": 37060 }, { "epoch": 5.520554066130473, "grad_norm": 1.2606174945831299, "learning_rate": 4.542760345639686e-05, "loss": 0.588, "num_input_tokens_seen": 21509296, "step": 37065 }, { "epoch": 5.521298778671433, "grad_norm": 1.1932191848754883, "learning_rate": 4.542573002457368e-05, "loss": 0.6528, "num_input_tokens_seen": 21512080, "step": 37070 }, { "epoch": 5.522043491212392, "grad_norm": 1.019238829612732, "learning_rate": 4.5423856247678556e-05, "loss": 0.6857, "num_input_tokens_seen": 21515376, "step": 37075 }, { "epoch": 5.522788203753351, "grad_norm": 0.9259845018386841, "learning_rate": 4.542198212574314e-05, "loss": 0.7451, "num_input_tokens_seen": 21518096, "step": 37080 }, { "epoch": 5.52353291629431, "grad_norm": 0.9375696182250977, "learning_rate": 4.5420107658799094e-05, "loss": 0.5533, "num_input_tokens_seen": 21521232, "step": 37085 }, { "epoch": 5.52427762883527, "grad_norm": 1.297789216041565, "learning_rate": 4.541823284687808e-05, "loss": 0.6407, "num_input_tokens_seen": 21524080, "step": 37090 }, { "epoch": 5.5250223413762285, "grad_norm": 1.126369595527649, "learning_rate": 4.541635769001178e-05, "loss": 0.5693, "num_input_tokens_seen": 21527248, "step": 37095 }, { "epoch": 5.525767053917188, "grad_norm": 1.0322518348693848, "learning_rate": 4.5414482188231864e-05, "loss": 0.5141, "num_input_tokens_seen": 21530128, "step": 37100 }, { "epoch": 5.526511766458147, "grad_norm": 0.99673992395401, "learning_rate": 4.5412606341570016e-05, "loss": 0.5593, "num_input_tokens_seen": 21532720, "step": 37105 }, { "epoch": 5.5272564789991065, "grad_norm": 0.91098952293396, "learning_rate": 4.5410730150057935e-05, "loss": 0.6238, "num_input_tokens_seen": 21535984, "step": 37110 }, { "epoch": 5.528001191540065, "grad_norm": 0.8081196546554565, "learning_rate": 4.5408853613727307e-05, "loss": 0.5001, "num_input_tokens_seen": 21538672, "step": 37115 }, { "epoch": 5.528745904081025, "grad_norm": 1.092829942703247, "learning_rate": 4.540697673260984e-05, "loss": 0.6938, "num_input_tokens_seen": 21541328, "step": 37120 }, { "epoch": 5.529490616621984, "grad_norm": 0.6576019525527954, "learning_rate": 4.5405099506737244e-05, "loss": 0.4607, "num_input_tokens_seen": 21544176, "step": 37125 }, { "epoch": 5.530235329162943, "grad_norm": 1.045175552368164, "learning_rate": 4.540322193614123e-05, "loss": 0.5877, "num_input_tokens_seen": 21546928, "step": 37130 }, { "epoch": 5.530980041703902, "grad_norm": 0.9094420671463013, "learning_rate": 4.540134402085352e-05, "loss": 0.6875, "num_input_tokens_seen": 21549904, "step": 37135 }, { "epoch": 5.531724754244862, "grad_norm": 0.8882014155387878, "learning_rate": 4.539946576090584e-05, "loss": 0.6448, "num_input_tokens_seen": 21553136, "step": 37140 }, { "epoch": 5.5324694667858205, "grad_norm": 0.9240052700042725, "learning_rate": 4.539758715632992e-05, "loss": 0.788, "num_input_tokens_seen": 21556144, "step": 37145 }, { "epoch": 5.53321417932678, "grad_norm": 1.7549811601638794, "learning_rate": 4.539570820715749e-05, "loss": 0.6188, "num_input_tokens_seen": 21559024, "step": 37150 }, { "epoch": 5.533958891867739, "grad_norm": 0.9703622460365295, "learning_rate": 4.53938289134203e-05, "loss": 0.6301, "num_input_tokens_seen": 21562224, "step": 37155 }, { "epoch": 5.534703604408699, "grad_norm": 1.0832111835479736, "learning_rate": 4.5391949275150104e-05, "loss": 0.7703, "num_input_tokens_seen": 21565264, "step": 37160 }, { "epoch": 5.535448316949657, "grad_norm": 1.1446720361709595, "learning_rate": 4.539006929237864e-05, "loss": 0.6374, "num_input_tokens_seen": 21568400, "step": 37165 }, { "epoch": 5.536193029490617, "grad_norm": 1.0160512924194336, "learning_rate": 4.538818896513769e-05, "loss": 0.6885, "num_input_tokens_seen": 21571216, "step": 37170 }, { "epoch": 5.536937742031576, "grad_norm": 1.1542491912841797, "learning_rate": 4.5386308293459e-05, "loss": 0.6532, "num_input_tokens_seen": 21574192, "step": 37175 }, { "epoch": 5.537682454572535, "grad_norm": 1.1118541955947876, "learning_rate": 4.5384427277374355e-05, "loss": 0.6797, "num_input_tokens_seen": 21577040, "step": 37180 }, { "epoch": 5.538427167113494, "grad_norm": 1.2155938148498535, "learning_rate": 4.538254591691553e-05, "loss": 0.7044, "num_input_tokens_seen": 21579920, "step": 37185 }, { "epoch": 5.539171879654454, "grad_norm": 1.4162442684173584, "learning_rate": 4.538066421211431e-05, "loss": 0.6263, "num_input_tokens_seen": 21582608, "step": 37190 }, { "epoch": 5.5399165921954125, "grad_norm": 0.9052091240882874, "learning_rate": 4.5378782163002476e-05, "loss": 0.6599, "num_input_tokens_seen": 21585520, "step": 37195 }, { "epoch": 5.540661304736371, "grad_norm": 1.158296823501587, "learning_rate": 4.537689976961184e-05, "loss": 0.5926, "num_input_tokens_seen": 21588496, "step": 37200 }, { "epoch": 5.541406017277331, "grad_norm": 0.8419106602668762, "learning_rate": 4.537501703197418e-05, "loss": 0.5932, "num_input_tokens_seen": 21591536, "step": 37205 }, { "epoch": 5.542150729818291, "grad_norm": 1.0163936614990234, "learning_rate": 4.5373133950121314e-05, "loss": 0.6115, "num_input_tokens_seen": 21594192, "step": 37210 }, { "epoch": 5.542895442359249, "grad_norm": 0.7944679856300354, "learning_rate": 4.537125052408506e-05, "loss": 0.6671, "num_input_tokens_seen": 21597136, "step": 37215 }, { "epoch": 5.543640154900208, "grad_norm": 1.4631649255752563, "learning_rate": 4.536936675389724e-05, "loss": 0.6957, "num_input_tokens_seen": 21599984, "step": 37220 }, { "epoch": 5.544384867441168, "grad_norm": 1.0348973274230957, "learning_rate": 4.5367482639589665e-05, "loss": 0.4657, "num_input_tokens_seen": 21602736, "step": 37225 }, { "epoch": 5.5451295799821265, "grad_norm": 1.487221121788025, "learning_rate": 4.536559818119418e-05, "loss": 0.6929, "num_input_tokens_seen": 21605616, "step": 37230 }, { "epoch": 5.545874292523086, "grad_norm": 2.2997045516967773, "learning_rate": 4.53637133787426e-05, "loss": 0.4935, "num_input_tokens_seen": 21608912, "step": 37235 }, { "epoch": 5.546619005064045, "grad_norm": 0.8756985068321228, "learning_rate": 4.536182823226678e-05, "loss": 0.5477, "num_input_tokens_seen": 21611920, "step": 37240 }, { "epoch": 5.547363717605005, "grad_norm": 0.8066134452819824, "learning_rate": 4.535994274179858e-05, "loss": 0.6677, "num_input_tokens_seen": 21614704, "step": 37245 }, { "epoch": 5.548108430145963, "grad_norm": 0.9559898972511292, "learning_rate": 4.535805690736983e-05, "loss": 0.5403, "num_input_tokens_seen": 21617648, "step": 37250 }, { "epoch": 5.548853142686923, "grad_norm": 1.0899115800857544, "learning_rate": 4.535617072901239e-05, "loss": 0.5738, "num_input_tokens_seen": 21620432, "step": 37255 }, { "epoch": 5.549597855227882, "grad_norm": 3.9526844024658203, "learning_rate": 4.535428420675816e-05, "loss": 0.9747, "num_input_tokens_seen": 21623216, "step": 37260 }, { "epoch": 5.550342567768841, "grad_norm": 1.2179533243179321, "learning_rate": 4.535239734063896e-05, "loss": 0.5556, "num_input_tokens_seen": 21626064, "step": 37265 }, { "epoch": 5.5510872803098, "grad_norm": 1.367464303970337, "learning_rate": 4.535051013068671e-05, "loss": 0.6842, "num_input_tokens_seen": 21629328, "step": 37270 }, { "epoch": 5.55183199285076, "grad_norm": 1.7132227420806885, "learning_rate": 4.5348622576933265e-05, "loss": 0.6108, "num_input_tokens_seen": 21632240, "step": 37275 }, { "epoch": 5.5525767053917185, "grad_norm": 1.3749961853027344, "learning_rate": 4.534673467941053e-05, "loss": 0.6776, "num_input_tokens_seen": 21635024, "step": 37280 }, { "epoch": 5.553321417932678, "grad_norm": 1.3392868041992188, "learning_rate": 4.534484643815038e-05, "loss": 0.7607, "num_input_tokens_seen": 21637872, "step": 37285 }, { "epoch": 5.554066130473637, "grad_norm": 1.2034809589385986, "learning_rate": 4.534295785318474e-05, "loss": 0.8246, "num_input_tokens_seen": 21640944, "step": 37290 }, { "epoch": 5.554810843014597, "grad_norm": 1.0176167488098145, "learning_rate": 4.53410689245455e-05, "loss": 0.5218, "num_input_tokens_seen": 21644048, "step": 37295 }, { "epoch": 5.555555555555555, "grad_norm": 0.9365347027778625, "learning_rate": 4.5339179652264576e-05, "loss": 0.6288, "num_input_tokens_seen": 21646832, "step": 37300 }, { "epoch": 5.556300268096515, "grad_norm": 1.0508127212524414, "learning_rate": 4.5337290036373875e-05, "loss": 0.5274, "num_input_tokens_seen": 21649680, "step": 37305 }, { "epoch": 5.557044980637474, "grad_norm": 0.6619358658790588, "learning_rate": 4.533540007690533e-05, "loss": 0.5858, "num_input_tokens_seen": 21652464, "step": 37310 }, { "epoch": 5.557789693178433, "grad_norm": 1.1037514209747314, "learning_rate": 4.533350977389087e-05, "loss": 0.6284, "num_input_tokens_seen": 21655344, "step": 37315 }, { "epoch": 5.558534405719392, "grad_norm": 0.7432914972305298, "learning_rate": 4.533161912736243e-05, "loss": 0.6465, "num_input_tokens_seen": 21658480, "step": 37320 }, { "epoch": 5.559279118260352, "grad_norm": 1.0171977281570435, "learning_rate": 4.532972813735196e-05, "loss": 0.6189, "num_input_tokens_seen": 21661232, "step": 37325 }, { "epoch": 5.560023830801311, "grad_norm": 1.2903165817260742, "learning_rate": 4.532783680389138e-05, "loss": 0.6225, "num_input_tokens_seen": 21664112, "step": 37330 }, { "epoch": 5.56076854334227, "grad_norm": 0.6934966444969177, "learning_rate": 4.532594512701266e-05, "loss": 0.6595, "num_input_tokens_seen": 21666640, "step": 37335 }, { "epoch": 5.561513255883229, "grad_norm": 1.1854153871536255, "learning_rate": 4.532405310674776e-05, "loss": 0.5927, "num_input_tokens_seen": 21669968, "step": 37340 }, { "epoch": 5.562257968424189, "grad_norm": 1.5631704330444336, "learning_rate": 4.532216074312864e-05, "loss": 0.5623, "num_input_tokens_seen": 21672912, "step": 37345 }, { "epoch": 5.563002680965147, "grad_norm": 1.2423245906829834, "learning_rate": 4.5320268036187266e-05, "loss": 0.5285, "num_input_tokens_seen": 21675760, "step": 37350 }, { "epoch": 5.563747393506107, "grad_norm": 0.978096067905426, "learning_rate": 4.531837498595561e-05, "loss": 0.5647, "num_input_tokens_seen": 21678480, "step": 37355 }, { "epoch": 5.564492106047066, "grad_norm": 1.6199240684509277, "learning_rate": 4.531648159246567e-05, "loss": 0.6242, "num_input_tokens_seen": 21681360, "step": 37360 }, { "epoch": 5.5652368185880245, "grad_norm": 0.8429867625236511, "learning_rate": 4.531458785574941e-05, "loss": 0.6311, "num_input_tokens_seen": 21684496, "step": 37365 }, { "epoch": 5.565981531128984, "grad_norm": 1.5405853986740112, "learning_rate": 4.531269377583885e-05, "loss": 0.7828, "num_input_tokens_seen": 21687152, "step": 37370 }, { "epoch": 5.566726243669944, "grad_norm": 0.7910005450248718, "learning_rate": 4.5310799352765964e-05, "loss": 0.6386, "num_input_tokens_seen": 21689936, "step": 37375 }, { "epoch": 5.567470956210903, "grad_norm": 1.0573821067810059, "learning_rate": 4.5308904586562774e-05, "loss": 0.6827, "num_input_tokens_seen": 21692592, "step": 37380 }, { "epoch": 5.568215668751861, "grad_norm": 1.1935707330703735, "learning_rate": 4.530700947726127e-05, "loss": 0.6181, "num_input_tokens_seen": 21695440, "step": 37385 }, { "epoch": 5.568960381292821, "grad_norm": 1.1027413606643677, "learning_rate": 4.530511402489349e-05, "loss": 0.6649, "num_input_tokens_seen": 21698640, "step": 37390 }, { "epoch": 5.569705093833781, "grad_norm": 1.0252190828323364, "learning_rate": 4.530321822949144e-05, "loss": 0.5257, "num_input_tokens_seen": 21701328, "step": 37395 }, { "epoch": 5.570449806374739, "grad_norm": 1.7136203050613403, "learning_rate": 4.530132209108715e-05, "loss": 0.7124, "num_input_tokens_seen": 21704112, "step": 37400 }, { "epoch": 5.571194518915698, "grad_norm": 1.1388108730316162, "learning_rate": 4.529942560971266e-05, "loss": 0.6663, "num_input_tokens_seen": 21706736, "step": 37405 }, { "epoch": 5.571939231456658, "grad_norm": 1.0742267370224, "learning_rate": 4.529752878540001e-05, "loss": 0.8172, "num_input_tokens_seen": 21709360, "step": 37410 }, { "epoch": 5.572683943997617, "grad_norm": 0.7212013006210327, "learning_rate": 4.529563161818124e-05, "loss": 0.526, "num_input_tokens_seen": 21711920, "step": 37415 }, { "epoch": 5.573428656538576, "grad_norm": 1.350052833557129, "learning_rate": 4.529373410808841e-05, "loss": 0.73, "num_input_tokens_seen": 21714928, "step": 37420 }, { "epoch": 5.574173369079535, "grad_norm": 0.892358660697937, "learning_rate": 4.5291836255153555e-05, "loss": 0.4261, "num_input_tokens_seen": 21717872, "step": 37425 }, { "epoch": 5.574918081620495, "grad_norm": 1.0317466259002686, "learning_rate": 4.528993805940874e-05, "loss": 0.721, "num_input_tokens_seen": 21721072, "step": 37430 }, { "epoch": 5.575662794161453, "grad_norm": 1.0108277797698975, "learning_rate": 4.528803952088606e-05, "loss": 0.6578, "num_input_tokens_seen": 21724208, "step": 37435 }, { "epoch": 5.576407506702413, "grad_norm": 5.499897480010986, "learning_rate": 4.5286140639617566e-05, "loss": 0.5464, "num_input_tokens_seen": 21727248, "step": 37440 }, { "epoch": 5.577152219243372, "grad_norm": 0.7994077205657959, "learning_rate": 4.528424141563535e-05, "loss": 0.6733, "num_input_tokens_seen": 21730448, "step": 37445 }, { "epoch": 5.577896931784331, "grad_norm": 1.9976361989974976, "learning_rate": 4.528234184897149e-05, "loss": 0.6481, "num_input_tokens_seen": 21733488, "step": 37450 }, { "epoch": 5.57864164432529, "grad_norm": 1.098598599433899, "learning_rate": 4.528044193965807e-05, "loss": 0.5838, "num_input_tokens_seen": 21736432, "step": 37455 }, { "epoch": 5.57938635686625, "grad_norm": 0.8443538546562195, "learning_rate": 4.527854168772721e-05, "loss": 0.4192, "num_input_tokens_seen": 21739184, "step": 37460 }, { "epoch": 5.580131069407209, "grad_norm": 1.3298300504684448, "learning_rate": 4.527664109321098e-05, "loss": 0.594, "num_input_tokens_seen": 21741776, "step": 37465 }, { "epoch": 5.580875781948168, "grad_norm": 1.024542212486267, "learning_rate": 4.5274740156141516e-05, "loss": 0.7326, "num_input_tokens_seen": 21744528, "step": 37470 }, { "epoch": 5.581620494489127, "grad_norm": 1.432489037513733, "learning_rate": 4.527283887655093e-05, "loss": 0.6877, "num_input_tokens_seen": 21747312, "step": 37475 }, { "epoch": 5.582365207030087, "grad_norm": 1.2624366283416748, "learning_rate": 4.5270937254471325e-05, "loss": 0.6159, "num_input_tokens_seen": 21750096, "step": 37480 }, { "epoch": 5.583109919571045, "grad_norm": 0.817518949508667, "learning_rate": 4.526903528993484e-05, "loss": 0.5954, "num_input_tokens_seen": 21753104, "step": 37485 }, { "epoch": 5.583854632112005, "grad_norm": 0.9467719197273254, "learning_rate": 4.526713298297361e-05, "loss": 0.6693, "num_input_tokens_seen": 21755888, "step": 37490 }, { "epoch": 5.584599344652964, "grad_norm": 0.7869559526443481, "learning_rate": 4.526523033361976e-05, "loss": 0.6775, "num_input_tokens_seen": 21758736, "step": 37495 }, { "epoch": 5.5853440571939235, "grad_norm": 1.5694118738174438, "learning_rate": 4.5263327341905443e-05, "loss": 0.6823, "num_input_tokens_seen": 21761616, "step": 37500 }, { "epoch": 5.586088769734882, "grad_norm": 1.6894699335098267, "learning_rate": 4.526142400786281e-05, "loss": 0.7847, "num_input_tokens_seen": 21764336, "step": 37505 }, { "epoch": 5.586833482275842, "grad_norm": 0.942366361618042, "learning_rate": 4.5259520331524004e-05, "loss": 0.6683, "num_input_tokens_seen": 21767280, "step": 37510 }, { "epoch": 5.587578194816801, "grad_norm": 1.230642318725586, "learning_rate": 4.525761631292119e-05, "loss": 0.8073, "num_input_tokens_seen": 21770224, "step": 37515 }, { "epoch": 5.58832290735776, "grad_norm": 0.71394944190979, "learning_rate": 4.5255711952086545e-05, "loss": 0.6372, "num_input_tokens_seen": 21773200, "step": 37520 }, { "epoch": 5.589067619898719, "grad_norm": 0.8611265420913696, "learning_rate": 4.525380724905224e-05, "loss": 0.6134, "num_input_tokens_seen": 21776176, "step": 37525 }, { "epoch": 5.589812332439678, "grad_norm": 1.4184802770614624, "learning_rate": 4.525190220385043e-05, "loss": 0.6052, "num_input_tokens_seen": 21778928, "step": 37530 }, { "epoch": 5.590557044980637, "grad_norm": 2.0704758167266846, "learning_rate": 4.5249996816513325e-05, "loss": 0.5977, "num_input_tokens_seen": 21782192, "step": 37535 }, { "epoch": 5.591301757521597, "grad_norm": 1.152298092842102, "learning_rate": 4.52480910870731e-05, "loss": 0.6957, "num_input_tokens_seen": 21785040, "step": 37540 }, { "epoch": 5.592046470062556, "grad_norm": 1.4262531995773315, "learning_rate": 4.524618501556196e-05, "loss": 0.6867, "num_input_tokens_seen": 21787792, "step": 37545 }, { "epoch": 5.592791182603515, "grad_norm": 1.1127784252166748, "learning_rate": 4.52442786020121e-05, "loss": 0.5895, "num_input_tokens_seen": 21790576, "step": 37550 }, { "epoch": 5.593535895144474, "grad_norm": 1.0290685892105103, "learning_rate": 4.524237184645573e-05, "loss": 0.4626, "num_input_tokens_seen": 21793360, "step": 37555 }, { "epoch": 5.594280607685434, "grad_norm": 1.5451171398162842, "learning_rate": 4.524046474892506e-05, "loss": 0.6203, "num_input_tokens_seen": 21796304, "step": 37560 }, { "epoch": 5.595025320226393, "grad_norm": 0.6981661319732666, "learning_rate": 4.523855730945231e-05, "loss": 0.6875, "num_input_tokens_seen": 21799568, "step": 37565 }, { "epoch": 5.595770032767351, "grad_norm": 2.1829349994659424, "learning_rate": 4.52366495280697e-05, "loss": 0.6921, "num_input_tokens_seen": 21802384, "step": 37570 }, { "epoch": 5.596514745308311, "grad_norm": 1.032152771949768, "learning_rate": 4.523474140480947e-05, "loss": 0.6913, "num_input_tokens_seen": 21805264, "step": 37575 }, { "epoch": 5.59725945784927, "grad_norm": 0.6619634032249451, "learning_rate": 4.5232832939703846e-05, "loss": 0.54, "num_input_tokens_seen": 21808048, "step": 37580 }, { "epoch": 5.5980041703902295, "grad_norm": 0.8777989149093628, "learning_rate": 4.5230924132785066e-05, "loss": 0.7496, "num_input_tokens_seen": 21810992, "step": 37585 }, { "epoch": 5.598748882931188, "grad_norm": 1.5977017879486084, "learning_rate": 4.52290149840854e-05, "loss": 0.6074, "num_input_tokens_seen": 21813648, "step": 37590 }, { "epoch": 5.599493595472148, "grad_norm": 0.6288974285125732, "learning_rate": 4.522710549363708e-05, "loss": 0.4413, "num_input_tokens_seen": 21816624, "step": 37595 }, { "epoch": 5.600238308013107, "grad_norm": 1.9699149131774902, "learning_rate": 4.5225195661472364e-05, "loss": 0.7029, "num_input_tokens_seen": 21819600, "step": 37600 }, { "epoch": 5.600983020554066, "grad_norm": 0.9095305800437927, "learning_rate": 4.522328548762353e-05, "loss": 0.6068, "num_input_tokens_seen": 21822480, "step": 37605 }, { "epoch": 5.601727733095025, "grad_norm": 0.965965986251831, "learning_rate": 4.5221374972122837e-05, "loss": 0.6009, "num_input_tokens_seen": 21825328, "step": 37610 }, { "epoch": 5.602472445635985, "grad_norm": 0.9115407466888428, "learning_rate": 4.521946411500257e-05, "loss": 0.5997, "num_input_tokens_seen": 21827952, "step": 37615 }, { "epoch": 5.603217158176943, "grad_norm": 1.5321978330612183, "learning_rate": 4.5217552916295e-05, "loss": 0.7626, "num_input_tokens_seen": 21830640, "step": 37620 }, { "epoch": 5.603961870717903, "grad_norm": 1.0769444704055786, "learning_rate": 4.521564137603244e-05, "loss": 0.519, "num_input_tokens_seen": 21833584, "step": 37625 }, { "epoch": 5.604706583258862, "grad_norm": 0.8659643530845642, "learning_rate": 4.521372949424715e-05, "loss": 0.6628, "num_input_tokens_seen": 21836336, "step": 37630 }, { "epoch": 5.6054512957998215, "grad_norm": 0.8819825649261475, "learning_rate": 4.521181727097144e-05, "loss": 0.5923, "num_input_tokens_seen": 21839248, "step": 37635 }, { "epoch": 5.60619600834078, "grad_norm": 1.9705371856689453, "learning_rate": 4.5209904706237626e-05, "loss": 0.699, "num_input_tokens_seen": 21842096, "step": 37640 }, { "epoch": 5.60694072088174, "grad_norm": 2.5646538734436035, "learning_rate": 4.5207991800078015e-05, "loss": 0.8009, "num_input_tokens_seen": 21845040, "step": 37645 }, { "epoch": 5.607685433422699, "grad_norm": 0.5316250324249268, "learning_rate": 4.520607855252492e-05, "loss": 0.6499, "num_input_tokens_seen": 21847760, "step": 37650 }, { "epoch": 5.608430145963658, "grad_norm": 1.3602653741836548, "learning_rate": 4.520416496361066e-05, "loss": 0.7183, "num_input_tokens_seen": 21851088, "step": 37655 }, { "epoch": 5.609174858504617, "grad_norm": 1.0737422704696655, "learning_rate": 4.5202251033367574e-05, "loss": 0.6139, "num_input_tokens_seen": 21854096, "step": 37660 }, { "epoch": 5.609919571045577, "grad_norm": 0.770216703414917, "learning_rate": 4.5200336761827985e-05, "loss": 0.5982, "num_input_tokens_seen": 21856944, "step": 37665 }, { "epoch": 5.6106642835865355, "grad_norm": 1.1154119968414307, "learning_rate": 4.519842214902423e-05, "loss": 0.5496, "num_input_tokens_seen": 21859792, "step": 37670 }, { "epoch": 5.611408996127495, "grad_norm": 2.264885187149048, "learning_rate": 4.519650719498868e-05, "loss": 0.6818, "num_input_tokens_seen": 21862896, "step": 37675 }, { "epoch": 5.612153708668454, "grad_norm": 1.818077564239502, "learning_rate": 4.519459189975365e-05, "loss": 0.793, "num_input_tokens_seen": 21865808, "step": 37680 }, { "epoch": 5.6128984212094135, "grad_norm": 1.131365418434143, "learning_rate": 4.519267626335153e-05, "loss": 0.5287, "num_input_tokens_seen": 21868720, "step": 37685 }, { "epoch": 5.613643133750372, "grad_norm": 0.8063929080963135, "learning_rate": 4.519076028581466e-05, "loss": 0.6895, "num_input_tokens_seen": 21871472, "step": 37690 }, { "epoch": 5.614387846291332, "grad_norm": 1.0819411277770996, "learning_rate": 4.518884396717541e-05, "loss": 0.7906, "num_input_tokens_seen": 21874320, "step": 37695 }, { "epoch": 5.615132558832291, "grad_norm": 1.176487684249878, "learning_rate": 4.518692730746616e-05, "loss": 0.6388, "num_input_tokens_seen": 21877168, "step": 37700 }, { "epoch": 5.61587727137325, "grad_norm": 1.1612433195114136, "learning_rate": 4.51850103067193e-05, "loss": 0.6225, "num_input_tokens_seen": 21880016, "step": 37705 }, { "epoch": 5.616621983914209, "grad_norm": 1.2536842823028564, "learning_rate": 4.5183092964967204e-05, "loss": 0.6513, "num_input_tokens_seen": 21882704, "step": 37710 }, { "epoch": 5.617366696455168, "grad_norm": 1.0050808191299438, "learning_rate": 4.518117528224226e-05, "loss": 0.626, "num_input_tokens_seen": 21885744, "step": 37715 }, { "epoch": 5.6181114089961275, "grad_norm": 0.849913477897644, "learning_rate": 4.517925725857688e-05, "loss": 0.5862, "num_input_tokens_seen": 21888560, "step": 37720 }, { "epoch": 5.618856121537087, "grad_norm": 1.9060869216918945, "learning_rate": 4.5177338894003454e-05, "loss": 0.6199, "num_input_tokens_seen": 21891600, "step": 37725 }, { "epoch": 5.619600834078046, "grad_norm": 1.2494772672653198, "learning_rate": 4.517542018855439e-05, "loss": 0.5315, "num_input_tokens_seen": 21894416, "step": 37730 }, { "epoch": 5.620345546619005, "grad_norm": 1.026196837425232, "learning_rate": 4.517350114226211e-05, "loss": 0.6847, "num_input_tokens_seen": 21897712, "step": 37735 }, { "epoch": 5.621090259159964, "grad_norm": 0.8532106876373291, "learning_rate": 4.517158175515903e-05, "loss": 0.6456, "num_input_tokens_seen": 21900752, "step": 37740 }, { "epoch": 5.621834971700923, "grad_norm": 0.8904659152030945, "learning_rate": 4.516966202727758e-05, "loss": 0.7773, "num_input_tokens_seen": 21903728, "step": 37745 }, { "epoch": 5.622579684241883, "grad_norm": 0.742835283279419, "learning_rate": 4.516774195865019e-05, "loss": 0.5991, "num_input_tokens_seen": 21906704, "step": 37750 }, { "epoch": 5.6233243967828415, "grad_norm": 1.3092867136001587, "learning_rate": 4.5165821549309294e-05, "loss": 0.6347, "num_input_tokens_seen": 21909552, "step": 37755 }, { "epoch": 5.624069109323801, "grad_norm": 1.3964180946350098, "learning_rate": 4.516390079928734e-05, "loss": 0.701, "num_input_tokens_seen": 21912432, "step": 37760 }, { "epoch": 5.62481382186476, "grad_norm": 0.9652478098869324, "learning_rate": 4.516197970861679e-05, "loss": 0.6058, "num_input_tokens_seen": 21915408, "step": 37765 }, { "epoch": 5.6255585344057195, "grad_norm": 1.146533727645874, "learning_rate": 4.5160058277330066e-05, "loss": 0.7674, "num_input_tokens_seen": 21918256, "step": 37770 }, { "epoch": 5.626303246946678, "grad_norm": 1.3210982084274292, "learning_rate": 4.515813650545965e-05, "loss": 0.7482, "num_input_tokens_seen": 21921392, "step": 37775 }, { "epoch": 5.627047959487638, "grad_norm": 0.6523802876472473, "learning_rate": 4.515621439303801e-05, "loss": 0.5316, "num_input_tokens_seen": 21924240, "step": 37780 }, { "epoch": 5.627792672028597, "grad_norm": 2.285999059677124, "learning_rate": 4.515429194009761e-05, "loss": 0.6619, "num_input_tokens_seen": 21927184, "step": 37785 }, { "epoch": 5.628537384569556, "grad_norm": 0.8520255088806152, "learning_rate": 4.515236914667094e-05, "loss": 0.6385, "num_input_tokens_seen": 21929936, "step": 37790 }, { "epoch": 5.629282097110515, "grad_norm": 0.9911006093025208, "learning_rate": 4.515044601279046e-05, "loss": 0.5051, "num_input_tokens_seen": 21932912, "step": 37795 }, { "epoch": 5.630026809651475, "grad_norm": 1.4626890420913696, "learning_rate": 4.514852253848868e-05, "loss": 0.5498, "num_input_tokens_seen": 21935888, "step": 37800 }, { "epoch": 5.6307715221924335, "grad_norm": 1.030205488204956, "learning_rate": 4.51465987237981e-05, "loss": 0.6509, "num_input_tokens_seen": 21938672, "step": 37805 }, { "epoch": 5.631516234733393, "grad_norm": 1.3323495388031006, "learning_rate": 4.51446745687512e-05, "loss": 0.7988, "num_input_tokens_seen": 21941552, "step": 37810 }, { "epoch": 5.632260947274352, "grad_norm": 1.0925381183624268, "learning_rate": 4.5142750073380505e-05, "loss": 0.6714, "num_input_tokens_seen": 21944336, "step": 37815 }, { "epoch": 5.6330056598153115, "grad_norm": 1.2102444171905518, "learning_rate": 4.514082523771851e-05, "loss": 0.5172, "num_input_tokens_seen": 21947056, "step": 37820 }, { "epoch": 5.63375037235627, "grad_norm": 1.1741385459899902, "learning_rate": 4.513890006179775e-05, "loss": 0.5488, "num_input_tokens_seen": 21949872, "step": 37825 }, { "epoch": 5.63449508489723, "grad_norm": 1.0782814025878906, "learning_rate": 4.513697454565074e-05, "loss": 0.6412, "num_input_tokens_seen": 21952368, "step": 37830 }, { "epoch": 5.635239797438189, "grad_norm": 1.261705756187439, "learning_rate": 4.513504868931001e-05, "loss": 0.7524, "num_input_tokens_seen": 21955344, "step": 37835 }, { "epoch": 5.635984509979148, "grad_norm": 0.9505009651184082, "learning_rate": 4.51331224928081e-05, "loss": 0.7076, "num_input_tokens_seen": 21958064, "step": 37840 }, { "epoch": 5.636729222520107, "grad_norm": 0.6267655491828918, "learning_rate": 4.5131195956177546e-05, "loss": 0.7563, "num_input_tokens_seen": 21961072, "step": 37845 }, { "epoch": 5.637473935061067, "grad_norm": 1.1792333126068115, "learning_rate": 4.5129269079450894e-05, "loss": 0.8056, "num_input_tokens_seen": 21964208, "step": 37850 }, { "epoch": 5.6382186476020255, "grad_norm": 1.1861181259155273, "learning_rate": 4.512734186266071e-05, "loss": 0.4861, "num_input_tokens_seen": 21967120, "step": 37855 }, { "epoch": 5.638963360142985, "grad_norm": 0.9065684080123901, "learning_rate": 4.512541430583953e-05, "loss": 0.556, "num_input_tokens_seen": 21970000, "step": 37860 }, { "epoch": 5.639708072683944, "grad_norm": 1.2483160495758057, "learning_rate": 4.5123486409019936e-05, "loss": 0.7473, "num_input_tokens_seen": 21972944, "step": 37865 }, { "epoch": 5.640452785224904, "grad_norm": 0.6746602058410645, "learning_rate": 4.5121558172234484e-05, "loss": 0.6425, "num_input_tokens_seen": 21975792, "step": 37870 }, { "epoch": 5.641197497765862, "grad_norm": 1.1329874992370605, "learning_rate": 4.511962959551576e-05, "loss": 0.6732, "num_input_tokens_seen": 21978448, "step": 37875 }, { "epoch": 5.641942210306821, "grad_norm": 0.6243996024131775, "learning_rate": 4.511770067889635e-05, "loss": 0.5747, "num_input_tokens_seen": 21981552, "step": 37880 }, { "epoch": 5.642686922847781, "grad_norm": 1.541455864906311, "learning_rate": 4.5115771422408826e-05, "loss": 0.7634, "num_input_tokens_seen": 21984304, "step": 37885 }, { "epoch": 5.64343163538874, "grad_norm": 1.6719950437545776, "learning_rate": 4.5113841826085796e-05, "loss": 0.7864, "num_input_tokens_seen": 21986928, "step": 37890 }, { "epoch": 5.644176347929699, "grad_norm": 1.4589331150054932, "learning_rate": 4.5111911889959846e-05, "loss": 0.7273, "num_input_tokens_seen": 21989680, "step": 37895 }, { "epoch": 5.644921060470658, "grad_norm": 1.6389576196670532, "learning_rate": 4.5109981614063584e-05, "loss": 0.6726, "num_input_tokens_seen": 21992464, "step": 37900 }, { "epoch": 5.6456657730116175, "grad_norm": 0.857315182685852, "learning_rate": 4.510805099842963e-05, "loss": 0.6327, "num_input_tokens_seen": 21995216, "step": 37905 }, { "epoch": 5.646410485552577, "grad_norm": 1.213415503501892, "learning_rate": 4.5106120043090585e-05, "loss": 0.6319, "num_input_tokens_seen": 21998032, "step": 37910 }, { "epoch": 5.647155198093536, "grad_norm": 0.7628365755081177, "learning_rate": 4.510418874807907e-05, "loss": 0.5478, "num_input_tokens_seen": 22001008, "step": 37915 }, { "epoch": 5.647899910634495, "grad_norm": 1.501473069190979, "learning_rate": 4.5102257113427726e-05, "loss": 0.7388, "num_input_tokens_seen": 22004080, "step": 37920 }, { "epoch": 5.648644623175454, "grad_norm": 1.9458818435668945, "learning_rate": 4.510032513916919e-05, "loss": 0.8082, "num_input_tokens_seen": 22007248, "step": 37925 }, { "epoch": 5.649389335716413, "grad_norm": 2.1592955589294434, "learning_rate": 4.509839282533607e-05, "loss": 0.6861, "num_input_tokens_seen": 22010128, "step": 37930 }, { "epoch": 5.650134048257373, "grad_norm": 1.1734105348587036, "learning_rate": 4.509646017196104e-05, "loss": 0.5736, "num_input_tokens_seen": 22012848, "step": 37935 }, { "epoch": 5.6508787607983315, "grad_norm": 1.4616458415985107, "learning_rate": 4.509452717907674e-05, "loss": 0.5919, "num_input_tokens_seen": 22015888, "step": 37940 }, { "epoch": 5.651623473339291, "grad_norm": 1.2326033115386963, "learning_rate": 4.509259384671582e-05, "loss": 0.6428, "num_input_tokens_seen": 22018704, "step": 37945 }, { "epoch": 5.65236818588025, "grad_norm": 1.5163012742996216, "learning_rate": 4.509066017491096e-05, "loss": 0.4565, "num_input_tokens_seen": 22021264, "step": 37950 }, { "epoch": 5.65311289842121, "grad_norm": 1.4562588930130005, "learning_rate": 4.508872616369481e-05, "loss": 0.8199, "num_input_tokens_seen": 22023984, "step": 37955 }, { "epoch": 5.653857610962168, "grad_norm": 1.1246718168258667, "learning_rate": 4.508679181310005e-05, "loss": 0.6078, "num_input_tokens_seen": 22026960, "step": 37960 }, { "epoch": 5.654602323503128, "grad_norm": 3.8877406120300293, "learning_rate": 4.508485712315935e-05, "loss": 0.7029, "num_input_tokens_seen": 22029616, "step": 37965 }, { "epoch": 5.655347036044087, "grad_norm": 1.077419400215149, "learning_rate": 4.508292209390541e-05, "loss": 0.556, "num_input_tokens_seen": 22032720, "step": 37970 }, { "epoch": 5.656091748585046, "grad_norm": 1.1718156337738037, "learning_rate": 4.5080986725370914e-05, "loss": 0.6608, "num_input_tokens_seen": 22035888, "step": 37975 }, { "epoch": 5.656836461126005, "grad_norm": 1.4197523593902588, "learning_rate": 4.507905101758855e-05, "loss": 0.6374, "num_input_tokens_seen": 22038640, "step": 37980 }, { "epoch": 5.657581173666965, "grad_norm": 1.194759726524353, "learning_rate": 4.507711497059104e-05, "loss": 0.6845, "num_input_tokens_seen": 22041264, "step": 37985 }, { "epoch": 5.6583258862079235, "grad_norm": 1.1420860290527344, "learning_rate": 4.5075178584411064e-05, "loss": 0.6779, "num_input_tokens_seen": 22044048, "step": 37990 }, { "epoch": 5.659070598748883, "grad_norm": 1.5054973363876343, "learning_rate": 4.507324185908135e-05, "loss": 0.6756, "num_input_tokens_seen": 22046512, "step": 37995 }, { "epoch": 5.659815311289842, "grad_norm": 1.2900618314743042, "learning_rate": 4.507130479463462e-05, "loss": 0.7245, "num_input_tokens_seen": 22049424, "step": 38000 }, { "epoch": 5.660560023830802, "grad_norm": 0.8808643221855164, "learning_rate": 4.50693673911036e-05, "loss": 0.6324, "num_input_tokens_seen": 22052464, "step": 38005 }, { "epoch": 5.66130473637176, "grad_norm": 0.9095359444618225, "learning_rate": 4.506742964852101e-05, "loss": 0.525, "num_input_tokens_seen": 22055216, "step": 38010 }, { "epoch": 5.66204944891272, "grad_norm": 1.5415695905685425, "learning_rate": 4.506549156691959e-05, "loss": 0.5578, "num_input_tokens_seen": 22058224, "step": 38015 }, { "epoch": 5.662794161453679, "grad_norm": 0.662686824798584, "learning_rate": 4.506355314633209e-05, "loss": 0.7257, "num_input_tokens_seen": 22061392, "step": 38020 }, { "epoch": 5.663538873994638, "grad_norm": 0.7862675786018372, "learning_rate": 4.506161438679125e-05, "loss": 0.6358, "num_input_tokens_seen": 22064336, "step": 38025 }, { "epoch": 5.664283586535597, "grad_norm": 0.9243202209472656, "learning_rate": 4.5059675288329815e-05, "loss": 0.6543, "num_input_tokens_seen": 22067536, "step": 38030 }, { "epoch": 5.665028299076557, "grad_norm": 1.085408091545105, "learning_rate": 4.5057735850980564e-05, "loss": 0.6352, "num_input_tokens_seen": 22070352, "step": 38035 }, { "epoch": 5.665773011617516, "grad_norm": 0.812303364276886, "learning_rate": 4.5055796074776244e-05, "loss": 0.6821, "num_input_tokens_seen": 22073360, "step": 38040 }, { "epoch": 5.666517724158475, "grad_norm": 0.8933157324790955, "learning_rate": 4.505385595974964e-05, "loss": 0.7672, "num_input_tokens_seen": 22076048, "step": 38045 }, { "epoch": 5.667262436699434, "grad_norm": 1.0756688117980957, "learning_rate": 4.505191550593352e-05, "loss": 0.6722, "num_input_tokens_seen": 22078992, "step": 38050 }, { "epoch": 5.668007149240394, "grad_norm": 1.0719314813613892, "learning_rate": 4.5049974713360665e-05, "loss": 0.6195, "num_input_tokens_seen": 22081488, "step": 38055 }, { "epoch": 5.668751861781352, "grad_norm": 1.914902687072754, "learning_rate": 4.504803358206387e-05, "loss": 0.7227, "num_input_tokens_seen": 22084368, "step": 38060 }, { "epoch": 5.669496574322311, "grad_norm": 1.630022406578064, "learning_rate": 4.504609211207591e-05, "loss": 0.5954, "num_input_tokens_seen": 22086992, "step": 38065 }, { "epoch": 5.670241286863271, "grad_norm": 0.939614474773407, "learning_rate": 4.504415030342961e-05, "loss": 0.7132, "num_input_tokens_seen": 22089840, "step": 38070 }, { "epoch": 5.67098599940423, "grad_norm": 1.100093126296997, "learning_rate": 4.504220815615776e-05, "loss": 0.4512, "num_input_tokens_seen": 22092752, "step": 38075 }, { "epoch": 5.671730711945189, "grad_norm": 0.8292966485023499, "learning_rate": 4.5040265670293174e-05, "loss": 0.6322, "num_input_tokens_seen": 22095408, "step": 38080 }, { "epoch": 5.672475424486148, "grad_norm": 1.2011265754699707, "learning_rate": 4.503832284586867e-05, "loss": 0.632, "num_input_tokens_seen": 22098032, "step": 38085 }, { "epoch": 5.673220137027108, "grad_norm": 1.079731822013855, "learning_rate": 4.5036379682917065e-05, "loss": 0.6549, "num_input_tokens_seen": 22100880, "step": 38090 }, { "epoch": 5.673964849568066, "grad_norm": 0.7308080792427063, "learning_rate": 4.503443618147119e-05, "loss": 0.5661, "num_input_tokens_seen": 22103952, "step": 38095 }, { "epoch": 5.674709562109026, "grad_norm": 0.7851502299308777, "learning_rate": 4.503249234156387e-05, "loss": 0.6364, "num_input_tokens_seen": 22106864, "step": 38100 }, { "epoch": 5.675454274649985, "grad_norm": 0.8381107449531555, "learning_rate": 4.503054816322796e-05, "loss": 0.6373, "num_input_tokens_seen": 22109840, "step": 38105 }, { "epoch": 5.676198987190944, "grad_norm": 1.4035470485687256, "learning_rate": 4.50286036464963e-05, "loss": 0.5457, "num_input_tokens_seen": 22113008, "step": 38110 }, { "epoch": 5.676943699731903, "grad_norm": 0.9156206250190735, "learning_rate": 4.502665879140173e-05, "loss": 0.471, "num_input_tokens_seen": 22115824, "step": 38115 }, { "epoch": 5.677688412272863, "grad_norm": 0.9323596954345703, "learning_rate": 4.502471359797712e-05, "loss": 0.6514, "num_input_tokens_seen": 22118928, "step": 38120 }, { "epoch": 5.678433124813822, "grad_norm": 1.117791771888733, "learning_rate": 4.5022768066255315e-05, "loss": 0.5764, "num_input_tokens_seen": 22122160, "step": 38125 }, { "epoch": 5.679177837354781, "grad_norm": 3.202441930770874, "learning_rate": 4.502082219626921e-05, "loss": 0.7941, "num_input_tokens_seen": 22125072, "step": 38130 }, { "epoch": 5.67992254989574, "grad_norm": 0.9811007976531982, "learning_rate": 4.501887598805165e-05, "loss": 0.5342, "num_input_tokens_seen": 22127984, "step": 38135 }, { "epoch": 5.6806672624367, "grad_norm": 1.0726629495620728, "learning_rate": 4.501692944163553e-05, "loss": 0.683, "num_input_tokens_seen": 22130896, "step": 38140 }, { "epoch": 5.681411974977658, "grad_norm": 0.8106951713562012, "learning_rate": 4.501498255705373e-05, "loss": 0.5392, "num_input_tokens_seen": 22133808, "step": 38145 }, { "epoch": 5.682156687518618, "grad_norm": 1.3085569143295288, "learning_rate": 4.501303533433915e-05, "loss": 0.861, "num_input_tokens_seen": 22136784, "step": 38150 }, { "epoch": 5.682901400059577, "grad_norm": 1.6873458623886108, "learning_rate": 4.501108777352467e-05, "loss": 0.8148, "num_input_tokens_seen": 22139824, "step": 38155 }, { "epoch": 5.683646112600536, "grad_norm": 0.9490269422531128, "learning_rate": 4.50091398746432e-05, "loss": 0.7661, "num_input_tokens_seen": 22142832, "step": 38160 }, { "epoch": 5.684390825141495, "grad_norm": 1.0052920579910278, "learning_rate": 4.500719163772765e-05, "loss": 0.6638, "num_input_tokens_seen": 22145520, "step": 38165 }, { "epoch": 5.685135537682455, "grad_norm": 2.0117738246917725, "learning_rate": 4.5005243062810934e-05, "loss": 1.0072, "num_input_tokens_seen": 22148560, "step": 38170 }, { "epoch": 5.685880250223414, "grad_norm": 1.1754974126815796, "learning_rate": 4.500329414992597e-05, "loss": 0.5865, "num_input_tokens_seen": 22151600, "step": 38175 }, { "epoch": 5.686624962764373, "grad_norm": 0.69109708070755, "learning_rate": 4.500134489910567e-05, "loss": 0.5397, "num_input_tokens_seen": 22154864, "step": 38180 }, { "epoch": 5.687369675305332, "grad_norm": 0.6357429623603821, "learning_rate": 4.4999395310382994e-05, "loss": 0.6077, "num_input_tokens_seen": 22157648, "step": 38185 }, { "epoch": 5.688114387846292, "grad_norm": 1.0414047241210938, "learning_rate": 4.4997445383790846e-05, "loss": 0.6395, "num_input_tokens_seen": 22160560, "step": 38190 }, { "epoch": 5.68885910038725, "grad_norm": 1.2299593687057495, "learning_rate": 4.499549511936219e-05, "loss": 0.5855, "num_input_tokens_seen": 22163696, "step": 38195 }, { "epoch": 5.68960381292821, "grad_norm": 0.9352518916130066, "learning_rate": 4.499354451712997e-05, "loss": 0.5577, "num_input_tokens_seen": 22166448, "step": 38200 }, { "epoch": 5.690348525469169, "grad_norm": 1.2319416999816895, "learning_rate": 4.499159357712713e-05, "loss": 0.7369, "num_input_tokens_seen": 22169232, "step": 38205 }, { "epoch": 5.6910932380101285, "grad_norm": 1.0112583637237549, "learning_rate": 4.4989642299386636e-05, "loss": 0.7187, "num_input_tokens_seen": 22172368, "step": 38210 }, { "epoch": 5.691837950551087, "grad_norm": 1.5739840269088745, "learning_rate": 4.498769068394145e-05, "loss": 0.7571, "num_input_tokens_seen": 22175152, "step": 38215 }, { "epoch": 5.692582663092047, "grad_norm": 1.6194285154342651, "learning_rate": 4.498573873082454e-05, "loss": 0.5677, "num_input_tokens_seen": 22177648, "step": 38220 }, { "epoch": 5.693327375633006, "grad_norm": 0.6856042146682739, "learning_rate": 4.4983786440068896e-05, "loss": 0.6804, "num_input_tokens_seen": 22180528, "step": 38225 }, { "epoch": 5.694072088173964, "grad_norm": 0.8425026535987854, "learning_rate": 4.498183381170749e-05, "loss": 0.6132, "num_input_tokens_seen": 22183600, "step": 38230 }, { "epoch": 5.694816800714924, "grad_norm": 1.2360050678253174, "learning_rate": 4.497988084577331e-05, "loss": 0.6468, "num_input_tokens_seen": 22186384, "step": 38235 }, { "epoch": 5.695561513255884, "grad_norm": 0.7146120667457581, "learning_rate": 4.497792754229935e-05, "loss": 0.5935, "num_input_tokens_seen": 22189488, "step": 38240 }, { "epoch": 5.696306225796842, "grad_norm": 1.1778812408447266, "learning_rate": 4.49759739013186e-05, "loss": 0.5973, "num_input_tokens_seen": 22192208, "step": 38245 }, { "epoch": 5.697050938337801, "grad_norm": 0.9971823692321777, "learning_rate": 4.4974019922864086e-05, "loss": 0.5332, "num_input_tokens_seen": 22195248, "step": 38250 }, { "epoch": 5.697795650878761, "grad_norm": 0.7842435836791992, "learning_rate": 4.497206560696881e-05, "loss": 0.7318, "num_input_tokens_seen": 22198384, "step": 38255 }, { "epoch": 5.6985403634197205, "grad_norm": 2.0076041221618652, "learning_rate": 4.497011095366577e-05, "loss": 0.8409, "num_input_tokens_seen": 22201104, "step": 38260 }, { "epoch": 5.699285075960679, "grad_norm": 1.4061269760131836, "learning_rate": 4.4968155962988e-05, "loss": 0.7767, "num_input_tokens_seen": 22203888, "step": 38265 }, { "epoch": 5.700029788501638, "grad_norm": 1.4142946004867554, "learning_rate": 4.496620063496854e-05, "loss": 0.7618, "num_input_tokens_seen": 22207120, "step": 38270 }, { "epoch": 5.700774501042598, "grad_norm": 2.178659677505493, "learning_rate": 4.496424496964041e-05, "loss": 0.6857, "num_input_tokens_seen": 22210256, "step": 38275 }, { "epoch": 5.701519213583556, "grad_norm": 1.0464733839035034, "learning_rate": 4.496228896703665e-05, "loss": 0.7732, "num_input_tokens_seen": 22213136, "step": 38280 }, { "epoch": 5.702263926124516, "grad_norm": 0.725690484046936, "learning_rate": 4.496033262719031e-05, "loss": 0.7458, "num_input_tokens_seen": 22216144, "step": 38285 }, { "epoch": 5.703008638665475, "grad_norm": 0.8229570984840393, "learning_rate": 4.495837595013443e-05, "loss": 0.7073, "num_input_tokens_seen": 22218704, "step": 38290 }, { "epoch": 5.7037533512064345, "grad_norm": 1.3359785079956055, "learning_rate": 4.495641893590209e-05, "loss": 0.5188, "num_input_tokens_seen": 22221680, "step": 38295 }, { "epoch": 5.704498063747393, "grad_norm": 1.6599647998809814, "learning_rate": 4.495446158452632e-05, "loss": 0.6486, "num_input_tokens_seen": 22224368, "step": 38300 }, { "epoch": 5.705242776288353, "grad_norm": 1.0495878458023071, "learning_rate": 4.49525038960402e-05, "loss": 0.7167, "num_input_tokens_seen": 22227280, "step": 38305 }, { "epoch": 5.705987488829312, "grad_norm": 1.4780292510986328, "learning_rate": 4.495054587047682e-05, "loss": 0.6169, "num_input_tokens_seen": 22230064, "step": 38310 }, { "epoch": 5.706732201370271, "grad_norm": 1.0769721269607544, "learning_rate": 4.4948587507869235e-05, "loss": 0.7322, "num_input_tokens_seen": 22233008, "step": 38315 }, { "epoch": 5.70747691391123, "grad_norm": 1.1021652221679688, "learning_rate": 4.494662880825053e-05, "loss": 0.8009, "num_input_tokens_seen": 22236144, "step": 38320 }, { "epoch": 5.70822162645219, "grad_norm": 0.7978659272193909, "learning_rate": 4.494466977165382e-05, "loss": 0.6687, "num_input_tokens_seen": 22239216, "step": 38325 }, { "epoch": 5.708966338993148, "grad_norm": 3.862502098083496, "learning_rate": 4.494271039811217e-05, "loss": 0.7763, "num_input_tokens_seen": 22241808, "step": 38330 }, { "epoch": 5.709711051534108, "grad_norm": 0.5921324491500854, "learning_rate": 4.4940750687658716e-05, "loss": 0.5902, "num_input_tokens_seen": 22244752, "step": 38335 }, { "epoch": 5.710455764075067, "grad_norm": 0.9287495613098145, "learning_rate": 4.4938790640326534e-05, "loss": 0.4994, "num_input_tokens_seen": 22247920, "step": 38340 }, { "epoch": 5.7112004766160265, "grad_norm": 1.0357458591461182, "learning_rate": 4.4936830256148755e-05, "loss": 0.5491, "num_input_tokens_seen": 22250768, "step": 38345 }, { "epoch": 5.711945189156985, "grad_norm": 1.4191341400146484, "learning_rate": 4.493486953515848e-05, "loss": 0.7037, "num_input_tokens_seen": 22253616, "step": 38350 }, { "epoch": 5.712689901697945, "grad_norm": 0.7916328310966492, "learning_rate": 4.493290847738886e-05, "loss": 0.5539, "num_input_tokens_seen": 22256208, "step": 38355 }, { "epoch": 5.713434614238904, "grad_norm": 1.5686367750167847, "learning_rate": 4.4930947082873e-05, "loss": 0.7094, "num_input_tokens_seen": 22259024, "step": 38360 }, { "epoch": 5.714179326779863, "grad_norm": 0.8470624685287476, "learning_rate": 4.492898535164405e-05, "loss": 0.5628, "num_input_tokens_seen": 22262192, "step": 38365 }, { "epoch": 5.714924039320822, "grad_norm": 1.2950328588485718, "learning_rate": 4.492702328373515e-05, "loss": 0.5859, "num_input_tokens_seen": 22264912, "step": 38370 }, { "epoch": 5.715668751861782, "grad_norm": 2.3815078735351562, "learning_rate": 4.492506087917944e-05, "loss": 0.7397, "num_input_tokens_seen": 22267824, "step": 38375 }, { "epoch": 5.7164134644027405, "grad_norm": 0.7481780052185059, "learning_rate": 4.4923098138010064e-05, "loss": 0.8369, "num_input_tokens_seen": 22270960, "step": 38380 }, { "epoch": 5.7171581769437, "grad_norm": 1.1225427389144897, "learning_rate": 4.492113506026021e-05, "loss": 0.6315, "num_input_tokens_seen": 22274032, "step": 38385 }, { "epoch": 5.717902889484659, "grad_norm": 0.9586130976676941, "learning_rate": 4.491917164596303e-05, "loss": 0.8606, "num_input_tokens_seen": 22276848, "step": 38390 }, { "epoch": 5.718647602025618, "grad_norm": 1.1644231081008911, "learning_rate": 4.491720789515168e-05, "loss": 0.6886, "num_input_tokens_seen": 22279760, "step": 38395 }, { "epoch": 5.719392314566577, "grad_norm": 1.5052943229675293, "learning_rate": 4.491524380785935e-05, "loss": 0.69, "num_input_tokens_seen": 22282608, "step": 38400 }, { "epoch": 5.720137027107537, "grad_norm": 0.8898491859436035, "learning_rate": 4.4913279384119214e-05, "loss": 0.6867, "num_input_tokens_seen": 22285584, "step": 38405 }, { "epoch": 5.720881739648496, "grad_norm": 2.011976957321167, "learning_rate": 4.4911314623964466e-05, "loss": 0.6403, "num_input_tokens_seen": 22288144, "step": 38410 }, { "epoch": 5.721626452189454, "grad_norm": 1.1318897008895874, "learning_rate": 4.490934952742829e-05, "loss": 0.6921, "num_input_tokens_seen": 22291056, "step": 38415 }, { "epoch": 5.722371164730414, "grad_norm": 0.6518115997314453, "learning_rate": 4.490738409454389e-05, "loss": 0.661, "num_input_tokens_seen": 22293808, "step": 38420 }, { "epoch": 5.723115877271374, "grad_norm": 1.1747126579284668, "learning_rate": 4.4905418325344475e-05, "loss": 0.6539, "num_input_tokens_seen": 22296688, "step": 38425 }, { "epoch": 5.7238605898123325, "grad_norm": 0.8257744312286377, "learning_rate": 4.490345221986324e-05, "loss": 0.4424, "num_input_tokens_seen": 22299184, "step": 38430 }, { "epoch": 5.724605302353291, "grad_norm": 1.1570922136306763, "learning_rate": 4.490148577813341e-05, "loss": 0.7697, "num_input_tokens_seen": 22302160, "step": 38435 }, { "epoch": 5.725350014894251, "grad_norm": 1.2843281030654907, "learning_rate": 4.489951900018821e-05, "loss": 0.6517, "num_input_tokens_seen": 22305520, "step": 38440 }, { "epoch": 5.72609472743521, "grad_norm": 1.1577130556106567, "learning_rate": 4.4897551886060866e-05, "loss": 0.7847, "num_input_tokens_seen": 22308656, "step": 38445 }, { "epoch": 5.726839439976169, "grad_norm": 1.0395574569702148, "learning_rate": 4.489558443578459e-05, "loss": 0.5475, "num_input_tokens_seen": 22311472, "step": 38450 }, { "epoch": 5.727584152517128, "grad_norm": 0.7572944164276123, "learning_rate": 4.4893616649392646e-05, "loss": 0.5871, "num_input_tokens_seen": 22314448, "step": 38455 }, { "epoch": 5.728328865058088, "grad_norm": 1.2986942529678345, "learning_rate": 4.4891648526918265e-05, "loss": 0.6258, "num_input_tokens_seen": 22317392, "step": 38460 }, { "epoch": 5.7290735775990465, "grad_norm": 1.3222306966781616, "learning_rate": 4.48896800683947e-05, "loss": 0.6148, "num_input_tokens_seen": 22320400, "step": 38465 }, { "epoch": 5.729818290140006, "grad_norm": 0.8929845690727234, "learning_rate": 4.48877112738552e-05, "loss": 0.6251, "num_input_tokens_seen": 22323376, "step": 38470 }, { "epoch": 5.730563002680965, "grad_norm": 1.7944694757461548, "learning_rate": 4.488574214333304e-05, "loss": 0.7598, "num_input_tokens_seen": 22326320, "step": 38475 }, { "epoch": 5.7313077152219245, "grad_norm": 1.3142379522323608, "learning_rate": 4.488377267686147e-05, "loss": 0.5917, "num_input_tokens_seen": 22328976, "step": 38480 }, { "epoch": 5.732052427762883, "grad_norm": 0.9333818554878235, "learning_rate": 4.488180287447378e-05, "loss": 0.6268, "num_input_tokens_seen": 22332080, "step": 38485 }, { "epoch": 5.732797140303843, "grad_norm": 1.3772281408309937, "learning_rate": 4.4879832736203224e-05, "loss": 0.6469, "num_input_tokens_seen": 22335024, "step": 38490 }, { "epoch": 5.733541852844802, "grad_norm": 1.0450025796890259, "learning_rate": 4.48778622620831e-05, "loss": 0.581, "num_input_tokens_seen": 22337712, "step": 38495 }, { "epoch": 5.734286565385761, "grad_norm": 1.0186011791229248, "learning_rate": 4.487589145214671e-05, "loss": 0.7701, "num_input_tokens_seen": 22340752, "step": 38500 }, { "epoch": 5.73503127792672, "grad_norm": 1.4456263780593872, "learning_rate": 4.487392030642733e-05, "loss": 0.5357, "num_input_tokens_seen": 22343312, "step": 38505 }, { "epoch": 5.73577599046768, "grad_norm": 0.638470470905304, "learning_rate": 4.487194882495826e-05, "loss": 0.6579, "num_input_tokens_seen": 22346544, "step": 38510 }, { "epoch": 5.7365207030086385, "grad_norm": 1.1114903688430786, "learning_rate": 4.486997700777281e-05, "loss": 0.6858, "num_input_tokens_seen": 22349392, "step": 38515 }, { "epoch": 5.737265415549598, "grad_norm": 0.6658192873001099, "learning_rate": 4.486800485490429e-05, "loss": 0.6717, "num_input_tokens_seen": 22352432, "step": 38520 }, { "epoch": 5.738010128090557, "grad_norm": 0.8615927696228027, "learning_rate": 4.4866032366386034e-05, "loss": 0.5894, "num_input_tokens_seen": 22355536, "step": 38525 }, { "epoch": 5.7387548406315165, "grad_norm": 1.639878749847412, "learning_rate": 4.486405954225135e-05, "loss": 0.5908, "num_input_tokens_seen": 22358384, "step": 38530 }, { "epoch": 5.739499553172475, "grad_norm": 0.9689438939094543, "learning_rate": 4.486208638253356e-05, "loss": 0.5443, "num_input_tokens_seen": 22361360, "step": 38535 }, { "epoch": 5.740244265713435, "grad_norm": 1.1790333986282349, "learning_rate": 4.486011288726601e-05, "loss": 0.6016, "num_input_tokens_seen": 22364048, "step": 38540 }, { "epoch": 5.740988978254394, "grad_norm": 1.0955675840377808, "learning_rate": 4.485813905648204e-05, "loss": 0.6812, "num_input_tokens_seen": 22366800, "step": 38545 }, { "epoch": 5.741733690795353, "grad_norm": 0.8759945034980774, "learning_rate": 4.485616489021499e-05, "loss": 0.7196, "num_input_tokens_seen": 22369552, "step": 38550 }, { "epoch": 5.742478403336312, "grad_norm": 0.9855355024337769, "learning_rate": 4.485419038849822e-05, "loss": 0.5413, "num_input_tokens_seen": 22372496, "step": 38555 }, { "epoch": 5.743223115877272, "grad_norm": 0.7998118996620178, "learning_rate": 4.485221555136508e-05, "loss": 0.6986, "num_input_tokens_seen": 22375376, "step": 38560 }, { "epoch": 5.7439678284182305, "grad_norm": 1.5717557668685913, "learning_rate": 4.485024037884894e-05, "loss": 0.5968, "num_input_tokens_seen": 22378384, "step": 38565 }, { "epoch": 5.74471254095919, "grad_norm": 1.387130856513977, "learning_rate": 4.484826487098316e-05, "loss": 0.8468, "num_input_tokens_seen": 22380912, "step": 38570 }, { "epoch": 5.745457253500149, "grad_norm": 0.941261351108551, "learning_rate": 4.484628902780111e-05, "loss": 0.7788, "num_input_tokens_seen": 22383888, "step": 38575 }, { "epoch": 5.746201966041108, "grad_norm": 1.2290624380111694, "learning_rate": 4.484431284933619e-05, "loss": 0.7053, "num_input_tokens_seen": 22386640, "step": 38580 }, { "epoch": 5.746946678582067, "grad_norm": 0.8581324219703674, "learning_rate": 4.484233633562176e-05, "loss": 0.6047, "num_input_tokens_seen": 22389552, "step": 38585 }, { "epoch": 5.747691391123027, "grad_norm": 1.0109401941299438, "learning_rate": 4.484035948669124e-05, "loss": 0.5937, "num_input_tokens_seen": 22392464, "step": 38590 }, { "epoch": 5.748436103663986, "grad_norm": 1.5357801914215088, "learning_rate": 4.4838382302577995e-05, "loss": 0.7437, "num_input_tokens_seen": 22395248, "step": 38595 }, { "epoch": 5.7491808162049445, "grad_norm": 0.8055275678634644, "learning_rate": 4.483640478331546e-05, "loss": 0.5757, "num_input_tokens_seen": 22398384, "step": 38600 }, { "epoch": 5.749925528745904, "grad_norm": 1.1671898365020752, "learning_rate": 4.483442692893702e-05, "loss": 0.5923, "num_input_tokens_seen": 22401072, "step": 38605 }, { "epoch": 5.750670241286863, "grad_norm": 0.7054986953735352, "learning_rate": 4.483244873947609e-05, "loss": 0.6327, "num_input_tokens_seen": 22403888, "step": 38610 }, { "epoch": 5.7514149538278225, "grad_norm": 0.8831521272659302, "learning_rate": 4.48304702149661e-05, "loss": 0.6764, "num_input_tokens_seen": 22406832, "step": 38615 }, { "epoch": 5.752159666368781, "grad_norm": 0.8210641741752625, "learning_rate": 4.482849135544048e-05, "loss": 0.6237, "num_input_tokens_seen": 22409616, "step": 38620 }, { "epoch": 5.752904378909741, "grad_norm": 0.7583151459693909, "learning_rate": 4.4826512160932636e-05, "loss": 0.5812, "num_input_tokens_seen": 22412656, "step": 38625 }, { "epoch": 5.7536490914507, "grad_norm": 1.1977522373199463, "learning_rate": 4.482453263147603e-05, "loss": 0.6225, "num_input_tokens_seen": 22415280, "step": 38630 }, { "epoch": 5.754393803991659, "grad_norm": 2.6686668395996094, "learning_rate": 4.4822552767104095e-05, "loss": 0.7641, "num_input_tokens_seen": 22417936, "step": 38635 }, { "epoch": 5.755138516532618, "grad_norm": 0.8928251266479492, "learning_rate": 4.482057256785027e-05, "loss": 0.7653, "num_input_tokens_seen": 22420880, "step": 38640 }, { "epoch": 5.755883229073578, "grad_norm": 1.1579821109771729, "learning_rate": 4.481859203374802e-05, "loss": 0.6628, "num_input_tokens_seen": 22423632, "step": 38645 }, { "epoch": 5.7566279416145365, "grad_norm": 0.9177306294441223, "learning_rate": 4.48166111648308e-05, "loss": 0.7846, "num_input_tokens_seen": 22426992, "step": 38650 }, { "epoch": 5.757372654155496, "grad_norm": 1.655297875404358, "learning_rate": 4.481462996113207e-05, "loss": 0.665, "num_input_tokens_seen": 22430288, "step": 38655 }, { "epoch": 5.758117366696455, "grad_norm": 0.8194665312767029, "learning_rate": 4.481264842268531e-05, "loss": 0.7352, "num_input_tokens_seen": 22433392, "step": 38660 }, { "epoch": 5.7588620792374146, "grad_norm": 0.8876010179519653, "learning_rate": 4.4810666549523997e-05, "loss": 0.6302, "num_input_tokens_seen": 22436368, "step": 38665 }, { "epoch": 5.759606791778373, "grad_norm": 0.9551488757133484, "learning_rate": 4.48086843416816e-05, "loss": 0.6032, "num_input_tokens_seen": 22439376, "step": 38670 }, { "epoch": 5.760351504319333, "grad_norm": 1.3200794458389282, "learning_rate": 4.480670179919162e-05, "loss": 0.7573, "num_input_tokens_seen": 22442544, "step": 38675 }, { "epoch": 5.761096216860292, "grad_norm": 1.0468617677688599, "learning_rate": 4.480471892208754e-05, "loss": 0.8628, "num_input_tokens_seen": 22445744, "step": 38680 }, { "epoch": 5.761840929401251, "grad_norm": 0.8623418807983398, "learning_rate": 4.480273571040285e-05, "loss": 0.5524, "num_input_tokens_seen": 22448592, "step": 38685 }, { "epoch": 5.76258564194221, "grad_norm": 0.725281834602356, "learning_rate": 4.480075216417109e-05, "loss": 0.5077, "num_input_tokens_seen": 22451536, "step": 38690 }, { "epoch": 5.76333035448317, "grad_norm": 0.8951221108436584, "learning_rate": 4.479876828342573e-05, "loss": 0.642, "num_input_tokens_seen": 22454736, "step": 38695 }, { "epoch": 5.7640750670241285, "grad_norm": 0.949734091758728, "learning_rate": 4.479678406820031e-05, "loss": 0.5928, "num_input_tokens_seen": 22457616, "step": 38700 }, { "epoch": 5.764819779565088, "grad_norm": 1.6706829071044922, "learning_rate": 4.479479951852834e-05, "loss": 0.703, "num_input_tokens_seen": 22460464, "step": 38705 }, { "epoch": 5.765564492106047, "grad_norm": 1.3361313343048096, "learning_rate": 4.479281463444335e-05, "loss": 0.5881, "num_input_tokens_seen": 22463376, "step": 38710 }, { "epoch": 5.766309204647007, "grad_norm": 0.9791535139083862, "learning_rate": 4.479082941597888e-05, "loss": 0.5273, "num_input_tokens_seen": 22466192, "step": 38715 }, { "epoch": 5.767053917187965, "grad_norm": 1.016387939453125, "learning_rate": 4.4788843863168455e-05, "loss": 0.6332, "num_input_tokens_seen": 22469520, "step": 38720 }, { "epoch": 5.767798629728925, "grad_norm": 0.7196943759918213, "learning_rate": 4.4786857976045625e-05, "loss": 0.49, "num_input_tokens_seen": 22472272, "step": 38725 }, { "epoch": 5.768543342269884, "grad_norm": 1.5734543800354004, "learning_rate": 4.4784871754643946e-05, "loss": 0.6981, "num_input_tokens_seen": 22475216, "step": 38730 }, { "epoch": 5.769288054810843, "grad_norm": 1.345015525817871, "learning_rate": 4.478288519899697e-05, "loss": 0.598, "num_input_tokens_seen": 22478000, "step": 38735 }, { "epoch": 5.770032767351802, "grad_norm": 1.3683934211730957, "learning_rate": 4.4780898309138245e-05, "loss": 0.7128, "num_input_tokens_seen": 22480912, "step": 38740 }, { "epoch": 5.770777479892761, "grad_norm": 1.4768043756484985, "learning_rate": 4.477891108510135e-05, "loss": 0.7449, "num_input_tokens_seen": 22483888, "step": 38745 }, { "epoch": 5.7715221924337206, "grad_norm": 0.9732615351676941, "learning_rate": 4.4776923526919855e-05, "loss": 0.7071, "num_input_tokens_seen": 22486640, "step": 38750 }, { "epoch": 5.77226690497468, "grad_norm": 1.5101853609085083, "learning_rate": 4.477493563462733e-05, "loss": 0.5929, "num_input_tokens_seen": 22489168, "step": 38755 }, { "epoch": 5.773011617515639, "grad_norm": 1.1241310834884644, "learning_rate": 4.477294740825738e-05, "loss": 0.5594, "num_input_tokens_seen": 22491888, "step": 38760 }, { "epoch": 5.773756330056598, "grad_norm": 1.133941650390625, "learning_rate": 4.477095884784358e-05, "loss": 0.6619, "num_input_tokens_seen": 22494704, "step": 38765 }, { "epoch": 5.774501042597557, "grad_norm": 0.8427591919898987, "learning_rate": 4.476896995341951e-05, "loss": 0.5523, "num_input_tokens_seen": 22497744, "step": 38770 }, { "epoch": 5.775245755138517, "grad_norm": 1.502974510192871, "learning_rate": 4.47669807250188e-05, "loss": 0.8118, "num_input_tokens_seen": 22500816, "step": 38775 }, { "epoch": 5.775990467679476, "grad_norm": 1.590453863143921, "learning_rate": 4.476499116267503e-05, "loss": 0.7303, "num_input_tokens_seen": 22503568, "step": 38780 }, { "epoch": 5.7767351802204345, "grad_norm": 1.955538034439087, "learning_rate": 4.476300126642183e-05, "loss": 0.5769, "num_input_tokens_seen": 22506608, "step": 38785 }, { "epoch": 5.777479892761394, "grad_norm": 1.1295771598815918, "learning_rate": 4.4761011036292804e-05, "loss": 0.6242, "num_input_tokens_seen": 22509360, "step": 38790 }, { "epoch": 5.778224605302353, "grad_norm": 0.9320940971374512, "learning_rate": 4.475902047232159e-05, "loss": 0.5433, "num_input_tokens_seen": 22512368, "step": 38795 }, { "epoch": 5.778969317843313, "grad_norm": 1.2001688480377197, "learning_rate": 4.4757029574541795e-05, "loss": 0.5745, "num_input_tokens_seen": 22515376, "step": 38800 }, { "epoch": 5.779714030384271, "grad_norm": 0.7212958931922913, "learning_rate": 4.475503834298707e-05, "loss": 0.4994, "num_input_tokens_seen": 22518608, "step": 38805 }, { "epoch": 5.780458742925231, "grad_norm": 1.1962578296661377, "learning_rate": 4.475304677769105e-05, "loss": 0.6457, "num_input_tokens_seen": 22521552, "step": 38810 }, { "epoch": 5.78120345546619, "grad_norm": 1.0803903341293335, "learning_rate": 4.475105487868739e-05, "loss": 0.6314, "num_input_tokens_seen": 22524464, "step": 38815 }, { "epoch": 5.781948168007149, "grad_norm": 0.9845152497291565, "learning_rate": 4.474906264600972e-05, "loss": 0.5163, "num_input_tokens_seen": 22527248, "step": 38820 }, { "epoch": 5.782692880548108, "grad_norm": 2.5408236980438232, "learning_rate": 4.474707007969171e-05, "loss": 0.6746, "num_input_tokens_seen": 22530192, "step": 38825 }, { "epoch": 5.783437593089068, "grad_norm": 0.7982558608055115, "learning_rate": 4.4745077179767026e-05, "loss": 0.5032, "num_input_tokens_seen": 22533072, "step": 38830 }, { "epoch": 5.7841823056300266, "grad_norm": 1.1705222129821777, "learning_rate": 4.4743083946269324e-05, "loss": 0.7674, "num_input_tokens_seen": 22535984, "step": 38835 }, { "epoch": 5.784927018170986, "grad_norm": 1.0522873401641846, "learning_rate": 4.47410903792323e-05, "loss": 0.5346, "num_input_tokens_seen": 22538768, "step": 38840 }, { "epoch": 5.785671730711945, "grad_norm": 0.9675180315971375, "learning_rate": 4.47390964786896e-05, "loss": 0.6757, "num_input_tokens_seen": 22541968, "step": 38845 }, { "epoch": 5.786416443252905, "grad_norm": 0.9250494837760925, "learning_rate": 4.4737102244674934e-05, "loss": 0.602, "num_input_tokens_seen": 22544944, "step": 38850 }, { "epoch": 5.787161155793863, "grad_norm": 0.9650985598564148, "learning_rate": 4.473510767722199e-05, "loss": 0.4714, "num_input_tokens_seen": 22547984, "step": 38855 }, { "epoch": 5.787905868334823, "grad_norm": 1.3245819807052612, "learning_rate": 4.473311277636445e-05, "loss": 0.7236, "num_input_tokens_seen": 22550992, "step": 38860 }, { "epoch": 5.788650580875782, "grad_norm": 0.8874348998069763, "learning_rate": 4.4731117542136034e-05, "loss": 0.6397, "num_input_tokens_seen": 22553936, "step": 38865 }, { "epoch": 5.789395293416741, "grad_norm": 1.2106367349624634, "learning_rate": 4.472912197457044e-05, "loss": 0.5795, "num_input_tokens_seen": 22556624, "step": 38870 }, { "epoch": 5.7901400059577, "grad_norm": 1.2807084321975708, "learning_rate": 4.472712607370137e-05, "loss": 0.7771, "num_input_tokens_seen": 22559600, "step": 38875 }, { "epoch": 5.79088471849866, "grad_norm": 1.4342135190963745, "learning_rate": 4.472512983956257e-05, "loss": 0.7843, "num_input_tokens_seen": 22562512, "step": 38880 }, { "epoch": 5.791629431039619, "grad_norm": 1.2005178928375244, "learning_rate": 4.4723133272187745e-05, "loss": 0.6445, "num_input_tokens_seen": 22565680, "step": 38885 }, { "epoch": 5.792374143580578, "grad_norm": 1.0848554372787476, "learning_rate": 4.4721136371610626e-05, "loss": 0.5761, "num_input_tokens_seen": 22568528, "step": 38890 }, { "epoch": 5.793118856121537, "grad_norm": 1.1998296976089478, "learning_rate": 4.4719139137864956e-05, "loss": 0.6443, "num_input_tokens_seen": 22571472, "step": 38895 }, { "epoch": 5.793863568662497, "grad_norm": 1.1092896461486816, "learning_rate": 4.4717141570984474e-05, "loss": 0.6133, "num_input_tokens_seen": 22574320, "step": 38900 }, { "epoch": 5.794608281203455, "grad_norm": 1.3655917644500732, "learning_rate": 4.471514367100292e-05, "loss": 0.6069, "num_input_tokens_seen": 22577232, "step": 38905 }, { "epoch": 5.795352993744415, "grad_norm": 1.3627053499221802, "learning_rate": 4.471314543795405e-05, "loss": 0.7042, "num_input_tokens_seen": 22580560, "step": 38910 }, { "epoch": 5.796097706285374, "grad_norm": 1.216008186340332, "learning_rate": 4.4711146871871625e-05, "loss": 0.6001, "num_input_tokens_seen": 22583440, "step": 38915 }, { "epoch": 5.796842418826333, "grad_norm": 1.1211020946502686, "learning_rate": 4.4709147972789405e-05, "loss": 0.6186, "num_input_tokens_seen": 22586064, "step": 38920 }, { "epoch": 5.797587131367292, "grad_norm": 0.9452603459358215, "learning_rate": 4.470714874074117e-05, "loss": 0.6369, "num_input_tokens_seen": 22588816, "step": 38925 }, { "epoch": 5.798331843908251, "grad_norm": 2.0872039794921875, "learning_rate": 4.470514917576067e-05, "loss": 0.7574, "num_input_tokens_seen": 22591472, "step": 38930 }, { "epoch": 5.799076556449211, "grad_norm": 1.0678166151046753, "learning_rate": 4.470314927788172e-05, "loss": 0.6963, "num_input_tokens_seen": 22594352, "step": 38935 }, { "epoch": 5.79982126899017, "grad_norm": 1.3965888023376465, "learning_rate": 4.470114904713808e-05, "loss": 0.8279, "num_input_tokens_seen": 22597136, "step": 38940 }, { "epoch": 5.800565981531129, "grad_norm": 1.1165803670883179, "learning_rate": 4.4699148483563546e-05, "loss": 0.6123, "num_input_tokens_seen": 22600144, "step": 38945 }, { "epoch": 5.801310694072088, "grad_norm": 1.4053115844726562, "learning_rate": 4.469714758719192e-05, "loss": 0.7628, "num_input_tokens_seen": 22603056, "step": 38950 }, { "epoch": 5.802055406613047, "grad_norm": 1.6101685762405396, "learning_rate": 4.469514635805702e-05, "loss": 0.6999, "num_input_tokens_seen": 22605936, "step": 38955 }, { "epoch": 5.802800119154006, "grad_norm": 1.504713535308838, "learning_rate": 4.469314479619262e-05, "loss": 0.5434, "num_input_tokens_seen": 22608848, "step": 38960 }, { "epoch": 5.803544831694966, "grad_norm": 0.9143543839454651, "learning_rate": 4.469114290163257e-05, "loss": 0.5822, "num_input_tokens_seen": 22611888, "step": 38965 }, { "epoch": 5.804289544235925, "grad_norm": 1.1031948328018188, "learning_rate": 4.468914067441066e-05, "loss": 0.6999, "num_input_tokens_seen": 22615056, "step": 38970 }, { "epoch": 5.805034256776884, "grad_norm": 1.3505486249923706, "learning_rate": 4.468713811456074e-05, "loss": 0.6948, "num_input_tokens_seen": 22618064, "step": 38975 }, { "epoch": 5.805778969317843, "grad_norm": 1.5317517518997192, "learning_rate": 4.468513522211662e-05, "loss": 0.7258, "num_input_tokens_seen": 22621072, "step": 38980 }, { "epoch": 5.806523681858803, "grad_norm": 1.293656587600708, "learning_rate": 4.468313199711216e-05, "loss": 0.7423, "num_input_tokens_seen": 22624080, "step": 38985 }, { "epoch": 5.807268394399761, "grad_norm": 0.9680408835411072, "learning_rate": 4.468112843958118e-05, "loss": 0.7004, "num_input_tokens_seen": 22628304, "step": 38990 }, { "epoch": 5.808013106940721, "grad_norm": 0.9940435290336609, "learning_rate": 4.467912454955755e-05, "loss": 0.6347, "num_input_tokens_seen": 22631024, "step": 38995 }, { "epoch": 5.80875781948168, "grad_norm": 1.1581884622573853, "learning_rate": 4.46771203270751e-05, "loss": 0.7076, "num_input_tokens_seen": 22634288, "step": 39000 }, { "epoch": 5.809502532022639, "grad_norm": 2.3931972980499268, "learning_rate": 4.4675115772167706e-05, "loss": 0.5245, "num_input_tokens_seen": 22637264, "step": 39005 }, { "epoch": 5.810247244563598, "grad_norm": 1.5498921871185303, "learning_rate": 4.467311088486922e-05, "loss": 0.6998, "num_input_tokens_seen": 22640048, "step": 39010 }, { "epoch": 5.810991957104558, "grad_norm": 1.8050600290298462, "learning_rate": 4.467110566521353e-05, "loss": 0.6706, "num_input_tokens_seen": 22642800, "step": 39015 }, { "epoch": 5.811736669645517, "grad_norm": 1.3726844787597656, "learning_rate": 4.4669100113234504e-05, "loss": 0.6392, "num_input_tokens_seen": 22645648, "step": 39020 }, { "epoch": 5.812481382186476, "grad_norm": 1.2509312629699707, "learning_rate": 4.466709422896601e-05, "loss": 0.6732, "num_input_tokens_seen": 22648752, "step": 39025 }, { "epoch": 5.813226094727435, "grad_norm": 1.3302981853485107, "learning_rate": 4.466508801244196e-05, "loss": 0.6088, "num_input_tokens_seen": 22651664, "step": 39030 }, { "epoch": 5.813970807268395, "grad_norm": 0.9743152260780334, "learning_rate": 4.466308146369623e-05, "loss": 0.5484, "num_input_tokens_seen": 22654672, "step": 39035 }, { "epoch": 5.814715519809353, "grad_norm": 1.8428595066070557, "learning_rate": 4.466107458276273e-05, "loss": 0.6482, "num_input_tokens_seen": 22657648, "step": 39040 }, { "epoch": 5.815460232350313, "grad_norm": 1.0541129112243652, "learning_rate": 4.465906736967534e-05, "loss": 0.7272, "num_input_tokens_seen": 22660592, "step": 39045 }, { "epoch": 5.816204944891272, "grad_norm": 0.8677961230278015, "learning_rate": 4.465705982446801e-05, "loss": 0.5155, "num_input_tokens_seen": 22663280, "step": 39050 }, { "epoch": 5.8169496574322315, "grad_norm": 0.6794248223304749, "learning_rate": 4.465505194717462e-05, "loss": 0.5642, "num_input_tokens_seen": 22666416, "step": 39055 }, { "epoch": 5.81769436997319, "grad_norm": 1.3045542240142822, "learning_rate": 4.46530437378291e-05, "loss": 0.779, "num_input_tokens_seen": 22669296, "step": 39060 }, { "epoch": 5.81843908251415, "grad_norm": 1.3320249319076538, "learning_rate": 4.465103519646539e-05, "loss": 0.6757, "num_input_tokens_seen": 22672208, "step": 39065 }, { "epoch": 5.819183795055109, "grad_norm": 0.9074057936668396, "learning_rate": 4.4649026323117404e-05, "loss": 0.6694, "num_input_tokens_seen": 22675120, "step": 39070 }, { "epoch": 5.819928507596068, "grad_norm": 1.0914181470870972, "learning_rate": 4.464701711781909e-05, "loss": 0.6541, "num_input_tokens_seen": 22678064, "step": 39075 }, { "epoch": 5.820673220137027, "grad_norm": 1.9222438335418701, "learning_rate": 4.46450075806044e-05, "loss": 0.8215, "num_input_tokens_seen": 22680656, "step": 39080 }, { "epoch": 5.821417932677987, "grad_norm": 1.6237168312072754, "learning_rate": 4.464299771150727e-05, "loss": 0.7077, "num_input_tokens_seen": 22683632, "step": 39085 }, { "epoch": 5.822162645218945, "grad_norm": 1.4141989946365356, "learning_rate": 4.464098751056165e-05, "loss": 0.6323, "num_input_tokens_seen": 22686576, "step": 39090 }, { "epoch": 5.822907357759904, "grad_norm": 1.5139800310134888, "learning_rate": 4.463897697780152e-05, "loss": 0.6846, "num_input_tokens_seen": 22689168, "step": 39095 }, { "epoch": 5.823652070300864, "grad_norm": 1.2230242490768433, "learning_rate": 4.463696611326082e-05, "loss": 0.7526, "num_input_tokens_seen": 22691856, "step": 39100 }, { "epoch": 5.8243967828418235, "grad_norm": 0.8726202249526978, "learning_rate": 4.4634954916973545e-05, "loss": 0.5169, "num_input_tokens_seen": 22694800, "step": 39105 }, { "epoch": 5.825141495382782, "grad_norm": 1.9609650373458862, "learning_rate": 4.463294338897366e-05, "loss": 0.8844, "num_input_tokens_seen": 22697808, "step": 39110 }, { "epoch": 5.825886207923741, "grad_norm": 1.167474627494812, "learning_rate": 4.463093152929515e-05, "loss": 0.6081, "num_input_tokens_seen": 22700848, "step": 39115 }, { "epoch": 5.826630920464701, "grad_norm": 0.7537720203399658, "learning_rate": 4.4628919337972e-05, "loss": 0.5032, "num_input_tokens_seen": 22703760, "step": 39120 }, { "epoch": 5.82737563300566, "grad_norm": 1.0263351202011108, "learning_rate": 4.462690681503822e-05, "loss": 0.653, "num_input_tokens_seen": 22706544, "step": 39125 }, { "epoch": 5.828120345546619, "grad_norm": 0.9091975092887878, "learning_rate": 4.462489396052779e-05, "loss": 0.604, "num_input_tokens_seen": 22709712, "step": 39130 }, { "epoch": 5.828865058087578, "grad_norm": 0.810343325138092, "learning_rate": 4.462288077447472e-05, "loss": 0.4752, "num_input_tokens_seen": 22712464, "step": 39135 }, { "epoch": 5.8296097706285375, "grad_norm": 1.1048789024353027, "learning_rate": 4.462086725691302e-05, "loss": 0.5204, "num_input_tokens_seen": 22715472, "step": 39140 }, { "epoch": 5.830354483169496, "grad_norm": 1.4287689924240112, "learning_rate": 4.4618853407876714e-05, "loss": 0.6932, "num_input_tokens_seen": 22718448, "step": 39145 }, { "epoch": 5.831099195710456, "grad_norm": 1.2510112524032593, "learning_rate": 4.461683922739982e-05, "loss": 0.5889, "num_input_tokens_seen": 22721104, "step": 39150 }, { "epoch": 5.831843908251415, "grad_norm": 0.9792107939720154, "learning_rate": 4.461482471551637e-05, "loss": 0.6067, "num_input_tokens_seen": 22724304, "step": 39155 }, { "epoch": 5.832588620792374, "grad_norm": 1.123597264289856, "learning_rate": 4.4612809872260386e-05, "loss": 0.579, "num_input_tokens_seen": 22727088, "step": 39160 }, { "epoch": 5.833333333333333, "grad_norm": 0.9641755223274231, "learning_rate": 4.461079469766592e-05, "loss": 0.6059, "num_input_tokens_seen": 22730000, "step": 39165 }, { "epoch": 5.834078045874293, "grad_norm": 0.7754077911376953, "learning_rate": 4.4608779191766994e-05, "loss": 0.7759, "num_input_tokens_seen": 22732912, "step": 39170 }, { "epoch": 5.834822758415251, "grad_norm": 1.1069564819335938, "learning_rate": 4.460676335459768e-05, "loss": 0.8059, "num_input_tokens_seen": 22735760, "step": 39175 }, { "epoch": 5.835567470956211, "grad_norm": 1.2936865091323853, "learning_rate": 4.460474718619203e-05, "loss": 0.5448, "num_input_tokens_seen": 22738960, "step": 39180 }, { "epoch": 5.83631218349717, "grad_norm": 1.393706202507019, "learning_rate": 4.4602730686584105e-05, "loss": 0.5331, "num_input_tokens_seen": 22741616, "step": 39185 }, { "epoch": 5.8370568960381295, "grad_norm": 1.5033252239227295, "learning_rate": 4.460071385580796e-05, "loss": 0.6717, "num_input_tokens_seen": 22744560, "step": 39190 }, { "epoch": 5.837801608579088, "grad_norm": 1.3760335445404053, "learning_rate": 4.459869669389768e-05, "loss": 0.7712, "num_input_tokens_seen": 22747536, "step": 39195 }, { "epoch": 5.838546321120048, "grad_norm": 1.14365816116333, "learning_rate": 4.459667920088734e-05, "loss": 0.6299, "num_input_tokens_seen": 22750480, "step": 39200 }, { "epoch": 5.839291033661007, "grad_norm": 1.2860982418060303, "learning_rate": 4.459466137681102e-05, "loss": 0.5082, "num_input_tokens_seen": 22753264, "step": 39205 }, { "epoch": 5.840035746201966, "grad_norm": 1.0427922010421753, "learning_rate": 4.4592643221702805e-05, "loss": 0.5693, "num_input_tokens_seen": 22756144, "step": 39210 }, { "epoch": 5.840780458742925, "grad_norm": 0.7503942251205444, "learning_rate": 4.459062473559681e-05, "loss": 0.6724, "num_input_tokens_seen": 22758896, "step": 39215 }, { "epoch": 5.841525171283885, "grad_norm": 0.9306414723396301, "learning_rate": 4.4588605918527104e-05, "loss": 0.6336, "num_input_tokens_seen": 22761680, "step": 39220 }, { "epoch": 5.8422698838248435, "grad_norm": 1.2353861331939697, "learning_rate": 4.458658677052782e-05, "loss": 0.6525, "num_input_tokens_seen": 22764496, "step": 39225 }, { "epoch": 5.843014596365803, "grad_norm": 1.1317877769470215, "learning_rate": 4.458456729163306e-05, "loss": 0.7426, "num_input_tokens_seen": 22767184, "step": 39230 }, { "epoch": 5.843759308906762, "grad_norm": 1.2035620212554932, "learning_rate": 4.458254748187693e-05, "loss": 0.6492, "num_input_tokens_seen": 22769968, "step": 39235 }, { "epoch": 5.8445040214477215, "grad_norm": 0.8270658254623413, "learning_rate": 4.458052734129358e-05, "loss": 0.5459, "num_input_tokens_seen": 22772592, "step": 39240 }, { "epoch": 5.84524873398868, "grad_norm": 1.4120914936065674, "learning_rate": 4.457850686991711e-05, "loss": 0.6048, "num_input_tokens_seen": 22775504, "step": 39245 }, { "epoch": 5.84599344652964, "grad_norm": 1.8474006652832031, "learning_rate": 4.4576486067781675e-05, "loss": 0.684, "num_input_tokens_seen": 22778544, "step": 39250 }, { "epoch": 5.846738159070599, "grad_norm": 2.7988107204437256, "learning_rate": 4.45744649349214e-05, "loss": 0.6202, "num_input_tokens_seen": 22781456, "step": 39255 }, { "epoch": 5.847482871611557, "grad_norm": 0.9971069097518921, "learning_rate": 4.457244347137043e-05, "loss": 0.6799, "num_input_tokens_seen": 22784304, "step": 39260 }, { "epoch": 5.848227584152517, "grad_norm": 0.8980727195739746, "learning_rate": 4.457042167716292e-05, "loss": 0.5856, "num_input_tokens_seen": 22787056, "step": 39265 }, { "epoch": 5.848972296693477, "grad_norm": 1.1085387468338013, "learning_rate": 4.456839955233303e-05, "loss": 0.5681, "num_input_tokens_seen": 22789744, "step": 39270 }, { "epoch": 5.8497170092344355, "grad_norm": 1.2341055870056152, "learning_rate": 4.456637709691491e-05, "loss": 0.5426, "num_input_tokens_seen": 22792496, "step": 39275 }, { "epoch": 5.850461721775394, "grad_norm": 1.5335090160369873, "learning_rate": 4.456435431094275e-05, "loss": 0.4996, "num_input_tokens_seen": 22795312, "step": 39280 }, { "epoch": 5.851206434316354, "grad_norm": 2.0872409343719482, "learning_rate": 4.45623311944507e-05, "loss": 0.7573, "num_input_tokens_seen": 22798544, "step": 39285 }, { "epoch": 5.8519511468573135, "grad_norm": 0.7438521385192871, "learning_rate": 4.4560307747472945e-05, "loss": 0.5972, "num_input_tokens_seen": 22801104, "step": 39290 }, { "epoch": 5.852695859398272, "grad_norm": 1.1131383180618286, "learning_rate": 4.4558283970043676e-05, "loss": 0.8144, "num_input_tokens_seen": 22804144, "step": 39295 }, { "epoch": 5.853440571939231, "grad_norm": 0.8462159037590027, "learning_rate": 4.4556259862197067e-05, "loss": 0.7494, "num_input_tokens_seen": 22807152, "step": 39300 }, { "epoch": 5.854185284480191, "grad_norm": 1.0522661209106445, "learning_rate": 4.4554235423967336e-05, "loss": 0.5404, "num_input_tokens_seen": 22810032, "step": 39305 }, { "epoch": 5.8549299970211495, "grad_norm": 0.9255536198616028, "learning_rate": 4.4552210655388664e-05, "loss": 0.5069, "num_input_tokens_seen": 22812752, "step": 39310 }, { "epoch": 5.855674709562109, "grad_norm": 0.9412525296211243, "learning_rate": 4.455018555649527e-05, "loss": 0.6094, "num_input_tokens_seen": 22815408, "step": 39315 }, { "epoch": 5.856419422103068, "grad_norm": 1.2883594036102295, "learning_rate": 4.4548160127321356e-05, "loss": 0.5097, "num_input_tokens_seen": 22818704, "step": 39320 }, { "epoch": 5.8571641346440275, "grad_norm": 2.3550174236297607, "learning_rate": 4.454613436790115e-05, "loss": 0.8135, "num_input_tokens_seen": 22821680, "step": 39325 }, { "epoch": 5.857908847184986, "grad_norm": 1.231828212738037, "learning_rate": 4.454410827826887e-05, "loss": 0.6352, "num_input_tokens_seen": 22824560, "step": 39330 }, { "epoch": 5.858653559725946, "grad_norm": 1.4864130020141602, "learning_rate": 4.454208185845874e-05, "loss": 0.6776, "num_input_tokens_seen": 22827344, "step": 39335 }, { "epoch": 5.859398272266905, "grad_norm": 0.951582670211792, "learning_rate": 4.4540055108504996e-05, "loss": 0.4891, "num_input_tokens_seen": 22830032, "step": 39340 }, { "epoch": 5.860142984807864, "grad_norm": 1.1164000034332275, "learning_rate": 4.4538028028441885e-05, "loss": 0.7651, "num_input_tokens_seen": 22833264, "step": 39345 }, { "epoch": 5.860887697348823, "grad_norm": 0.8412945866584778, "learning_rate": 4.453600061830365e-05, "loss": 0.5507, "num_input_tokens_seen": 22836592, "step": 39350 }, { "epoch": 5.861632409889783, "grad_norm": 0.7414585947990417, "learning_rate": 4.453397287812453e-05, "loss": 0.6164, "num_input_tokens_seen": 22839664, "step": 39355 }, { "epoch": 5.8623771224307415, "grad_norm": 1.8148868083953857, "learning_rate": 4.4531944807938806e-05, "loss": 0.5634, "num_input_tokens_seen": 22842480, "step": 39360 }, { "epoch": 5.863121834971701, "grad_norm": 0.9508452415466309, "learning_rate": 4.4529916407780715e-05, "loss": 0.7093, "num_input_tokens_seen": 22845520, "step": 39365 }, { "epoch": 5.86386654751266, "grad_norm": 0.9497473835945129, "learning_rate": 4.452788767768454e-05, "loss": 0.5764, "num_input_tokens_seen": 22848592, "step": 39370 }, { "epoch": 5.8646112600536195, "grad_norm": 0.5698069334030151, "learning_rate": 4.4525858617684545e-05, "loss": 0.7007, "num_input_tokens_seen": 22851088, "step": 39375 }, { "epoch": 5.865355972594578, "grad_norm": 1.1078394651412964, "learning_rate": 4.452382922781503e-05, "loss": 0.737, "num_input_tokens_seen": 22853840, "step": 39380 }, { "epoch": 5.866100685135538, "grad_norm": 1.1359643936157227, "learning_rate": 4.4521799508110245e-05, "loss": 0.5647, "num_input_tokens_seen": 22856816, "step": 39385 }, { "epoch": 5.866845397676497, "grad_norm": 1.3449610471725464, "learning_rate": 4.4519769458604504e-05, "loss": 0.4085, "num_input_tokens_seen": 22859632, "step": 39390 }, { "epoch": 5.867590110217456, "grad_norm": 1.8016506433486938, "learning_rate": 4.45177390793321e-05, "loss": 0.7512, "num_input_tokens_seen": 22862320, "step": 39395 }, { "epoch": 5.868334822758415, "grad_norm": 2.367344617843628, "learning_rate": 4.451570837032733e-05, "loss": 0.7816, "num_input_tokens_seen": 22865008, "step": 39400 }, { "epoch": 5.869079535299375, "grad_norm": 2.097234010696411, "learning_rate": 4.45136773316245e-05, "loss": 0.6073, "num_input_tokens_seen": 22867856, "step": 39405 }, { "epoch": 5.8698242478403335, "grad_norm": 1.521855354309082, "learning_rate": 4.451164596325793e-05, "loss": 0.5164, "num_input_tokens_seen": 22871888, "step": 39410 }, { "epoch": 5.870568960381293, "grad_norm": 2.2477478981018066, "learning_rate": 4.450961426526192e-05, "loss": 0.8099, "num_input_tokens_seen": 22874608, "step": 39415 }, { "epoch": 5.871313672922252, "grad_norm": 0.7707133889198303, "learning_rate": 4.450758223767082e-05, "loss": 0.6505, "num_input_tokens_seen": 22877520, "step": 39420 }, { "epoch": 5.872058385463212, "grad_norm": 1.0401493310928345, "learning_rate": 4.4505549880518935e-05, "loss": 0.5191, "num_input_tokens_seen": 22880496, "step": 39425 }, { "epoch": 5.87280309800417, "grad_norm": 0.9872366189956665, "learning_rate": 4.4503517193840615e-05, "loss": 0.5992, "num_input_tokens_seen": 22883056, "step": 39430 }, { "epoch": 5.87354781054513, "grad_norm": 2.064535617828369, "learning_rate": 4.4501484177670186e-05, "loss": 0.6524, "num_input_tokens_seen": 22886160, "step": 39435 }, { "epoch": 5.874292523086089, "grad_norm": 1.3170171976089478, "learning_rate": 4.449945083204201e-05, "loss": 0.7289, "num_input_tokens_seen": 22889264, "step": 39440 }, { "epoch": 5.8750372356270475, "grad_norm": 1.108931064605713, "learning_rate": 4.4497417156990427e-05, "loss": 0.6438, "num_input_tokens_seen": 22892400, "step": 39445 }, { "epoch": 5.875781948168007, "grad_norm": 0.8312886953353882, "learning_rate": 4.44953831525498e-05, "loss": 0.5802, "num_input_tokens_seen": 22895280, "step": 39450 }, { "epoch": 5.876526660708967, "grad_norm": 1.0955003499984741, "learning_rate": 4.449334881875449e-05, "loss": 0.8007, "num_input_tokens_seen": 22898384, "step": 39455 }, { "epoch": 5.8772713732499255, "grad_norm": 1.4809989929199219, "learning_rate": 4.4491314155638865e-05, "loss": 0.7632, "num_input_tokens_seen": 22901232, "step": 39460 }, { "epoch": 5.878016085790884, "grad_norm": 1.0799795389175415, "learning_rate": 4.448927916323729e-05, "loss": 0.694, "num_input_tokens_seen": 22904304, "step": 39465 }, { "epoch": 5.878760798331844, "grad_norm": 1.9165263175964355, "learning_rate": 4.448724384158416e-05, "loss": 0.6799, "num_input_tokens_seen": 22907184, "step": 39470 }, { "epoch": 5.879505510872804, "grad_norm": 0.978399932384491, "learning_rate": 4.4485208190713846e-05, "loss": 0.7461, "num_input_tokens_seen": 22909808, "step": 39475 }, { "epoch": 5.880250223413762, "grad_norm": 1.5572293996810913, "learning_rate": 4.448317221066074e-05, "loss": 0.5082, "num_input_tokens_seen": 22912464, "step": 39480 }, { "epoch": 5.880994935954721, "grad_norm": 0.6274295449256897, "learning_rate": 4.4481135901459245e-05, "loss": 0.6611, "num_input_tokens_seen": 22915280, "step": 39485 }, { "epoch": 5.881739648495681, "grad_norm": 1.4778088331222534, "learning_rate": 4.4479099263143765e-05, "loss": 0.6328, "num_input_tokens_seen": 22918160, "step": 39490 }, { "epoch": 5.8824843610366395, "grad_norm": 1.5309576988220215, "learning_rate": 4.447706229574869e-05, "loss": 0.6778, "num_input_tokens_seen": 22921008, "step": 39495 }, { "epoch": 5.883229073577599, "grad_norm": 1.1240766048431396, "learning_rate": 4.4475024999308454e-05, "loss": 0.9042, "num_input_tokens_seen": 22923504, "step": 39500 }, { "epoch": 5.883973786118558, "grad_norm": 1.0543714761734009, "learning_rate": 4.4472987373857456e-05, "loss": 0.5866, "num_input_tokens_seen": 22926416, "step": 39505 }, { "epoch": 5.884718498659518, "grad_norm": 1.166221022605896, "learning_rate": 4.447094941943013e-05, "loss": 0.7738, "num_input_tokens_seen": 22929168, "step": 39510 }, { "epoch": 5.885463211200476, "grad_norm": 0.7831113338470459, "learning_rate": 4.44689111360609e-05, "loss": 0.606, "num_input_tokens_seen": 22931952, "step": 39515 }, { "epoch": 5.886207923741436, "grad_norm": 0.7077429890632629, "learning_rate": 4.446687252378421e-05, "loss": 0.7117, "num_input_tokens_seen": 22935088, "step": 39520 }, { "epoch": 5.886952636282395, "grad_norm": 0.8048246502876282, "learning_rate": 4.446483358263449e-05, "loss": 0.7071, "num_input_tokens_seen": 22937872, "step": 39525 }, { "epoch": 5.887697348823354, "grad_norm": 0.9469480514526367, "learning_rate": 4.44627943126462e-05, "loss": 0.6802, "num_input_tokens_seen": 22940784, "step": 39530 }, { "epoch": 5.888442061364313, "grad_norm": 0.9744033217430115, "learning_rate": 4.446075471385376e-05, "loss": 0.6254, "num_input_tokens_seen": 22943664, "step": 39535 }, { "epoch": 5.889186773905273, "grad_norm": 1.442168116569519, "learning_rate": 4.4458714786291666e-05, "loss": 0.523, "num_input_tokens_seen": 22946320, "step": 39540 }, { "epoch": 5.8899314864462315, "grad_norm": 0.9053953289985657, "learning_rate": 4.4456674529994356e-05, "loss": 0.5071, "num_input_tokens_seen": 22949648, "step": 39545 }, { "epoch": 5.890676198987191, "grad_norm": 1.2040367126464844, "learning_rate": 4.44546339449963e-05, "loss": 0.7181, "num_input_tokens_seen": 22952560, "step": 39550 }, { "epoch": 5.89142091152815, "grad_norm": 0.7323303818702698, "learning_rate": 4.445259303133198e-05, "loss": 0.6466, "num_input_tokens_seen": 22955408, "step": 39555 }, { "epoch": 5.89216562406911, "grad_norm": 0.7291958332061768, "learning_rate": 4.445055178903588e-05, "loss": 0.601, "num_input_tokens_seen": 22958576, "step": 39560 }, { "epoch": 5.892910336610068, "grad_norm": 1.0537141561508179, "learning_rate": 4.444851021814247e-05, "loss": 0.7819, "num_input_tokens_seen": 22961616, "step": 39565 }, { "epoch": 5.893655049151028, "grad_norm": 1.5419303178787231, "learning_rate": 4.444646831868624e-05, "loss": 0.6094, "num_input_tokens_seen": 22964528, "step": 39570 }, { "epoch": 5.894399761691987, "grad_norm": 1.1773239374160767, "learning_rate": 4.44444260907017e-05, "loss": 0.6485, "num_input_tokens_seen": 22967344, "step": 39575 }, { "epoch": 5.895144474232946, "grad_norm": 1.0490777492523193, "learning_rate": 4.444238353422334e-05, "loss": 0.4493, "num_input_tokens_seen": 22970416, "step": 39580 }, { "epoch": 5.895889186773905, "grad_norm": 0.8718574643135071, "learning_rate": 4.444034064928567e-05, "loss": 0.6425, "num_input_tokens_seen": 22973296, "step": 39585 }, { "epoch": 5.896633899314865, "grad_norm": 0.8567020893096924, "learning_rate": 4.443829743592321e-05, "loss": 0.6592, "num_input_tokens_seen": 22976272, "step": 39590 }, { "epoch": 5.897378611855824, "grad_norm": 0.9927030205726624, "learning_rate": 4.4436253894170464e-05, "loss": 0.5589, "num_input_tokens_seen": 22978992, "step": 39595 }, { "epoch": 5.898123324396783, "grad_norm": 0.9405168294906616, "learning_rate": 4.4434210024061966e-05, "loss": 0.6393, "num_input_tokens_seen": 22981840, "step": 39600 }, { "epoch": 5.898868036937742, "grad_norm": 1.133068561553955, "learning_rate": 4.443216582563224e-05, "loss": 0.6824, "num_input_tokens_seen": 22984752, "step": 39605 }, { "epoch": 5.899612749478701, "grad_norm": 1.085518717765808, "learning_rate": 4.443012129891583e-05, "loss": 0.5469, "num_input_tokens_seen": 22988048, "step": 39610 }, { "epoch": 5.90035746201966, "grad_norm": 0.8906620740890503, "learning_rate": 4.442807644394725e-05, "loss": 0.6189, "num_input_tokens_seen": 22990864, "step": 39615 }, { "epoch": 5.90110217456062, "grad_norm": 0.9878780245780945, "learning_rate": 4.442603126076108e-05, "loss": 0.5779, "num_input_tokens_seen": 22993584, "step": 39620 }, { "epoch": 5.901846887101579, "grad_norm": 0.8761065006256104, "learning_rate": 4.442398574939185e-05, "loss": 0.6021, "num_input_tokens_seen": 22996592, "step": 39625 }, { "epoch": 5.9025915996425375, "grad_norm": 0.932133138179779, "learning_rate": 4.442193990987412e-05, "loss": 0.7994, "num_input_tokens_seen": 22999888, "step": 39630 }, { "epoch": 5.903336312183497, "grad_norm": 1.2794255018234253, "learning_rate": 4.441989374224246e-05, "loss": 0.644, "num_input_tokens_seen": 23002736, "step": 39635 }, { "epoch": 5.904081024724457, "grad_norm": 0.9842860102653503, "learning_rate": 4.4417847246531435e-05, "loss": 0.8066, "num_input_tokens_seen": 23005392, "step": 39640 }, { "epoch": 5.904825737265416, "grad_norm": 1.264207124710083, "learning_rate": 4.4415800422775614e-05, "loss": 0.7021, "num_input_tokens_seen": 23008304, "step": 39645 }, { "epoch": 5.905570449806374, "grad_norm": 1.2876180410385132, "learning_rate": 4.441375327100957e-05, "loss": 0.6484, "num_input_tokens_seen": 23011056, "step": 39650 }, { "epoch": 5.906315162347334, "grad_norm": 2.8882689476013184, "learning_rate": 4.4411705791267904e-05, "loss": 0.845, "num_input_tokens_seen": 23013904, "step": 39655 }, { "epoch": 5.907059874888293, "grad_norm": 2.0203168392181396, "learning_rate": 4.44096579835852e-05, "loss": 0.6519, "num_input_tokens_seen": 23016944, "step": 39660 }, { "epoch": 5.907804587429252, "grad_norm": 0.9657630920410156, "learning_rate": 4.440760984799605e-05, "loss": 0.6159, "num_input_tokens_seen": 23019824, "step": 39665 }, { "epoch": 5.908549299970211, "grad_norm": 1.0648224353790283, "learning_rate": 4.440556138453505e-05, "loss": 0.6046, "num_input_tokens_seen": 23022704, "step": 39670 }, { "epoch": 5.909294012511171, "grad_norm": 1.0909148454666138, "learning_rate": 4.440351259323682e-05, "loss": 0.5037, "num_input_tokens_seen": 23025424, "step": 39675 }, { "epoch": 5.91003872505213, "grad_norm": 2.718986988067627, "learning_rate": 4.440146347413596e-05, "loss": 0.6106, "num_input_tokens_seen": 23028176, "step": 39680 }, { "epoch": 5.910783437593089, "grad_norm": 1.2689968347549438, "learning_rate": 4.4399414027267094e-05, "loss": 0.6215, "num_input_tokens_seen": 23030992, "step": 39685 }, { "epoch": 5.911528150134048, "grad_norm": 1.0763522386550903, "learning_rate": 4.439736425266485e-05, "loss": 0.6951, "num_input_tokens_seen": 23033552, "step": 39690 }, { "epoch": 5.912272862675008, "grad_norm": 1.2281674146652222, "learning_rate": 4.4395314150363856e-05, "loss": 0.7804, "num_input_tokens_seen": 23036336, "step": 39695 }, { "epoch": 5.913017575215966, "grad_norm": 0.8506574034690857, "learning_rate": 4.439326372039872e-05, "loss": 0.71, "num_input_tokens_seen": 23039152, "step": 39700 }, { "epoch": 5.913762287756926, "grad_norm": 2.0868802070617676, "learning_rate": 4.439121296280413e-05, "loss": 0.6672, "num_input_tokens_seen": 23042128, "step": 39705 }, { "epoch": 5.914507000297885, "grad_norm": 1.8123745918273926, "learning_rate": 4.438916187761469e-05, "loss": 0.6153, "num_input_tokens_seen": 23044976, "step": 39710 }, { "epoch": 5.915251712838844, "grad_norm": 1.5096229314804077, "learning_rate": 4.4387110464865066e-05, "loss": 0.7089, "num_input_tokens_seen": 23047888, "step": 39715 }, { "epoch": 5.915996425379803, "grad_norm": 1.170259714126587, "learning_rate": 4.4385058724589925e-05, "loss": 0.5607, "num_input_tokens_seen": 23050672, "step": 39720 }, { "epoch": 5.916741137920763, "grad_norm": 1.5087578296661377, "learning_rate": 4.438300665682391e-05, "loss": 0.6622, "num_input_tokens_seen": 23053328, "step": 39725 }, { "epoch": 5.917485850461722, "grad_norm": 1.0704070329666138, "learning_rate": 4.43809542616017e-05, "loss": 0.6226, "num_input_tokens_seen": 23056656, "step": 39730 }, { "epoch": 5.918230563002681, "grad_norm": 1.3730156421661377, "learning_rate": 4.437890153895797e-05, "loss": 0.6346, "num_input_tokens_seen": 23059440, "step": 39735 }, { "epoch": 5.91897527554364, "grad_norm": 1.1991972923278809, "learning_rate": 4.437684848892739e-05, "loss": 0.6125, "num_input_tokens_seen": 23062416, "step": 39740 }, { "epoch": 5.9197199880846, "grad_norm": 0.9389368891716003, "learning_rate": 4.437479511154465e-05, "loss": 0.6102, "num_input_tokens_seen": 23065040, "step": 39745 }, { "epoch": 5.920464700625558, "grad_norm": 1.6980119943618774, "learning_rate": 4.4372741406844434e-05, "loss": 0.7299, "num_input_tokens_seen": 23068112, "step": 39750 }, { "epoch": 5.921209413166518, "grad_norm": 1.1468820571899414, "learning_rate": 4.437068737486145e-05, "loss": 0.5033, "num_input_tokens_seen": 23071056, "step": 39755 }, { "epoch": 5.921954125707477, "grad_norm": 0.9798274040222168, "learning_rate": 4.4368633015630385e-05, "loss": 0.5527, "num_input_tokens_seen": 23073904, "step": 39760 }, { "epoch": 5.9226988382484365, "grad_norm": 1.0883121490478516, "learning_rate": 4.436657832918595e-05, "loss": 0.6829, "num_input_tokens_seen": 23076880, "step": 39765 }, { "epoch": 5.923443550789395, "grad_norm": 1.4268559217453003, "learning_rate": 4.436452331556286e-05, "loss": 0.7104, "num_input_tokens_seen": 23079600, "step": 39770 }, { "epoch": 5.924188263330355, "grad_norm": 1.1091084480285645, "learning_rate": 4.436246797479582e-05, "loss": 0.6516, "num_input_tokens_seen": 23082640, "step": 39775 }, { "epoch": 5.924932975871314, "grad_norm": 0.9834780693054199, "learning_rate": 4.436041230691957e-05, "loss": 0.79, "num_input_tokens_seen": 23085456, "step": 39780 }, { "epoch": 5.925677688412273, "grad_norm": 1.3607885837554932, "learning_rate": 4.435835631196884e-05, "loss": 0.639, "num_input_tokens_seen": 23087984, "step": 39785 }, { "epoch": 5.926422400953232, "grad_norm": 0.8676140904426575, "learning_rate": 4.435629998997835e-05, "loss": 0.7984, "num_input_tokens_seen": 23090640, "step": 39790 }, { "epoch": 5.927167113494191, "grad_norm": 1.5302979946136475, "learning_rate": 4.435424334098284e-05, "loss": 0.6729, "num_input_tokens_seen": 23093808, "step": 39795 }, { "epoch": 5.92791182603515, "grad_norm": 1.1184548139572144, "learning_rate": 4.435218636501706e-05, "loss": 0.5113, "num_input_tokens_seen": 23096688, "step": 39800 }, { "epoch": 5.92865653857611, "grad_norm": 0.8663678765296936, "learning_rate": 4.435012906211576e-05, "loss": 0.5778, "num_input_tokens_seen": 23099376, "step": 39805 }, { "epoch": 5.929401251117069, "grad_norm": 0.9964502453804016, "learning_rate": 4.43480714323137e-05, "loss": 0.6079, "num_input_tokens_seen": 23102256, "step": 39810 }, { "epoch": 5.930145963658028, "grad_norm": 1.1132310628890991, "learning_rate": 4.434601347564563e-05, "loss": 0.6094, "num_input_tokens_seen": 23105008, "step": 39815 }, { "epoch": 5.930890676198987, "grad_norm": 1.3393815755844116, "learning_rate": 4.434395519214633e-05, "loss": 0.7826, "num_input_tokens_seen": 23107760, "step": 39820 }, { "epoch": 5.931635388739946, "grad_norm": 1.3410296440124512, "learning_rate": 4.4341896581850566e-05, "loss": 0.5293, "num_input_tokens_seen": 23110544, "step": 39825 }, { "epoch": 5.932380101280906, "grad_norm": 0.7613914012908936, "learning_rate": 4.433983764479312e-05, "loss": 0.5261, "num_input_tokens_seen": 23113200, "step": 39830 }, { "epoch": 5.933124813821864, "grad_norm": 2.2139806747436523, "learning_rate": 4.433777838100876e-05, "loss": 0.7669, "num_input_tokens_seen": 23116112, "step": 39835 }, { "epoch": 5.933869526362824, "grad_norm": 1.0445445775985718, "learning_rate": 4.433571879053231e-05, "loss": 0.582, "num_input_tokens_seen": 23119024, "step": 39840 }, { "epoch": 5.934614238903783, "grad_norm": 0.8435192704200745, "learning_rate": 4.433365887339853e-05, "loss": 0.6194, "num_input_tokens_seen": 23122320, "step": 39845 }, { "epoch": 5.9353589514447425, "grad_norm": 1.4143816232681274, "learning_rate": 4.4331598629642235e-05, "loss": 0.609, "num_input_tokens_seen": 23125136, "step": 39850 }, { "epoch": 5.936103663985701, "grad_norm": 0.9591948390007019, "learning_rate": 4.432953805929823e-05, "loss": 0.4558, "num_input_tokens_seen": 23127952, "step": 39855 }, { "epoch": 5.936848376526661, "grad_norm": 2.3575403690338135, "learning_rate": 4.432747716240133e-05, "loss": 0.702, "num_input_tokens_seen": 23130704, "step": 39860 }, { "epoch": 5.93759308906762, "grad_norm": 1.3595815896987915, "learning_rate": 4.432541593898634e-05, "loss": 0.5762, "num_input_tokens_seen": 23133296, "step": 39865 }, { "epoch": 5.938337801608579, "grad_norm": 0.7319628000259399, "learning_rate": 4.432335438908809e-05, "loss": 0.5786, "num_input_tokens_seen": 23136176, "step": 39870 }, { "epoch": 5.939082514149538, "grad_norm": 0.9915433526039124, "learning_rate": 4.432129251274141e-05, "loss": 0.7715, "num_input_tokens_seen": 23138928, "step": 39875 }, { "epoch": 5.939827226690498, "grad_norm": 1.202095627784729, "learning_rate": 4.431923030998113e-05, "loss": 0.5694, "num_input_tokens_seen": 23141904, "step": 39880 }, { "epoch": 5.940571939231456, "grad_norm": 1.0758243799209595, "learning_rate": 4.4317167780842086e-05, "loss": 0.5008, "num_input_tokens_seen": 23144688, "step": 39885 }, { "epoch": 5.941316651772416, "grad_norm": 1.0025455951690674, "learning_rate": 4.4315104925359124e-05, "loss": 0.7261, "num_input_tokens_seen": 23147792, "step": 39890 }, { "epoch": 5.942061364313375, "grad_norm": 1.8154395818710327, "learning_rate": 4.431304174356709e-05, "loss": 0.6686, "num_input_tokens_seen": 23150576, "step": 39895 }, { "epoch": 5.9428060768543345, "grad_norm": 0.7233659029006958, "learning_rate": 4.431097823550086e-05, "loss": 0.4331, "num_input_tokens_seen": 23153680, "step": 39900 }, { "epoch": 5.943550789395293, "grad_norm": 1.0112468004226685, "learning_rate": 4.4308914401195275e-05, "loss": 0.5852, "num_input_tokens_seen": 23156976, "step": 39905 }, { "epoch": 5.944295501936253, "grad_norm": 0.9212741255760193, "learning_rate": 4.43068502406852e-05, "loss": 0.7092, "num_input_tokens_seen": 23160304, "step": 39910 }, { "epoch": 5.945040214477212, "grad_norm": 0.7904177904129028, "learning_rate": 4.4304785754005516e-05, "loss": 0.649, "num_input_tokens_seen": 23163568, "step": 39915 }, { "epoch": 5.945784927018171, "grad_norm": 0.9505919218063354, "learning_rate": 4.43027209411911e-05, "loss": 0.7304, "num_input_tokens_seen": 23166288, "step": 39920 }, { "epoch": 5.94652963955913, "grad_norm": 1.9166266918182373, "learning_rate": 4.430065580227683e-05, "loss": 0.6336, "num_input_tokens_seen": 23169072, "step": 39925 }, { "epoch": 5.94727435210009, "grad_norm": 1.7497570514678955, "learning_rate": 4.4298590337297595e-05, "loss": 0.5816, "num_input_tokens_seen": 23171920, "step": 39930 }, { "epoch": 5.9480190646410485, "grad_norm": 0.8112602233886719, "learning_rate": 4.4296524546288286e-05, "loss": 0.5247, "num_input_tokens_seen": 23174672, "step": 39935 }, { "epoch": 5.948763777182008, "grad_norm": 1.1096136569976807, "learning_rate": 4.429445842928382e-05, "loss": 0.6471, "num_input_tokens_seen": 23177456, "step": 39940 }, { "epoch": 5.949508489722967, "grad_norm": 1.4523935317993164, "learning_rate": 4.4292391986319084e-05, "loss": 0.5901, "num_input_tokens_seen": 23180400, "step": 39945 }, { "epoch": 5.9502532022639265, "grad_norm": 1.47616446018219, "learning_rate": 4.429032521742899e-05, "loss": 0.5703, "num_input_tokens_seen": 23183056, "step": 39950 }, { "epoch": 5.950997914804885, "grad_norm": 0.8859185576438904, "learning_rate": 4.428825812264845e-05, "loss": 0.8108, "num_input_tokens_seen": 23185840, "step": 39955 }, { "epoch": 5.951742627345844, "grad_norm": 1.525935411453247, "learning_rate": 4.4286190702012405e-05, "loss": 0.7929, "num_input_tokens_seen": 23188816, "step": 39960 }, { "epoch": 5.952487339886804, "grad_norm": 1.625233769416809, "learning_rate": 4.428412295555576e-05, "loss": 0.5823, "num_input_tokens_seen": 23191376, "step": 39965 }, { "epoch": 5.953232052427763, "grad_norm": 0.9499470591545105, "learning_rate": 4.4282054883313464e-05, "loss": 0.8435, "num_input_tokens_seen": 23194224, "step": 39970 }, { "epoch": 5.953976764968722, "grad_norm": 1.1520161628723145, "learning_rate": 4.427998648532045e-05, "loss": 0.6765, "num_input_tokens_seen": 23197008, "step": 39975 }, { "epoch": 5.954721477509681, "grad_norm": 2.5261662006378174, "learning_rate": 4.427791776161165e-05, "loss": 0.6078, "num_input_tokens_seen": 23200080, "step": 39980 }, { "epoch": 5.9554661900506405, "grad_norm": 1.131966233253479, "learning_rate": 4.4275848712222035e-05, "loss": 0.7852, "num_input_tokens_seen": 23203024, "step": 39985 }, { "epoch": 5.9562109025916, "grad_norm": 1.9230746030807495, "learning_rate": 4.4273779337186536e-05, "loss": 0.8313, "num_input_tokens_seen": 23206288, "step": 39990 }, { "epoch": 5.956955615132559, "grad_norm": 1.536698579788208, "learning_rate": 4.427170963654013e-05, "loss": 0.5741, "num_input_tokens_seen": 23209136, "step": 39995 }, { "epoch": 5.957700327673518, "grad_norm": 1.169646978378296, "learning_rate": 4.426963961031777e-05, "loss": 0.6474, "num_input_tokens_seen": 23212272, "step": 40000 }, { "epoch": 5.958445040214477, "grad_norm": 0.7708083391189575, "learning_rate": 4.426756925855444e-05, "loss": 0.741, "num_input_tokens_seen": 23215088, "step": 40005 }, { "epoch": 5.959189752755436, "grad_norm": 1.6378763914108276, "learning_rate": 4.4265498581285114e-05, "loss": 0.6283, "num_input_tokens_seen": 23218000, "step": 40010 }, { "epoch": 5.959934465296396, "grad_norm": 0.8829421997070312, "learning_rate": 4.426342757854476e-05, "loss": 0.6327, "num_input_tokens_seen": 23220880, "step": 40015 }, { "epoch": 5.9606791778373545, "grad_norm": 0.8509023785591125, "learning_rate": 4.4261356250368386e-05, "loss": 0.4415, "num_input_tokens_seen": 23223728, "step": 40020 }, { "epoch": 5.961423890378314, "grad_norm": 1.618131160736084, "learning_rate": 4.4259284596790976e-05, "loss": 0.5917, "num_input_tokens_seen": 23226608, "step": 40025 }, { "epoch": 5.962168602919273, "grad_norm": 0.783739447593689, "learning_rate": 4.425721261784751e-05, "loss": 0.6338, "num_input_tokens_seen": 23229264, "step": 40030 }, { "epoch": 5.9629133154602325, "grad_norm": 1.388184666633606, "learning_rate": 4.425514031357302e-05, "loss": 0.6597, "num_input_tokens_seen": 23232368, "step": 40035 }, { "epoch": 5.963658028001191, "grad_norm": 1.0383968353271484, "learning_rate": 4.4253067684002505e-05, "loss": 0.7266, "num_input_tokens_seen": 23235120, "step": 40040 }, { "epoch": 5.964402740542151, "grad_norm": 1.175121545791626, "learning_rate": 4.425099472917098e-05, "loss": 0.6895, "num_input_tokens_seen": 23238192, "step": 40045 }, { "epoch": 5.96514745308311, "grad_norm": 1.7493650913238525, "learning_rate": 4.4248921449113464e-05, "loss": 0.8772, "num_input_tokens_seen": 23241328, "step": 40050 }, { "epoch": 5.965892165624069, "grad_norm": 1.156124472618103, "learning_rate": 4.424684784386498e-05, "loss": 0.6293, "num_input_tokens_seen": 23244304, "step": 40055 }, { "epoch": 5.966636878165028, "grad_norm": 0.8630332946777344, "learning_rate": 4.424477391346057e-05, "loss": 0.6621, "num_input_tokens_seen": 23247056, "step": 40060 }, { "epoch": 5.967381590705988, "grad_norm": 1.1531802415847778, "learning_rate": 4.424269965793526e-05, "loss": 0.7376, "num_input_tokens_seen": 23250128, "step": 40065 }, { "epoch": 5.9681263032469465, "grad_norm": 1.377076268196106, "learning_rate": 4.424062507732409e-05, "loss": 0.6147, "num_input_tokens_seen": 23253008, "step": 40070 }, { "epoch": 5.968871015787906, "grad_norm": 1.1386127471923828, "learning_rate": 4.4238550171662127e-05, "loss": 0.6524, "num_input_tokens_seen": 23255504, "step": 40075 }, { "epoch": 5.969615728328865, "grad_norm": 1.4068193435668945, "learning_rate": 4.423647494098441e-05, "loss": 0.7397, "num_input_tokens_seen": 23258288, "step": 40080 }, { "epoch": 5.9703604408698245, "grad_norm": 0.9440998435020447, "learning_rate": 4.423439938532599e-05, "loss": 0.63, "num_input_tokens_seen": 23261168, "step": 40085 }, { "epoch": 5.971105153410783, "grad_norm": 2.1320574283599854, "learning_rate": 4.423232350472195e-05, "loss": 0.5937, "num_input_tokens_seen": 23263888, "step": 40090 }, { "epoch": 5.971849865951743, "grad_norm": 0.8830459713935852, "learning_rate": 4.423024729920735e-05, "loss": 0.6676, "num_input_tokens_seen": 23266992, "step": 40095 }, { "epoch": 5.972594578492702, "grad_norm": 1.5403975248336792, "learning_rate": 4.422817076881726e-05, "loss": 0.7606, "num_input_tokens_seen": 23269904, "step": 40100 }, { "epoch": 5.973339291033661, "grad_norm": 0.906288743019104, "learning_rate": 4.422609391358677e-05, "loss": 0.7916, "num_input_tokens_seen": 23272752, "step": 40105 }, { "epoch": 5.97408400357462, "grad_norm": 0.8982897996902466, "learning_rate": 4.4224016733550975e-05, "loss": 0.6421, "num_input_tokens_seen": 23275600, "step": 40110 }, { "epoch": 5.97482871611558, "grad_norm": 1.5471196174621582, "learning_rate": 4.4221939228744945e-05, "loss": 0.5422, "num_input_tokens_seen": 23278416, "step": 40115 }, { "epoch": 5.9755734286565385, "grad_norm": 0.8734667897224426, "learning_rate": 4.421986139920379e-05, "loss": 0.7144, "num_input_tokens_seen": 23281424, "step": 40120 }, { "epoch": 5.976318141197497, "grad_norm": 1.2179111242294312, "learning_rate": 4.4217783244962615e-05, "loss": 0.5173, "num_input_tokens_seen": 23284240, "step": 40125 }, { "epoch": 5.977062853738457, "grad_norm": 1.0345408916473389, "learning_rate": 4.421570476605652e-05, "loss": 0.7026, "num_input_tokens_seen": 23287056, "step": 40130 }, { "epoch": 5.977807566279417, "grad_norm": 0.8461352586746216, "learning_rate": 4.421362596252062e-05, "loss": 0.7077, "num_input_tokens_seen": 23289904, "step": 40135 }, { "epoch": 5.978552278820375, "grad_norm": 1.234895944595337, "learning_rate": 4.4211546834390046e-05, "loss": 0.6792, "num_input_tokens_seen": 23292912, "step": 40140 }, { "epoch": 5.979296991361334, "grad_norm": 0.9886916875839233, "learning_rate": 4.420946738169991e-05, "loss": 0.6302, "num_input_tokens_seen": 23295952, "step": 40145 }, { "epoch": 5.980041703902294, "grad_norm": 1.6644848585128784, "learning_rate": 4.4207387604485345e-05, "loss": 0.8304, "num_input_tokens_seen": 23298896, "step": 40150 }, { "epoch": 5.980786416443253, "grad_norm": 0.6563839912414551, "learning_rate": 4.420530750278149e-05, "loss": 0.5986, "num_input_tokens_seen": 23301840, "step": 40155 }, { "epoch": 5.981531128984212, "grad_norm": 0.7242083549499512, "learning_rate": 4.420322707662348e-05, "loss": 0.7474, "num_input_tokens_seen": 23304752, "step": 40160 }, { "epoch": 5.982275841525171, "grad_norm": 1.0913236141204834, "learning_rate": 4.420114632604647e-05, "loss": 0.6024, "num_input_tokens_seen": 23307760, "step": 40165 }, { "epoch": 5.9830205540661305, "grad_norm": 1.3871954679489136, "learning_rate": 4.41990652510856e-05, "loss": 0.69, "num_input_tokens_seen": 23310608, "step": 40170 }, { "epoch": 5.983765266607089, "grad_norm": 1.343623399734497, "learning_rate": 4.4196983851776044e-05, "loss": 0.6536, "num_input_tokens_seen": 23313456, "step": 40175 }, { "epoch": 5.984509979148049, "grad_norm": 0.8581292033195496, "learning_rate": 4.419490212815296e-05, "loss": 0.5136, "num_input_tokens_seen": 23316752, "step": 40180 }, { "epoch": 5.985254691689008, "grad_norm": 1.6869938373565674, "learning_rate": 4.419282008025151e-05, "loss": 0.6745, "num_input_tokens_seen": 23319568, "step": 40185 }, { "epoch": 5.985999404229967, "grad_norm": 1.3730031251907349, "learning_rate": 4.4190737708106864e-05, "loss": 0.6708, "num_input_tokens_seen": 23322416, "step": 40190 }, { "epoch": 5.986744116770926, "grad_norm": 0.7431386709213257, "learning_rate": 4.418865501175422e-05, "loss": 0.5918, "num_input_tokens_seen": 23325232, "step": 40195 }, { "epoch": 5.987488829311886, "grad_norm": 1.443053126335144, "learning_rate": 4.418657199122874e-05, "loss": 0.8229, "num_input_tokens_seen": 23328272, "step": 40200 }, { "epoch": 5.9882335418528445, "grad_norm": 1.1600351333618164, "learning_rate": 4.418448864656564e-05, "loss": 0.6409, "num_input_tokens_seen": 23331376, "step": 40205 }, { "epoch": 5.988978254393804, "grad_norm": 1.0180206298828125, "learning_rate": 4.418240497780009e-05, "loss": 0.6587, "num_input_tokens_seen": 23334352, "step": 40210 }, { "epoch": 5.989722966934763, "grad_norm": 0.9085702300071716, "learning_rate": 4.4180320984967305e-05, "loss": 0.6842, "num_input_tokens_seen": 23337296, "step": 40215 }, { "epoch": 5.990467679475723, "grad_norm": 1.2754440307617188, "learning_rate": 4.4178236668102504e-05, "loss": 0.8081, "num_input_tokens_seen": 23340432, "step": 40220 }, { "epoch": 5.991212392016681, "grad_norm": 0.9530972838401794, "learning_rate": 4.417615202724087e-05, "loss": 0.6563, "num_input_tokens_seen": 23343120, "step": 40225 }, { "epoch": 5.991957104557641, "grad_norm": 1.1376094818115234, "learning_rate": 4.4174067062417645e-05, "loss": 0.5496, "num_input_tokens_seen": 23345936, "step": 40230 }, { "epoch": 5.9927018170986, "grad_norm": 1.9099079370498657, "learning_rate": 4.417198177366805e-05, "loss": 0.7309, "num_input_tokens_seen": 23348912, "step": 40235 }, { "epoch": 5.993446529639559, "grad_norm": 1.0537989139556885, "learning_rate": 4.41698961610273e-05, "loss": 0.7368, "num_input_tokens_seen": 23352208, "step": 40240 }, { "epoch": 5.994191242180518, "grad_norm": 1.123478651046753, "learning_rate": 4.416781022453064e-05, "loss": 0.5726, "num_input_tokens_seen": 23355376, "step": 40245 }, { "epoch": 5.994935954721478, "grad_norm": 3.281773805618286, "learning_rate": 4.4165723964213314e-05, "loss": 0.6863, "num_input_tokens_seen": 23358160, "step": 40250 }, { "epoch": 5.9956806672624365, "grad_norm": 1.7940123081207275, "learning_rate": 4.4163637380110555e-05, "loss": 0.808, "num_input_tokens_seen": 23361040, "step": 40255 }, { "epoch": 5.996425379803396, "grad_norm": 1.4242289066314697, "learning_rate": 4.416155047225762e-05, "loss": 0.7762, "num_input_tokens_seen": 23364016, "step": 40260 }, { "epoch": 5.997170092344355, "grad_norm": 0.8185709118843079, "learning_rate": 4.415946324068976e-05, "loss": 0.3969, "num_input_tokens_seen": 23366800, "step": 40265 }, { "epoch": 5.997914804885315, "grad_norm": 1.0195962190628052, "learning_rate": 4.4157375685442246e-05, "loss": 0.5582, "num_input_tokens_seen": 23369296, "step": 40270 }, { "epoch": 5.998659517426273, "grad_norm": 1.1002130508422852, "learning_rate": 4.415528780655034e-05, "loss": 0.6106, "num_input_tokens_seen": 23372112, "step": 40275 }, { "epoch": 5.999404229967233, "grad_norm": 1.0430338382720947, "learning_rate": 4.4153199604049315e-05, "loss": 0.6932, "num_input_tokens_seen": 23374992, "step": 40280 }, { "epoch": 6.0, "eval_loss": 0.6587122082710266, "eval_runtime": 47.0687, "eval_samples_per_second": 63.397, "eval_steps_per_second": 15.849, "num_input_tokens_seen": 23376776, "step": 40284 }, { "epoch": 6.000148942508192, "grad_norm": 1.0201009511947632, "learning_rate": 4.415111107797445e-05, "loss": 0.559, "num_input_tokens_seen": 23377352, "step": 40285 }, { "epoch": 6.000893655049151, "grad_norm": 1.3899444341659546, "learning_rate": 4.414902222836103e-05, "loss": 0.6681, "num_input_tokens_seen": 23379944, "step": 40290 }, { "epoch": 6.00163836759011, "grad_norm": 1.3989691734313965, "learning_rate": 4.414693305524434e-05, "loss": 0.6038, "num_input_tokens_seen": 23382792, "step": 40295 }, { "epoch": 6.00238308013107, "grad_norm": 1.2563955783843994, "learning_rate": 4.4144843558659675e-05, "loss": 0.7678, "num_input_tokens_seen": 23385960, "step": 40300 }, { "epoch": 6.003127792672029, "grad_norm": 0.9140015244483948, "learning_rate": 4.414275373864234e-05, "loss": 0.6726, "num_input_tokens_seen": 23389160, "step": 40305 }, { "epoch": 6.003872505212988, "grad_norm": 1.0961804389953613, "learning_rate": 4.4140663595227624e-05, "loss": 0.6056, "num_input_tokens_seen": 23392040, "step": 40310 }, { "epoch": 6.004617217753947, "grad_norm": 1.566105842590332, "learning_rate": 4.413857312845086e-05, "loss": 0.5947, "num_input_tokens_seen": 23394984, "step": 40315 }, { "epoch": 6.005361930294906, "grad_norm": 1.1729676723480225, "learning_rate": 4.4136482338347356e-05, "loss": 0.6512, "num_input_tokens_seen": 23397768, "step": 40320 }, { "epoch": 6.006106642835865, "grad_norm": 1.110213041305542, "learning_rate": 4.413439122495243e-05, "loss": 0.6814, "num_input_tokens_seen": 23400648, "step": 40325 }, { "epoch": 6.006851355376824, "grad_norm": 1.100525140762329, "learning_rate": 4.413229978830141e-05, "loss": 0.623, "num_input_tokens_seen": 23403720, "step": 40330 }, { "epoch": 6.007596067917784, "grad_norm": 1.3049224615097046, "learning_rate": 4.413020802842963e-05, "loss": 0.7185, "num_input_tokens_seen": 23406664, "step": 40335 }, { "epoch": 6.0083407804587425, "grad_norm": 0.8515501618385315, "learning_rate": 4.412811594537243e-05, "loss": 0.699, "num_input_tokens_seen": 23409352, "step": 40340 }, { "epoch": 6.009085492999702, "grad_norm": 0.9300693869590759, "learning_rate": 4.4126023539165155e-05, "loss": 0.6795, "num_input_tokens_seen": 23412104, "step": 40345 }, { "epoch": 6.009830205540661, "grad_norm": 1.4600698947906494, "learning_rate": 4.412393080984315e-05, "loss": 0.66, "num_input_tokens_seen": 23414888, "step": 40350 }, { "epoch": 6.010574918081621, "grad_norm": 1.0314408540725708, "learning_rate": 4.412183775744177e-05, "loss": 0.6204, "num_input_tokens_seen": 23417608, "step": 40355 }, { "epoch": 6.011319630622579, "grad_norm": 1.2272759675979614, "learning_rate": 4.411974438199637e-05, "loss": 0.6935, "num_input_tokens_seen": 23420584, "step": 40360 }, { "epoch": 6.012064343163539, "grad_norm": 0.6881519556045532, "learning_rate": 4.411765068354233e-05, "loss": 0.5906, "num_input_tokens_seen": 23423528, "step": 40365 }, { "epoch": 6.012809055704498, "grad_norm": 0.9864732027053833, "learning_rate": 4.4115556662115004e-05, "loss": 0.7691, "num_input_tokens_seen": 23426280, "step": 40370 }, { "epoch": 6.013553768245457, "grad_norm": 1.3626861572265625, "learning_rate": 4.411346231774978e-05, "loss": 0.6148, "num_input_tokens_seen": 23429416, "step": 40375 }, { "epoch": 6.014298480786416, "grad_norm": 0.9853082895278931, "learning_rate": 4.411136765048204e-05, "loss": 0.6232, "num_input_tokens_seen": 23432360, "step": 40380 }, { "epoch": 6.015043193327376, "grad_norm": 1.1570285558700562, "learning_rate": 4.410927266034716e-05, "loss": 0.7746, "num_input_tokens_seen": 23435336, "step": 40385 }, { "epoch": 6.015787905868335, "grad_norm": 1.2681862115859985, "learning_rate": 4.4107177347380545e-05, "loss": 0.5521, "num_input_tokens_seen": 23438408, "step": 40390 }, { "epoch": 6.016532618409294, "grad_norm": 0.7855640053749084, "learning_rate": 4.4105081711617594e-05, "loss": 0.6202, "num_input_tokens_seen": 23441416, "step": 40395 }, { "epoch": 6.017277330950253, "grad_norm": 0.8179792761802673, "learning_rate": 4.410298575309369e-05, "loss": 0.4576, "num_input_tokens_seen": 23444392, "step": 40400 }, { "epoch": 6.018022043491213, "grad_norm": 1.1838175058364868, "learning_rate": 4.4100889471844263e-05, "loss": 0.689, "num_input_tokens_seen": 23447272, "step": 40405 }, { "epoch": 6.018766756032171, "grad_norm": 0.8087660670280457, "learning_rate": 4.4098792867904724e-05, "loss": 0.5741, "num_input_tokens_seen": 23450344, "step": 40410 }, { "epoch": 6.019511468573131, "grad_norm": 1.5551105737686157, "learning_rate": 4.409669594131049e-05, "loss": 0.6738, "num_input_tokens_seen": 23453192, "step": 40415 }, { "epoch": 6.02025618111409, "grad_norm": 1.354210615158081, "learning_rate": 4.409459869209699e-05, "loss": 0.6893, "num_input_tokens_seen": 23456040, "step": 40420 }, { "epoch": 6.021000893655049, "grad_norm": 0.7641433477401733, "learning_rate": 4.409250112029965e-05, "loss": 0.769, "num_input_tokens_seen": 23458664, "step": 40425 }, { "epoch": 6.021745606196008, "grad_norm": 1.8807966709136963, "learning_rate": 4.4090403225953915e-05, "loss": 0.7785, "num_input_tokens_seen": 23461512, "step": 40430 }, { "epoch": 6.022490318736968, "grad_norm": 1.2011038064956665, "learning_rate": 4.408830500909521e-05, "loss": 0.7897, "num_input_tokens_seen": 23464136, "step": 40435 }, { "epoch": 6.023235031277927, "grad_norm": 1.2476987838745117, "learning_rate": 4.408620646975899e-05, "loss": 0.6167, "num_input_tokens_seen": 23466920, "step": 40440 }, { "epoch": 6.023979743818886, "grad_norm": 1.5770351886749268, "learning_rate": 4.408410760798072e-05, "loss": 0.6029, "num_input_tokens_seen": 23469640, "step": 40445 }, { "epoch": 6.024724456359845, "grad_norm": 0.913801372051239, "learning_rate": 4.408200842379584e-05, "loss": 0.7475, "num_input_tokens_seen": 23472552, "step": 40450 }, { "epoch": 6.025469168900805, "grad_norm": 1.5066297054290771, "learning_rate": 4.407990891723983e-05, "loss": 0.5859, "num_input_tokens_seen": 23475688, "step": 40455 }, { "epoch": 6.026213881441763, "grad_norm": 1.657523274421692, "learning_rate": 4.407780908834814e-05, "loss": 0.6347, "num_input_tokens_seen": 23478664, "step": 40460 }, { "epoch": 6.026958593982723, "grad_norm": 1.0788110494613647, "learning_rate": 4.407570893715627e-05, "loss": 0.5119, "num_input_tokens_seen": 23481352, "step": 40465 }, { "epoch": 6.027703306523682, "grad_norm": 1.0582538843154907, "learning_rate": 4.4073608463699676e-05, "loss": 0.6316, "num_input_tokens_seen": 23484040, "step": 40470 }, { "epoch": 6.0284480190646414, "grad_norm": 0.9165743589401245, "learning_rate": 4.4071507668013854e-05, "loss": 0.7494, "num_input_tokens_seen": 23486952, "step": 40475 }, { "epoch": 6.0291927316056, "grad_norm": 0.984519362449646, "learning_rate": 4.406940655013429e-05, "loss": 0.6102, "num_input_tokens_seen": 23489768, "step": 40480 }, { "epoch": 6.02993744414656, "grad_norm": 0.9223398566246033, "learning_rate": 4.406730511009649e-05, "loss": 0.6458, "num_input_tokens_seen": 23492904, "step": 40485 }, { "epoch": 6.030682156687519, "grad_norm": 1.1288059949874878, "learning_rate": 4.406520334793595e-05, "loss": 0.5451, "num_input_tokens_seen": 23495624, "step": 40490 }, { "epoch": 6.031426869228477, "grad_norm": 1.6703354120254517, "learning_rate": 4.4063101263688164e-05, "loss": 0.7797, "num_input_tokens_seen": 23498664, "step": 40495 }, { "epoch": 6.032171581769437, "grad_norm": 0.8039628863334656, "learning_rate": 4.406099885738866e-05, "loss": 0.7317, "num_input_tokens_seen": 23501704, "step": 40500 }, { "epoch": 6.032916294310396, "grad_norm": 1.1251248121261597, "learning_rate": 4.405889612907296e-05, "loss": 0.6132, "num_input_tokens_seen": 23504936, "step": 40505 }, { "epoch": 6.033661006851355, "grad_norm": 1.0713821649551392, "learning_rate": 4.405679307877658e-05, "loss": 0.5894, "num_input_tokens_seen": 23507528, "step": 40510 }, { "epoch": 6.034405719392314, "grad_norm": 1.461859107017517, "learning_rate": 4.4054689706535044e-05, "loss": 0.6784, "num_input_tokens_seen": 23510312, "step": 40515 }, { "epoch": 6.035150431933274, "grad_norm": 1.0668519735336304, "learning_rate": 4.40525860123839e-05, "loss": 0.666, "num_input_tokens_seen": 23513416, "step": 40520 }, { "epoch": 6.035895144474233, "grad_norm": 1.5902658700942993, "learning_rate": 4.405048199635868e-05, "loss": 0.6935, "num_input_tokens_seen": 23516552, "step": 40525 }, { "epoch": 6.036639857015192, "grad_norm": 1.4493541717529297, "learning_rate": 4.404837765849492e-05, "loss": 0.548, "num_input_tokens_seen": 23519336, "step": 40530 }, { "epoch": 6.037384569556151, "grad_norm": 0.9986886382102966, "learning_rate": 4.4046272998828186e-05, "loss": 0.6151, "num_input_tokens_seen": 23522504, "step": 40535 }, { "epoch": 6.038129282097111, "grad_norm": 2.81318736076355, "learning_rate": 4.4044168017394025e-05, "loss": 0.5114, "num_input_tokens_seen": 23525032, "step": 40540 }, { "epoch": 6.038873994638069, "grad_norm": 1.0082310438156128, "learning_rate": 4.4042062714228e-05, "loss": 0.6973, "num_input_tokens_seen": 23528072, "step": 40545 }, { "epoch": 6.039618707179029, "grad_norm": 1.9980930089950562, "learning_rate": 4.403995708936568e-05, "loss": 0.7976, "num_input_tokens_seen": 23530888, "step": 40550 }, { "epoch": 6.040363419719988, "grad_norm": 1.9650886058807373, "learning_rate": 4.403785114284263e-05, "loss": 0.7178, "num_input_tokens_seen": 23534024, "step": 40555 }, { "epoch": 6.0411081322609474, "grad_norm": 1.1295123100280762, "learning_rate": 4.4035744874694444e-05, "loss": 0.6023, "num_input_tokens_seen": 23536744, "step": 40560 }, { "epoch": 6.041852844801906, "grad_norm": 0.6684537529945374, "learning_rate": 4.403363828495669e-05, "loss": 0.5011, "num_input_tokens_seen": 23539720, "step": 40565 }, { "epoch": 6.042597557342866, "grad_norm": 1.1170016527175903, "learning_rate": 4.403153137366497e-05, "loss": 0.6476, "num_input_tokens_seen": 23542664, "step": 40570 }, { "epoch": 6.043342269883825, "grad_norm": 1.049407958984375, "learning_rate": 4.402942414085486e-05, "loss": 0.5975, "num_input_tokens_seen": 23545512, "step": 40575 }, { "epoch": 6.044086982424784, "grad_norm": 0.8835940957069397, "learning_rate": 4.4027316586561976e-05, "loss": 0.4956, "num_input_tokens_seen": 23548424, "step": 40580 }, { "epoch": 6.044831694965743, "grad_norm": 0.8724485635757446, "learning_rate": 4.402520871082191e-05, "loss": 0.493, "num_input_tokens_seen": 23551400, "step": 40585 }, { "epoch": 6.045576407506703, "grad_norm": 1.3656858205795288, "learning_rate": 4.402310051367029e-05, "loss": 0.6305, "num_input_tokens_seen": 23554568, "step": 40590 }, { "epoch": 6.046321120047661, "grad_norm": 1.0951318740844727, "learning_rate": 4.4020991995142716e-05, "loss": 0.7071, "num_input_tokens_seen": 23557256, "step": 40595 }, { "epoch": 6.047065832588621, "grad_norm": 1.185571551322937, "learning_rate": 4.401888315527481e-05, "loss": 0.493, "num_input_tokens_seen": 23560040, "step": 40600 }, { "epoch": 6.04781054512958, "grad_norm": 1.0000284910202026, "learning_rate": 4.40167739941022e-05, "loss": 0.4885, "num_input_tokens_seen": 23562760, "step": 40605 }, { "epoch": 6.0485552576705395, "grad_norm": 1.1524932384490967, "learning_rate": 4.401466451166053e-05, "loss": 0.694, "num_input_tokens_seen": 23565768, "step": 40610 }, { "epoch": 6.049299970211498, "grad_norm": 2.2077722549438477, "learning_rate": 4.401255470798543e-05, "loss": 0.8519, "num_input_tokens_seen": 23568712, "step": 40615 }, { "epoch": 6.050044682752458, "grad_norm": 1.3046373128890991, "learning_rate": 4.401044458311254e-05, "loss": 0.6435, "num_input_tokens_seen": 23571592, "step": 40620 }, { "epoch": 6.050789395293417, "grad_norm": 1.0951234102249146, "learning_rate": 4.40083341370775e-05, "loss": 0.7942, "num_input_tokens_seen": 23574600, "step": 40625 }, { "epoch": 6.051534107834376, "grad_norm": 2.7929630279541016, "learning_rate": 4.400622336991599e-05, "loss": 0.6888, "num_input_tokens_seen": 23577448, "step": 40630 }, { "epoch": 6.052278820375335, "grad_norm": 1.3240283727645874, "learning_rate": 4.400411228166364e-05, "loss": 0.5204, "num_input_tokens_seen": 23580040, "step": 40635 }, { "epoch": 6.053023532916295, "grad_norm": 1.0535178184509277, "learning_rate": 4.400200087235613e-05, "loss": 0.5933, "num_input_tokens_seen": 23583048, "step": 40640 }, { "epoch": 6.0537682454572534, "grad_norm": 1.1170293092727661, "learning_rate": 4.399988914202913e-05, "loss": 0.5851, "num_input_tokens_seen": 23585832, "step": 40645 }, { "epoch": 6.054512957998213, "grad_norm": 0.586089015007019, "learning_rate": 4.399777709071832e-05, "loss": 0.5191, "num_input_tokens_seen": 23588808, "step": 40650 }, { "epoch": 6.055257670539172, "grad_norm": 1.1227830648422241, "learning_rate": 4.399566471845937e-05, "loss": 0.6301, "num_input_tokens_seen": 23591656, "step": 40655 }, { "epoch": 6.0560023830801315, "grad_norm": 1.250432014465332, "learning_rate": 4.3993552025287966e-05, "loss": 0.5205, "num_input_tokens_seen": 23594184, "step": 40660 }, { "epoch": 6.05674709562109, "grad_norm": 2.3249335289001465, "learning_rate": 4.399143901123981e-05, "loss": 0.6848, "num_input_tokens_seen": 23597032, "step": 40665 }, { "epoch": 6.057491808162049, "grad_norm": 1.2237417697906494, "learning_rate": 4.398932567635059e-05, "loss": 0.7765, "num_input_tokens_seen": 23599880, "step": 40670 }, { "epoch": 6.058236520703009, "grad_norm": 2.0248920917510986, "learning_rate": 4.398721202065602e-05, "loss": 0.5856, "num_input_tokens_seen": 23602728, "step": 40675 }, { "epoch": 6.058981233243967, "grad_norm": 1.145066499710083, "learning_rate": 4.398509804419179e-05, "loss": 0.4845, "num_input_tokens_seen": 23605640, "step": 40680 }, { "epoch": 6.059725945784927, "grad_norm": 0.8478415608406067, "learning_rate": 4.3982983746993636e-05, "loss": 0.5978, "num_input_tokens_seen": 23608264, "step": 40685 }, { "epoch": 6.060470658325886, "grad_norm": 1.0158717632293701, "learning_rate": 4.398086912909726e-05, "loss": 0.6457, "num_input_tokens_seen": 23611176, "step": 40690 }, { "epoch": 6.0612153708668455, "grad_norm": 1.7473704814910889, "learning_rate": 4.397875419053838e-05, "loss": 0.6978, "num_input_tokens_seen": 23614344, "step": 40695 }, { "epoch": 6.061960083407804, "grad_norm": 0.598556637763977, "learning_rate": 4.397663893135275e-05, "loss": 0.3814, "num_input_tokens_seen": 23617192, "step": 40700 }, { "epoch": 6.062704795948764, "grad_norm": 0.8505914807319641, "learning_rate": 4.397452335157609e-05, "loss": 0.6161, "num_input_tokens_seen": 23620456, "step": 40705 }, { "epoch": 6.063449508489723, "grad_norm": 1.1110682487487793, "learning_rate": 4.397240745124414e-05, "loss": 0.6228, "num_input_tokens_seen": 23623464, "step": 40710 }, { "epoch": 6.064194221030682, "grad_norm": 0.7519652247428894, "learning_rate": 4.397029123039266e-05, "loss": 0.6816, "num_input_tokens_seen": 23626696, "step": 40715 }, { "epoch": 6.064938933571641, "grad_norm": 1.299349308013916, "learning_rate": 4.396817468905738e-05, "loss": 0.6514, "num_input_tokens_seen": 23629608, "step": 40720 }, { "epoch": 6.065683646112601, "grad_norm": 0.8845204710960388, "learning_rate": 4.396605782727406e-05, "loss": 0.5268, "num_input_tokens_seen": 23632520, "step": 40725 }, { "epoch": 6.0664283586535594, "grad_norm": 1.1214911937713623, "learning_rate": 4.3963940645078484e-05, "loss": 0.6944, "num_input_tokens_seen": 23635528, "step": 40730 }, { "epoch": 6.067173071194519, "grad_norm": 1.0989230871200562, "learning_rate": 4.3961823142506395e-05, "loss": 0.7709, "num_input_tokens_seen": 23638376, "step": 40735 }, { "epoch": 6.067917783735478, "grad_norm": 0.825761616230011, "learning_rate": 4.395970531959358e-05, "loss": 0.7357, "num_input_tokens_seen": 23641192, "step": 40740 }, { "epoch": 6.0686624962764375, "grad_norm": 0.7292333245277405, "learning_rate": 4.395758717637581e-05, "loss": 0.5391, "num_input_tokens_seen": 23644040, "step": 40745 }, { "epoch": 6.069407208817396, "grad_norm": 1.150227665901184, "learning_rate": 4.3955468712888884e-05, "loss": 0.6137, "num_input_tokens_seen": 23647048, "step": 40750 }, { "epoch": 6.070151921358356, "grad_norm": 1.468427062034607, "learning_rate": 4.395334992916857e-05, "loss": 0.6869, "num_input_tokens_seen": 23649768, "step": 40755 }, { "epoch": 6.070896633899315, "grad_norm": 1.075304627418518, "learning_rate": 4.395123082525067e-05, "loss": 0.7172, "num_input_tokens_seen": 23652648, "step": 40760 }, { "epoch": 6.071641346440274, "grad_norm": 0.8125884532928467, "learning_rate": 4.394911140117099e-05, "loss": 0.5949, "num_input_tokens_seen": 23655336, "step": 40765 }, { "epoch": 6.072386058981233, "grad_norm": 1.1165744066238403, "learning_rate": 4.3946991656965334e-05, "loss": 0.6024, "num_input_tokens_seen": 23658504, "step": 40770 }, { "epoch": 6.073130771522193, "grad_norm": 0.9232101440429688, "learning_rate": 4.394487159266951e-05, "loss": 0.6962, "num_input_tokens_seen": 23661288, "step": 40775 }, { "epoch": 6.0738754840631515, "grad_norm": 0.8797985315322876, "learning_rate": 4.394275120831933e-05, "loss": 0.6291, "num_input_tokens_seen": 23664328, "step": 40780 }, { "epoch": 6.074620196604111, "grad_norm": 1.0913821458816528, "learning_rate": 4.394063050395063e-05, "loss": 0.6272, "num_input_tokens_seen": 23667240, "step": 40785 }, { "epoch": 6.07536490914507, "grad_norm": 1.3592517375946045, "learning_rate": 4.393850947959922e-05, "loss": 0.5964, "num_input_tokens_seen": 23669960, "step": 40790 }, { "epoch": 6.0761096216860295, "grad_norm": 0.9624893069267273, "learning_rate": 4.3936388135300946e-05, "loss": 0.6421, "num_input_tokens_seen": 23672904, "step": 40795 }, { "epoch": 6.076854334226988, "grad_norm": 0.7706442475318909, "learning_rate": 4.3934266471091635e-05, "loss": 0.59, "num_input_tokens_seen": 23676040, "step": 40800 }, { "epoch": 6.077599046767948, "grad_norm": 1.2321968078613281, "learning_rate": 4.393214448700713e-05, "loss": 0.5856, "num_input_tokens_seen": 23678984, "step": 40805 }, { "epoch": 6.078343759308907, "grad_norm": 0.7925340533256531, "learning_rate": 4.39300221830833e-05, "loss": 0.5562, "num_input_tokens_seen": 23681800, "step": 40810 }, { "epoch": 6.079088471849866, "grad_norm": 1.6906579732894897, "learning_rate": 4.392789955935598e-05, "loss": 0.5589, "num_input_tokens_seen": 23684936, "step": 40815 }, { "epoch": 6.079833184390825, "grad_norm": 1.1407090425491333, "learning_rate": 4.3925776615861034e-05, "loss": 0.6586, "num_input_tokens_seen": 23687816, "step": 40820 }, { "epoch": 6.080577896931785, "grad_norm": 1.130338191986084, "learning_rate": 4.392365335263432e-05, "loss": 0.5537, "num_input_tokens_seen": 23690760, "step": 40825 }, { "epoch": 6.0813226094727435, "grad_norm": 1.3964636325836182, "learning_rate": 4.392152976971173e-05, "loss": 0.751, "num_input_tokens_seen": 23693608, "step": 40830 }, { "epoch": 6.082067322013703, "grad_norm": 1.3742611408233643, "learning_rate": 4.3919405867129114e-05, "loss": 0.6741, "num_input_tokens_seen": 23696360, "step": 40835 }, { "epoch": 6.082812034554662, "grad_norm": 1.2208272218704224, "learning_rate": 4.391728164492237e-05, "loss": 0.6143, "num_input_tokens_seen": 23699080, "step": 40840 }, { "epoch": 6.083556747095621, "grad_norm": 0.8562164306640625, "learning_rate": 4.391515710312738e-05, "loss": 0.7183, "num_input_tokens_seen": 23702184, "step": 40845 }, { "epoch": 6.08430145963658, "grad_norm": 1.1884939670562744, "learning_rate": 4.391303224178003e-05, "loss": 0.6661, "num_input_tokens_seen": 23704936, "step": 40850 }, { "epoch": 6.085046172177539, "grad_norm": 1.3243112564086914, "learning_rate": 4.391090706091623e-05, "loss": 0.6355, "num_input_tokens_seen": 23707496, "step": 40855 }, { "epoch": 6.085790884718499, "grad_norm": 1.5929325819015503, "learning_rate": 4.390878156057186e-05, "loss": 0.56, "num_input_tokens_seen": 23710216, "step": 40860 }, { "epoch": 6.0865355972594575, "grad_norm": 0.8461965322494507, "learning_rate": 4.390665574078286e-05, "loss": 0.5609, "num_input_tokens_seen": 23713160, "step": 40865 }, { "epoch": 6.087280309800417, "grad_norm": 1.1170789003372192, "learning_rate": 4.390452960158512e-05, "loss": 0.8033, "num_input_tokens_seen": 23716200, "step": 40870 }, { "epoch": 6.088025022341376, "grad_norm": 1.3081785440444946, "learning_rate": 4.390240314301457e-05, "loss": 0.6773, "num_input_tokens_seen": 23719464, "step": 40875 }, { "epoch": 6.0887697348823355, "grad_norm": 2.280524253845215, "learning_rate": 4.3900276365107126e-05, "loss": 0.5228, "num_input_tokens_seen": 23722312, "step": 40880 }, { "epoch": 6.089514447423294, "grad_norm": 0.998252809047699, "learning_rate": 4.3898149267898727e-05, "loss": 0.5641, "num_input_tokens_seen": 23724968, "step": 40885 }, { "epoch": 6.090259159964254, "grad_norm": 0.8008601665496826, "learning_rate": 4.3896021851425306e-05, "loss": 0.5547, "num_input_tokens_seen": 23727752, "step": 40890 }, { "epoch": 6.091003872505213, "grad_norm": 0.9193388223648071, "learning_rate": 4.389389411572279e-05, "loss": 0.5646, "num_input_tokens_seen": 23730600, "step": 40895 }, { "epoch": 6.091748585046172, "grad_norm": 0.7925918698310852, "learning_rate": 4.389176606082714e-05, "loss": 0.6483, "num_input_tokens_seen": 23733544, "step": 40900 }, { "epoch": 6.092493297587131, "grad_norm": 1.8740326166152954, "learning_rate": 4.388963768677431e-05, "loss": 0.6543, "num_input_tokens_seen": 23736584, "step": 40905 }, { "epoch": 6.093238010128091, "grad_norm": 1.965549111366272, "learning_rate": 4.388750899360025e-05, "loss": 0.7952, "num_input_tokens_seen": 23739592, "step": 40910 }, { "epoch": 6.0939827226690495, "grad_norm": 1.5654295682907104, "learning_rate": 4.3885379981340905e-05, "loss": 0.6558, "num_input_tokens_seen": 23742408, "step": 40915 }, { "epoch": 6.094727435210009, "grad_norm": 0.9820983409881592, "learning_rate": 4.388325065003228e-05, "loss": 0.5834, "num_input_tokens_seen": 23745256, "step": 40920 }, { "epoch": 6.095472147750968, "grad_norm": 2.8384478092193604, "learning_rate": 4.3881120999710315e-05, "loss": 0.7339, "num_input_tokens_seen": 23748168, "step": 40925 }, { "epoch": 6.0962168602919276, "grad_norm": 1.4890683889389038, "learning_rate": 4.3878991030411e-05, "loss": 0.6621, "num_input_tokens_seen": 23751112, "step": 40930 }, { "epoch": 6.096961572832886, "grad_norm": 1.3168747425079346, "learning_rate": 4.387686074217032e-05, "loss": 0.6371, "num_input_tokens_seen": 23753960, "step": 40935 }, { "epoch": 6.097706285373846, "grad_norm": 1.4175982475280762, "learning_rate": 4.387473013502427e-05, "loss": 0.5896, "num_input_tokens_seen": 23756808, "step": 40940 }, { "epoch": 6.098450997914805, "grad_norm": 1.3745405673980713, "learning_rate": 4.387259920900884e-05, "loss": 0.6516, "num_input_tokens_seen": 23759816, "step": 40945 }, { "epoch": 6.099195710455764, "grad_norm": 1.3663358688354492, "learning_rate": 4.3870467964160015e-05, "loss": 0.6201, "num_input_tokens_seen": 23762568, "step": 40950 }, { "epoch": 6.099940422996723, "grad_norm": 1.0534058809280396, "learning_rate": 4.3868336400513823e-05, "loss": 0.6742, "num_input_tokens_seen": 23765512, "step": 40955 }, { "epoch": 6.100685135537683, "grad_norm": 1.0099899768829346, "learning_rate": 4.386620451810626e-05, "loss": 0.6162, "num_input_tokens_seen": 23768296, "step": 40960 }, { "epoch": 6.1014298480786415, "grad_norm": 1.3571813106536865, "learning_rate": 4.3864072316973345e-05, "loss": 0.5397, "num_input_tokens_seen": 23771144, "step": 40965 }, { "epoch": 6.102174560619601, "grad_norm": 1.3667576313018799, "learning_rate": 4.386193979715111e-05, "loss": 0.6885, "num_input_tokens_seen": 23774248, "step": 40970 }, { "epoch": 6.10291927316056, "grad_norm": 1.10528564453125, "learning_rate": 4.385980695867556e-05, "loss": 0.5705, "num_input_tokens_seen": 23777384, "step": 40975 }, { "epoch": 6.10366398570152, "grad_norm": 0.8513839244842529, "learning_rate": 4.385767380158275e-05, "loss": 0.5199, "num_input_tokens_seen": 23780456, "step": 40980 }, { "epoch": 6.104408698242478, "grad_norm": 0.9304792284965515, "learning_rate": 4.38555403259087e-05, "loss": 0.6033, "num_input_tokens_seen": 23783464, "step": 40985 }, { "epoch": 6.105153410783438, "grad_norm": 1.0078845024108887, "learning_rate": 4.3853406531689465e-05, "loss": 0.5849, "num_input_tokens_seen": 23786344, "step": 40990 }, { "epoch": 6.105898123324397, "grad_norm": 1.643272042274475, "learning_rate": 4.3851272418961085e-05, "loss": 0.7474, "num_input_tokens_seen": 23789480, "step": 40995 }, { "epoch": 6.106642835865356, "grad_norm": 1.4087274074554443, "learning_rate": 4.384913798775962e-05, "loss": 0.5951, "num_input_tokens_seen": 23792840, "step": 41000 }, { "epoch": 6.107387548406315, "grad_norm": 0.957301914691925, "learning_rate": 4.384700323812112e-05, "loss": 0.6919, "num_input_tokens_seen": 23795432, "step": 41005 }, { "epoch": 6.108132260947274, "grad_norm": 0.5723753571510315, "learning_rate": 4.3844868170081665e-05, "loss": 0.5416, "num_input_tokens_seen": 23798216, "step": 41010 }, { "epoch": 6.1088769734882336, "grad_norm": 3.40548038482666, "learning_rate": 4.384273278367731e-05, "loss": 0.7126, "num_input_tokens_seen": 23801064, "step": 41015 }, { "epoch": 6.109621686029192, "grad_norm": 0.7623093724250793, "learning_rate": 4.3840597078944135e-05, "loss": 0.7322, "num_input_tokens_seen": 23803752, "step": 41020 }, { "epoch": 6.110366398570152, "grad_norm": 1.1509318351745605, "learning_rate": 4.3838461055918226e-05, "loss": 0.6665, "num_input_tokens_seen": 23806696, "step": 41025 }, { "epoch": 6.111111111111111, "grad_norm": 2.6573033332824707, "learning_rate": 4.383632471463566e-05, "loss": 0.612, "num_input_tokens_seen": 23809480, "step": 41030 }, { "epoch": 6.11185582365207, "grad_norm": 1.325111746788025, "learning_rate": 4.383418805513253e-05, "loss": 0.8256, "num_input_tokens_seen": 23811944, "step": 41035 }, { "epoch": 6.112600536193029, "grad_norm": 0.8878706693649292, "learning_rate": 4.3832051077444937e-05, "loss": 0.541, "num_input_tokens_seen": 23814856, "step": 41040 }, { "epoch": 6.113345248733989, "grad_norm": 1.9129832983016968, "learning_rate": 4.382991378160898e-05, "loss": 0.5308, "num_input_tokens_seen": 23817672, "step": 41045 }, { "epoch": 6.1140899612749475, "grad_norm": 0.7721078395843506, "learning_rate": 4.3827776167660775e-05, "loss": 0.5489, "num_input_tokens_seen": 23820584, "step": 41050 }, { "epoch": 6.114834673815907, "grad_norm": 1.944424033164978, "learning_rate": 4.382563823563642e-05, "loss": 0.6168, "num_input_tokens_seen": 23823496, "step": 41055 }, { "epoch": 6.115579386356866, "grad_norm": 0.9830671548843384, "learning_rate": 4.382349998557204e-05, "loss": 0.5558, "num_input_tokens_seen": 23826472, "step": 41060 }, { "epoch": 6.116324098897826, "grad_norm": 1.0172635316848755, "learning_rate": 4.382136141750376e-05, "loss": 0.7086, "num_input_tokens_seen": 23829480, "step": 41065 }, { "epoch": 6.117068811438784, "grad_norm": 1.007790207862854, "learning_rate": 4.381922253146771e-05, "loss": 0.577, "num_input_tokens_seen": 23831976, "step": 41070 }, { "epoch": 6.117813523979744, "grad_norm": 0.8524735569953918, "learning_rate": 4.381708332750002e-05, "loss": 0.5942, "num_input_tokens_seen": 23834824, "step": 41075 }, { "epoch": 6.118558236520703, "grad_norm": 0.9747980237007141, "learning_rate": 4.381494380563683e-05, "loss": 0.5918, "num_input_tokens_seen": 23837960, "step": 41080 }, { "epoch": 6.119302949061662, "grad_norm": 0.973006010055542, "learning_rate": 4.3812803965914296e-05, "loss": 0.7563, "num_input_tokens_seen": 23841000, "step": 41085 }, { "epoch": 6.120047661602621, "grad_norm": 1.3868449926376343, "learning_rate": 4.381066380836855e-05, "loss": 0.7371, "num_input_tokens_seen": 23843720, "step": 41090 }, { "epoch": 6.120792374143581, "grad_norm": 1.2359845638275146, "learning_rate": 4.380852333303576e-05, "loss": 0.5939, "num_input_tokens_seen": 23846888, "step": 41095 }, { "epoch": 6.1215370866845396, "grad_norm": 2.192685127258301, "learning_rate": 4.380638253995209e-05, "loss": 0.6658, "num_input_tokens_seen": 23849640, "step": 41100 }, { "epoch": 6.122281799225499, "grad_norm": 0.6733114719390869, "learning_rate": 4.380424142915369e-05, "loss": 0.5785, "num_input_tokens_seen": 23852808, "step": 41105 }, { "epoch": 6.123026511766458, "grad_norm": 1.0793074369430542, "learning_rate": 4.380210000067675e-05, "loss": 0.5524, "num_input_tokens_seen": 23855784, "step": 41110 }, { "epoch": 6.123771224307418, "grad_norm": 1.153267502784729, "learning_rate": 4.379995825455744e-05, "loss": 0.6248, "num_input_tokens_seen": 23858568, "step": 41115 }, { "epoch": 6.124515936848376, "grad_norm": 1.022326111793518, "learning_rate": 4.379781619083195e-05, "loss": 0.6904, "num_input_tokens_seen": 23861224, "step": 41120 }, { "epoch": 6.125260649389336, "grad_norm": 1.78959059715271, "learning_rate": 4.379567380953645e-05, "loss": 0.6071, "num_input_tokens_seen": 23864264, "step": 41125 }, { "epoch": 6.126005361930295, "grad_norm": 1.3089537620544434, "learning_rate": 4.3793531110707143e-05, "loss": 0.7681, "num_input_tokens_seen": 23866952, "step": 41130 }, { "epoch": 6.126750074471254, "grad_norm": 1.149535894393921, "learning_rate": 4.3791388094380236e-05, "loss": 0.4948, "num_input_tokens_seen": 23870056, "step": 41135 }, { "epoch": 6.127494787012213, "grad_norm": 1.2416458129882812, "learning_rate": 4.378924476059192e-05, "loss": 0.7195, "num_input_tokens_seen": 23873032, "step": 41140 }, { "epoch": 6.128239499553173, "grad_norm": 1.1582754850387573, "learning_rate": 4.378710110937842e-05, "loss": 0.5384, "num_input_tokens_seen": 23875880, "step": 41145 }, { "epoch": 6.128984212094132, "grad_norm": 0.6221857070922852, "learning_rate": 4.378495714077593e-05, "loss": 0.6269, "num_input_tokens_seen": 23878600, "step": 41150 }, { "epoch": 6.129728924635091, "grad_norm": 2.124617099761963, "learning_rate": 4.3782812854820687e-05, "loss": 0.5723, "num_input_tokens_seen": 23881544, "step": 41155 }, { "epoch": 6.13047363717605, "grad_norm": 0.6581215858459473, "learning_rate": 4.378066825154891e-05, "loss": 0.7615, "num_input_tokens_seen": 23884456, "step": 41160 }, { "epoch": 6.13121834971701, "grad_norm": 2.0095841884613037, "learning_rate": 4.3778523330996824e-05, "loss": 0.7169, "num_input_tokens_seen": 23887464, "step": 41165 }, { "epoch": 6.131963062257968, "grad_norm": 1.1040570735931396, "learning_rate": 4.377637809320068e-05, "loss": 0.5877, "num_input_tokens_seen": 23890280, "step": 41170 }, { "epoch": 6.132707774798928, "grad_norm": 1.915081262588501, "learning_rate": 4.377423253819671e-05, "loss": 0.7776, "num_input_tokens_seen": 23893768, "step": 41175 }, { "epoch": 6.133452487339887, "grad_norm": 0.9362480044364929, "learning_rate": 4.377208666602116e-05, "loss": 0.6161, "num_input_tokens_seen": 23896616, "step": 41180 }, { "epoch": 6.134197199880846, "grad_norm": 0.7160605192184448, "learning_rate": 4.3769940476710284e-05, "loss": 0.6149, "num_input_tokens_seen": 23899720, "step": 41185 }, { "epoch": 6.134941912421805, "grad_norm": 0.924107015132904, "learning_rate": 4.376779397030034e-05, "loss": 0.6101, "num_input_tokens_seen": 23902856, "step": 41190 }, { "epoch": 6.135686624962764, "grad_norm": 0.9089347124099731, "learning_rate": 4.376564714682761e-05, "loss": 0.6835, "num_input_tokens_seen": 23905736, "step": 41195 }, { "epoch": 6.136431337503724, "grad_norm": 1.0827993154525757, "learning_rate": 4.376350000632832e-05, "loss": 0.5693, "num_input_tokens_seen": 23908680, "step": 41200 }, { "epoch": 6.137176050044682, "grad_norm": 1.8623203039169312, "learning_rate": 4.376135254883877e-05, "loss": 0.4633, "num_input_tokens_seen": 23911368, "step": 41205 }, { "epoch": 6.137920762585642, "grad_norm": 1.2074708938598633, "learning_rate": 4.375920477439525e-05, "loss": 0.6842, "num_input_tokens_seen": 23914312, "step": 41210 }, { "epoch": 6.138665475126601, "grad_norm": 1.433287262916565, "learning_rate": 4.375705668303403e-05, "loss": 0.6841, "num_input_tokens_seen": 23917128, "step": 41215 }, { "epoch": 6.13941018766756, "grad_norm": 1.7448302507400513, "learning_rate": 4.37549082747914e-05, "loss": 0.7356, "num_input_tokens_seen": 23919624, "step": 41220 }, { "epoch": 6.140154900208519, "grad_norm": 1.6345504522323608, "learning_rate": 4.375275954970364e-05, "loss": 0.6495, "num_input_tokens_seen": 23922504, "step": 41225 }, { "epoch": 6.140899612749479, "grad_norm": 0.9797487258911133, "learning_rate": 4.3750610507807075e-05, "loss": 0.5888, "num_input_tokens_seen": 23925256, "step": 41230 }, { "epoch": 6.141644325290438, "grad_norm": 0.9982017874717712, "learning_rate": 4.3748461149138016e-05, "loss": 0.5585, "num_input_tokens_seen": 23928072, "step": 41235 }, { "epoch": 6.142389037831397, "grad_norm": 0.974097728729248, "learning_rate": 4.374631147373275e-05, "loss": 0.5864, "num_input_tokens_seen": 23931144, "step": 41240 }, { "epoch": 6.143133750372356, "grad_norm": 1.0270401239395142, "learning_rate": 4.374416148162761e-05, "loss": 0.5753, "num_input_tokens_seen": 23933736, "step": 41245 }, { "epoch": 6.143878462913316, "grad_norm": 1.654313325881958, "learning_rate": 4.374201117285891e-05, "loss": 0.5966, "num_input_tokens_seen": 23936712, "step": 41250 }, { "epoch": 6.144623175454274, "grad_norm": 0.7758767008781433, "learning_rate": 4.3739860547462976e-05, "loss": 0.521, "num_input_tokens_seen": 23939784, "step": 41255 }, { "epoch": 6.145367887995234, "grad_norm": 1.0040366649627686, "learning_rate": 4.373770960547614e-05, "loss": 0.5578, "num_input_tokens_seen": 23942664, "step": 41260 }, { "epoch": 6.146112600536193, "grad_norm": 1.8654595613479614, "learning_rate": 4.3735558346934755e-05, "loss": 0.7431, "num_input_tokens_seen": 23945608, "step": 41265 }, { "epoch": 6.146857313077152, "grad_norm": 1.2837711572647095, "learning_rate": 4.373340677187515e-05, "loss": 0.5182, "num_input_tokens_seen": 23948520, "step": 41270 }, { "epoch": 6.147602025618111, "grad_norm": 0.6324936747550964, "learning_rate": 4.373125488033368e-05, "loss": 0.6028, "num_input_tokens_seen": 23951912, "step": 41275 }, { "epoch": 6.148346738159071, "grad_norm": 0.768364667892456, "learning_rate": 4.372910267234669e-05, "loss": 0.5333, "num_input_tokens_seen": 23954696, "step": 41280 }, { "epoch": 6.14909145070003, "grad_norm": 1.1973437070846558, "learning_rate": 4.3726950147950554e-05, "loss": 0.5634, "num_input_tokens_seen": 23957448, "step": 41285 }, { "epoch": 6.149836163240989, "grad_norm": 1.8073546886444092, "learning_rate": 4.372479730718162e-05, "loss": 0.6201, "num_input_tokens_seen": 23960360, "step": 41290 }, { "epoch": 6.150580875781948, "grad_norm": 2.18566632270813, "learning_rate": 4.3722644150076275e-05, "loss": 0.6483, "num_input_tokens_seen": 23963176, "step": 41295 }, { "epoch": 6.151325588322908, "grad_norm": 1.0638633966445923, "learning_rate": 4.3720490676670886e-05, "loss": 0.6853, "num_input_tokens_seen": 23965896, "step": 41300 }, { "epoch": 6.152070300863866, "grad_norm": 1.184597134590149, "learning_rate": 4.371833688700182e-05, "loss": 0.5035, "num_input_tokens_seen": 23968840, "step": 41305 }, { "epoch": 6.152815013404826, "grad_norm": 1.5848616361618042, "learning_rate": 4.3716182781105484e-05, "loss": 0.7326, "num_input_tokens_seen": 23971784, "step": 41310 }, { "epoch": 6.153559725945785, "grad_norm": 1.752001166343689, "learning_rate": 4.3714028359018274e-05, "loss": 0.7112, "num_input_tokens_seen": 23974792, "step": 41315 }, { "epoch": 6.1543044384867445, "grad_norm": 1.9643868207931519, "learning_rate": 4.3711873620776566e-05, "loss": 0.6736, "num_input_tokens_seen": 23977608, "step": 41320 }, { "epoch": 6.155049151027703, "grad_norm": 1.430348515510559, "learning_rate": 4.370971856641677e-05, "loss": 0.6828, "num_input_tokens_seen": 23980328, "step": 41325 }, { "epoch": 6.155793863568663, "grad_norm": 0.7454615831375122, "learning_rate": 4.3707563195975296e-05, "loss": 0.4965, "num_input_tokens_seen": 23982984, "step": 41330 }, { "epoch": 6.156538576109622, "grad_norm": 1.2364704608917236, "learning_rate": 4.370540750948855e-05, "loss": 0.6327, "num_input_tokens_seen": 23986024, "step": 41335 }, { "epoch": 6.157283288650581, "grad_norm": 2.2080771923065186, "learning_rate": 4.370325150699296e-05, "loss": 0.7532, "num_input_tokens_seen": 23988808, "step": 41340 }, { "epoch": 6.15802800119154, "grad_norm": 1.1309831142425537, "learning_rate": 4.3701095188524943e-05, "loss": 0.6249, "num_input_tokens_seen": 23991528, "step": 41345 }, { "epoch": 6.1587727137325, "grad_norm": 1.5245639085769653, "learning_rate": 4.369893855412093e-05, "loss": 0.7919, "num_input_tokens_seen": 23994440, "step": 41350 }, { "epoch": 6.159517426273458, "grad_norm": 0.7961339950561523, "learning_rate": 4.369678160381736e-05, "loss": 0.6994, "num_input_tokens_seen": 23997320, "step": 41355 }, { "epoch": 6.160262138814417, "grad_norm": 1.4378490447998047, "learning_rate": 4.3694624337650656e-05, "loss": 0.5504, "num_input_tokens_seen": 24000296, "step": 41360 }, { "epoch": 6.161006851355377, "grad_norm": 1.8524060249328613, "learning_rate": 4.369246675565729e-05, "loss": 0.5937, "num_input_tokens_seen": 24003144, "step": 41365 }, { "epoch": 6.161751563896336, "grad_norm": 0.9836756587028503, "learning_rate": 4.369030885787369e-05, "loss": 0.7239, "num_input_tokens_seen": 24006376, "step": 41370 }, { "epoch": 6.162496276437295, "grad_norm": 1.5022510290145874, "learning_rate": 4.368815064433631e-05, "loss": 0.7646, "num_input_tokens_seen": 24009032, "step": 41375 }, { "epoch": 6.163240988978254, "grad_norm": 1.096718430519104, "learning_rate": 4.368599211508162e-05, "loss": 0.5398, "num_input_tokens_seen": 24011944, "step": 41380 }, { "epoch": 6.163985701519214, "grad_norm": 0.9709988236427307, "learning_rate": 4.3683833270146095e-05, "loss": 0.6181, "num_input_tokens_seen": 24014760, "step": 41385 }, { "epoch": 6.164730414060172, "grad_norm": 0.6954472064971924, "learning_rate": 4.368167410956619e-05, "loss": 0.6776, "num_input_tokens_seen": 24017864, "step": 41390 }, { "epoch": 6.165475126601132, "grad_norm": 2.09889554977417, "learning_rate": 4.367951463337839e-05, "loss": 0.7451, "num_input_tokens_seen": 24020744, "step": 41395 }, { "epoch": 6.166219839142091, "grad_norm": 1.1163463592529297, "learning_rate": 4.367735484161918e-05, "loss": 0.7379, "num_input_tokens_seen": 24023400, "step": 41400 }, { "epoch": 6.1669645516830505, "grad_norm": 0.7756564021110535, "learning_rate": 4.367519473432503e-05, "loss": 0.7739, "num_input_tokens_seen": 24026312, "step": 41405 }, { "epoch": 6.167709264224009, "grad_norm": 1.413719892501831, "learning_rate": 4.367303431153245e-05, "loss": 0.5893, "num_input_tokens_seen": 24029160, "step": 41410 }, { "epoch": 6.168453976764969, "grad_norm": 1.075868010520935, "learning_rate": 4.367087357327794e-05, "loss": 0.5386, "num_input_tokens_seen": 24032392, "step": 41415 }, { "epoch": 6.169198689305928, "grad_norm": 1.1283395290374756, "learning_rate": 4.366871251959799e-05, "loss": 0.6945, "num_input_tokens_seen": 24035784, "step": 41420 }, { "epoch": 6.169943401846887, "grad_norm": 1.4220247268676758, "learning_rate": 4.3666551150529124e-05, "loss": 0.5892, "num_input_tokens_seen": 24038696, "step": 41425 }, { "epoch": 6.170688114387846, "grad_norm": 1.0032634735107422, "learning_rate": 4.366438946610784e-05, "loss": 0.7457, "num_input_tokens_seen": 24041672, "step": 41430 }, { "epoch": 6.171432826928806, "grad_norm": 1.7239999771118164, "learning_rate": 4.366222746637067e-05, "loss": 0.6025, "num_input_tokens_seen": 24044456, "step": 41435 }, { "epoch": 6.172177539469764, "grad_norm": 0.9578257203102112, "learning_rate": 4.366006515135413e-05, "loss": 0.6525, "num_input_tokens_seen": 24047240, "step": 41440 }, { "epoch": 6.172922252010724, "grad_norm": 1.0985075235366821, "learning_rate": 4.3657902521094764e-05, "loss": 0.7781, "num_input_tokens_seen": 24050248, "step": 41445 }, { "epoch": 6.173666964551683, "grad_norm": 2.1588430404663086, "learning_rate": 4.365573957562909e-05, "loss": 0.687, "num_input_tokens_seen": 24053128, "step": 41450 }, { "epoch": 6.1744116770926425, "grad_norm": 1.2725749015808105, "learning_rate": 4.365357631499366e-05, "loss": 0.5591, "num_input_tokens_seen": 24055976, "step": 41455 }, { "epoch": 6.175156389633601, "grad_norm": 1.9209216833114624, "learning_rate": 4.365141273922502e-05, "loss": 0.6439, "num_input_tokens_seen": 24059176, "step": 41460 }, { "epoch": 6.175901102174561, "grad_norm": 0.9751908183097839, "learning_rate": 4.3649248848359706e-05, "loss": 0.7764, "num_input_tokens_seen": 24062056, "step": 41465 }, { "epoch": 6.17664581471552, "grad_norm": 1.2502940893173218, "learning_rate": 4.36470846424343e-05, "loss": 0.6461, "num_input_tokens_seen": 24064616, "step": 41470 }, { "epoch": 6.177390527256479, "grad_norm": 1.3860780000686646, "learning_rate": 4.364492012148534e-05, "loss": 0.7372, "num_input_tokens_seen": 24067848, "step": 41475 }, { "epoch": 6.178135239797438, "grad_norm": 0.973241925239563, "learning_rate": 4.364275528554941e-05, "loss": 0.6219, "num_input_tokens_seen": 24070536, "step": 41480 }, { "epoch": 6.178879952338398, "grad_norm": 1.2607347965240479, "learning_rate": 4.3640590134663076e-05, "loss": 0.5423, "num_input_tokens_seen": 24073320, "step": 41485 }, { "epoch": 6.1796246648793565, "grad_norm": 1.1033227443695068, "learning_rate": 4.363842466886292e-05, "loss": 0.6713, "num_input_tokens_seen": 24076232, "step": 41490 }, { "epoch": 6.180369377420316, "grad_norm": 1.6128932237625122, "learning_rate": 4.363625888818552e-05, "loss": 0.589, "num_input_tokens_seen": 24079112, "step": 41495 }, { "epoch": 6.181114089961275, "grad_norm": 2.2587926387786865, "learning_rate": 4.363409279266747e-05, "loss": 0.7048, "num_input_tokens_seen": 24082120, "step": 41500 }, { "epoch": 6.1818588025022345, "grad_norm": 1.3360697031021118, "learning_rate": 4.3631926382345356e-05, "loss": 0.6499, "num_input_tokens_seen": 24084936, "step": 41505 }, { "epoch": 6.182603515043193, "grad_norm": 0.8937350511550903, "learning_rate": 4.3629759657255786e-05, "loss": 0.5648, "num_input_tokens_seen": 24087944, "step": 41510 }, { "epoch": 6.183348227584153, "grad_norm": 1.6430895328521729, "learning_rate": 4.3627592617435363e-05, "loss": 0.738, "num_input_tokens_seen": 24090632, "step": 41515 }, { "epoch": 6.184092940125112, "grad_norm": 1.1520678997039795, "learning_rate": 4.362542526292069e-05, "loss": 0.7029, "num_input_tokens_seen": 24093736, "step": 41520 }, { "epoch": 6.18483765266607, "grad_norm": 1.074057936668396, "learning_rate": 4.362325759374839e-05, "loss": 0.7743, "num_input_tokens_seen": 24096744, "step": 41525 }, { "epoch": 6.18558236520703, "grad_norm": 1.171977162361145, "learning_rate": 4.3621089609955084e-05, "loss": 0.7209, "num_input_tokens_seen": 24099624, "step": 41530 }, { "epoch": 6.18632707774799, "grad_norm": 1.2187048196792603, "learning_rate": 4.3618921311577384e-05, "loss": 0.7824, "num_input_tokens_seen": 24102600, "step": 41535 }, { "epoch": 6.1870717902889485, "grad_norm": 1.161811113357544, "learning_rate": 4.361675269865194e-05, "loss": 0.7213, "num_input_tokens_seen": 24105640, "step": 41540 }, { "epoch": 6.187816502829907, "grad_norm": 0.6426686644554138, "learning_rate": 4.361458377121538e-05, "loss": 0.5748, "num_input_tokens_seen": 24108392, "step": 41545 }, { "epoch": 6.188561215370867, "grad_norm": 0.8845385313034058, "learning_rate": 4.3612414529304344e-05, "loss": 0.7698, "num_input_tokens_seen": 24111304, "step": 41550 }, { "epoch": 6.189305927911826, "grad_norm": 1.5359448194503784, "learning_rate": 4.3610244972955486e-05, "loss": 0.5441, "num_input_tokens_seen": 24114280, "step": 41555 }, { "epoch": 6.190050640452785, "grad_norm": 1.00125253200531, "learning_rate": 4.3608075102205454e-05, "loss": 0.6211, "num_input_tokens_seen": 24117448, "step": 41560 }, { "epoch": 6.190795352993744, "grad_norm": 1.323142647743225, "learning_rate": 4.36059049170909e-05, "loss": 0.6479, "num_input_tokens_seen": 24120040, "step": 41565 }, { "epoch": 6.191540065534704, "grad_norm": 1.0140498876571655, "learning_rate": 4.36037344176485e-05, "loss": 0.7298, "num_input_tokens_seen": 24123336, "step": 41570 }, { "epoch": 6.1922847780756625, "grad_norm": 1.2647368907928467, "learning_rate": 4.3601563603914906e-05, "loss": 0.582, "num_input_tokens_seen": 24125896, "step": 41575 }, { "epoch": 6.193029490616622, "grad_norm": 0.9653440713882446, "learning_rate": 4.3599392475926806e-05, "loss": 0.6933, "num_input_tokens_seen": 24129032, "step": 41580 }, { "epoch": 6.193774203157581, "grad_norm": 0.9909411668777466, "learning_rate": 4.359722103372087e-05, "loss": 0.5913, "num_input_tokens_seen": 24132072, "step": 41585 }, { "epoch": 6.1945189156985405, "grad_norm": 1.0627455711364746, "learning_rate": 4.3595049277333785e-05, "loss": 0.5648, "num_input_tokens_seen": 24134792, "step": 41590 }, { "epoch": 6.195263628239499, "grad_norm": 1.1196489334106445, "learning_rate": 4.359287720680225e-05, "loss": 0.5173, "num_input_tokens_seen": 24137416, "step": 41595 }, { "epoch": 6.196008340780459, "grad_norm": 1.109363317489624, "learning_rate": 4.359070482216295e-05, "loss": 0.5285, "num_input_tokens_seen": 24140264, "step": 41600 }, { "epoch": 6.196753053321418, "grad_norm": 0.9436336755752563, "learning_rate": 4.358853212345258e-05, "loss": 0.5957, "num_input_tokens_seen": 24143624, "step": 41605 }, { "epoch": 6.197497765862377, "grad_norm": 0.9120582342147827, "learning_rate": 4.358635911070785e-05, "loss": 0.5089, "num_input_tokens_seen": 24146696, "step": 41610 }, { "epoch": 6.198242478403336, "grad_norm": 1.0839834213256836, "learning_rate": 4.3584185783965484e-05, "loss": 0.5536, "num_input_tokens_seen": 24149448, "step": 41615 }, { "epoch": 6.198987190944296, "grad_norm": 1.0872466564178467, "learning_rate": 4.358201214326218e-05, "loss": 0.5925, "num_input_tokens_seen": 24152232, "step": 41620 }, { "epoch": 6.1997319034852545, "grad_norm": 1.200663447380066, "learning_rate": 4.357983818863467e-05, "loss": 0.8453, "num_input_tokens_seen": 24154984, "step": 41625 }, { "epoch": 6.200476616026214, "grad_norm": 1.1559860706329346, "learning_rate": 4.357766392011968e-05, "loss": 0.5246, "num_input_tokens_seen": 24157800, "step": 41630 }, { "epoch": 6.201221328567173, "grad_norm": 0.5128803849220276, "learning_rate": 4.357548933775393e-05, "loss": 0.6089, "num_input_tokens_seen": 24160552, "step": 41635 }, { "epoch": 6.2019660411081325, "grad_norm": 1.127081274986267, "learning_rate": 4.3573314441574176e-05, "loss": 0.5542, "num_input_tokens_seen": 24163304, "step": 41640 }, { "epoch": 6.202710753649091, "grad_norm": 1.107667088508606, "learning_rate": 4.357113923161715e-05, "loss": 0.5907, "num_input_tokens_seen": 24166600, "step": 41645 }, { "epoch": 6.203455466190051, "grad_norm": 1.1501834392547607, "learning_rate": 4.35689637079196e-05, "loss": 0.6551, "num_input_tokens_seen": 24169704, "step": 41650 }, { "epoch": 6.20420017873101, "grad_norm": 1.3628886938095093, "learning_rate": 4.356678787051828e-05, "loss": 0.8229, "num_input_tokens_seen": 24172392, "step": 41655 }, { "epoch": 6.204944891271969, "grad_norm": 1.203999638557434, "learning_rate": 4.356461171944994e-05, "loss": 0.6707, "num_input_tokens_seen": 24175656, "step": 41660 }, { "epoch": 6.205689603812928, "grad_norm": 0.791039228439331, "learning_rate": 4.356243525475137e-05, "loss": 0.7902, "num_input_tokens_seen": 24178760, "step": 41665 }, { "epoch": 6.206434316353888, "grad_norm": 1.5046221017837524, "learning_rate": 4.3560258476459315e-05, "loss": 0.6453, "num_input_tokens_seen": 24181448, "step": 41670 }, { "epoch": 6.2071790288948465, "grad_norm": 0.8634000420570374, "learning_rate": 4.355808138461056e-05, "loss": 0.5845, "num_input_tokens_seen": 24184456, "step": 41675 }, { "epoch": 6.207923741435806, "grad_norm": 2.2402184009552, "learning_rate": 4.355590397924188e-05, "loss": 0.5057, "num_input_tokens_seen": 24187400, "step": 41680 }, { "epoch": 6.208668453976765, "grad_norm": 0.6812790036201477, "learning_rate": 4.355372626039006e-05, "loss": 0.5559, "num_input_tokens_seen": 24190440, "step": 41685 }, { "epoch": 6.209413166517725, "grad_norm": 0.8179552555084229, "learning_rate": 4.355154822809189e-05, "loss": 0.4747, "num_input_tokens_seen": 24193160, "step": 41690 }, { "epoch": 6.210157879058683, "grad_norm": 3.006948471069336, "learning_rate": 4.3549369882384174e-05, "loss": 0.7447, "num_input_tokens_seen": 24196008, "step": 41695 }, { "epoch": 6.210902591599643, "grad_norm": 3.0072765350341797, "learning_rate": 4.35471912233037e-05, "loss": 0.7653, "num_input_tokens_seen": 24198792, "step": 41700 }, { "epoch": 6.211647304140602, "grad_norm": 0.7834905385971069, "learning_rate": 4.3545012250887286e-05, "loss": 0.679, "num_input_tokens_seen": 24201832, "step": 41705 }, { "epoch": 6.2123920166815605, "grad_norm": 0.8967047929763794, "learning_rate": 4.354283296517173e-05, "loss": 0.7841, "num_input_tokens_seen": 24204648, "step": 41710 }, { "epoch": 6.21313672922252, "grad_norm": 1.1628055572509766, "learning_rate": 4.354065336619387e-05, "loss": 0.5532, "num_input_tokens_seen": 24207656, "step": 41715 }, { "epoch": 6.213881441763479, "grad_norm": 1.1896414756774902, "learning_rate": 4.3538473453990506e-05, "loss": 0.6676, "num_input_tokens_seen": 24210440, "step": 41720 }, { "epoch": 6.2146261543044385, "grad_norm": 0.9566412568092346, "learning_rate": 4.353629322859848e-05, "loss": 0.5443, "num_input_tokens_seen": 24213512, "step": 41725 }, { "epoch": 6.215370866845397, "grad_norm": 1.4233511686325073, "learning_rate": 4.353411269005462e-05, "loss": 0.7962, "num_input_tokens_seen": 24216584, "step": 41730 }, { "epoch": 6.216115579386357, "grad_norm": 0.9379373788833618, "learning_rate": 4.353193183839576e-05, "loss": 0.7184, "num_input_tokens_seen": 24219560, "step": 41735 }, { "epoch": 6.216860291927316, "grad_norm": 1.121453046798706, "learning_rate": 4.352975067365874e-05, "loss": 0.6756, "num_input_tokens_seen": 24222216, "step": 41740 }, { "epoch": 6.217605004468275, "grad_norm": 1.2875947952270508, "learning_rate": 4.352756919588042e-05, "loss": 0.6581, "num_input_tokens_seen": 24225064, "step": 41745 }, { "epoch": 6.218349717009234, "grad_norm": 0.9532626271247864, "learning_rate": 4.3525387405097654e-05, "loss": 0.4846, "num_input_tokens_seen": 24228232, "step": 41750 }, { "epoch": 6.219094429550194, "grad_norm": 1.3132871389389038, "learning_rate": 4.352320530134729e-05, "loss": 0.654, "num_input_tokens_seen": 24231208, "step": 41755 }, { "epoch": 6.2198391420911525, "grad_norm": 1.4172213077545166, "learning_rate": 4.35210228846662e-05, "loss": 0.6739, "num_input_tokens_seen": 24233800, "step": 41760 }, { "epoch": 6.220583854632112, "grad_norm": 1.0632699728012085, "learning_rate": 4.3518840155091255e-05, "loss": 0.6758, "num_input_tokens_seen": 24236840, "step": 41765 }, { "epoch": 6.221328567173071, "grad_norm": 1.0480690002441406, "learning_rate": 4.351665711265933e-05, "loss": 0.5389, "num_input_tokens_seen": 24239688, "step": 41770 }, { "epoch": 6.222073279714031, "grad_norm": 1.3028712272644043, "learning_rate": 4.351447375740729e-05, "loss": 0.5814, "num_input_tokens_seen": 24242696, "step": 41775 }, { "epoch": 6.222817992254989, "grad_norm": 0.7949910163879395, "learning_rate": 4.351229008937205e-05, "loss": 0.6962, "num_input_tokens_seen": 24245576, "step": 41780 }, { "epoch": 6.223562704795949, "grad_norm": 0.9269932508468628, "learning_rate": 4.3510106108590476e-05, "loss": 0.621, "num_input_tokens_seen": 24248712, "step": 41785 }, { "epoch": 6.224307417336908, "grad_norm": 1.0444254875183105, "learning_rate": 4.350792181509947e-05, "loss": 0.7737, "num_input_tokens_seen": 24251624, "step": 41790 }, { "epoch": 6.225052129877867, "grad_norm": 1.1503266096115112, "learning_rate": 4.350573720893594e-05, "loss": 0.5838, "num_input_tokens_seen": 24254600, "step": 41795 }, { "epoch": 6.225796842418826, "grad_norm": 1.3002965450286865, "learning_rate": 4.350355229013679e-05, "loss": 0.6762, "num_input_tokens_seen": 24257512, "step": 41800 }, { "epoch": 6.226541554959786, "grad_norm": 1.3650778532028198, "learning_rate": 4.3501367058738916e-05, "loss": 0.6648, "num_input_tokens_seen": 24260296, "step": 41805 }, { "epoch": 6.2272862675007445, "grad_norm": 0.7413528561592102, "learning_rate": 4.3499181514779266e-05, "loss": 0.6184, "num_input_tokens_seen": 24263080, "step": 41810 }, { "epoch": 6.228030980041704, "grad_norm": 1.3581091165542603, "learning_rate": 4.3496995658294735e-05, "loss": 0.7268, "num_input_tokens_seen": 24265928, "step": 41815 }, { "epoch": 6.228775692582663, "grad_norm": 1.3334802389144897, "learning_rate": 4.349480948932226e-05, "loss": 0.7697, "num_input_tokens_seen": 24269128, "step": 41820 }, { "epoch": 6.229520405123623, "grad_norm": 1.6514322757720947, "learning_rate": 4.3492623007898786e-05, "loss": 0.5708, "num_input_tokens_seen": 24271912, "step": 41825 }, { "epoch": 6.230265117664581, "grad_norm": 0.8175675868988037, "learning_rate": 4.3490436214061236e-05, "loss": 0.5894, "num_input_tokens_seen": 24275656, "step": 41830 }, { "epoch": 6.231009830205541, "grad_norm": 0.6824191808700562, "learning_rate": 4.348824910784656e-05, "loss": 0.5756, "num_input_tokens_seen": 24278248, "step": 41835 }, { "epoch": 6.2317545427465, "grad_norm": 1.4622581005096436, "learning_rate": 4.348606168929171e-05, "loss": 0.6083, "num_input_tokens_seen": 24281288, "step": 41840 }, { "epoch": 6.232499255287459, "grad_norm": 1.2960084676742554, "learning_rate": 4.348387395843363e-05, "loss": 0.5257, "num_input_tokens_seen": 24284040, "step": 41845 }, { "epoch": 6.233243967828418, "grad_norm": 1.5692228078842163, "learning_rate": 4.348168591530929e-05, "loss": 0.6796, "num_input_tokens_seen": 24287144, "step": 41850 }, { "epoch": 6.233988680369378, "grad_norm": 0.991462230682373, "learning_rate": 4.3479497559955654e-05, "loss": 0.5024, "num_input_tokens_seen": 24290120, "step": 41855 }, { "epoch": 6.234733392910337, "grad_norm": 2.0775058269500732, "learning_rate": 4.347730889240968e-05, "loss": 0.6323, "num_input_tokens_seen": 24293096, "step": 41860 }, { "epoch": 6.235478105451296, "grad_norm": 1.0442954301834106, "learning_rate": 4.347511991270835e-05, "loss": 0.6255, "num_input_tokens_seen": 24295944, "step": 41865 }, { "epoch": 6.236222817992255, "grad_norm": 0.8247158527374268, "learning_rate": 4.347293062088865e-05, "loss": 0.7037, "num_input_tokens_seen": 24298952, "step": 41870 }, { "epoch": 6.236967530533214, "grad_norm": 1.885032057762146, "learning_rate": 4.3470741016987574e-05, "loss": 0.7145, "num_input_tokens_seen": 24301768, "step": 41875 }, { "epoch": 6.237712243074173, "grad_norm": 1.1109527349472046, "learning_rate": 4.3468551101042084e-05, "loss": 0.611, "num_input_tokens_seen": 24304776, "step": 41880 }, { "epoch": 6.238456955615132, "grad_norm": 2.233914375305176, "learning_rate": 4.3466360873089204e-05, "loss": 0.665, "num_input_tokens_seen": 24307784, "step": 41885 }, { "epoch": 6.239201668156092, "grad_norm": 1.3228529691696167, "learning_rate": 4.346417033316592e-05, "loss": 0.6739, "num_input_tokens_seen": 24310856, "step": 41890 }, { "epoch": 6.2399463806970505, "grad_norm": 0.9303932189941406, "learning_rate": 4.346197948130925e-05, "loss": 0.6417, "num_input_tokens_seen": 24314088, "step": 41895 }, { "epoch": 6.24069109323801, "grad_norm": 1.1387540102005005, "learning_rate": 4.34597883175562e-05, "loss": 0.6058, "num_input_tokens_seen": 24317192, "step": 41900 }, { "epoch": 6.241435805778969, "grad_norm": 1.2410826683044434, "learning_rate": 4.3457596841943775e-05, "loss": 0.7887, "num_input_tokens_seen": 24319848, "step": 41905 }, { "epoch": 6.242180518319929, "grad_norm": 1.309171438217163, "learning_rate": 4.345540505450902e-05, "loss": 0.5688, "num_input_tokens_seen": 24322824, "step": 41910 }, { "epoch": 6.242925230860887, "grad_norm": 1.1652923822402954, "learning_rate": 4.345321295528896e-05, "loss": 0.6817, "num_input_tokens_seen": 24325736, "step": 41915 }, { "epoch": 6.243669943401847, "grad_norm": 1.6434097290039062, "learning_rate": 4.345102054432061e-05, "loss": 0.6813, "num_input_tokens_seen": 24328360, "step": 41920 }, { "epoch": 6.244414655942806, "grad_norm": 1.2693030834197998, "learning_rate": 4.344882782164103e-05, "loss": 0.6485, "num_input_tokens_seen": 24331848, "step": 41925 }, { "epoch": 6.245159368483765, "grad_norm": 1.7644044160842896, "learning_rate": 4.344663478728725e-05, "loss": 0.7682, "num_input_tokens_seen": 24334664, "step": 41930 }, { "epoch": 6.245904081024724, "grad_norm": 0.8653023838996887, "learning_rate": 4.3444441441296324e-05, "loss": 0.564, "num_input_tokens_seen": 24337640, "step": 41935 }, { "epoch": 6.246648793565684, "grad_norm": 1.9994324445724487, "learning_rate": 4.344224778370531e-05, "loss": 0.7079, "num_input_tokens_seen": 24340712, "step": 41940 }, { "epoch": 6.247393506106643, "grad_norm": 1.0996540784835815, "learning_rate": 4.344005381455126e-05, "loss": 0.665, "num_input_tokens_seen": 24343560, "step": 41945 }, { "epoch": 6.248138218647602, "grad_norm": 1.1739822626113892, "learning_rate": 4.343785953387125e-05, "loss": 0.5692, "num_input_tokens_seen": 24346568, "step": 41950 }, { "epoch": 6.248882931188561, "grad_norm": 1.1483407020568848, "learning_rate": 4.343566494170233e-05, "loss": 0.6766, "num_input_tokens_seen": 24349256, "step": 41955 }, { "epoch": 6.249627643729521, "grad_norm": 0.7917089462280273, "learning_rate": 4.34334700380816e-05, "loss": 0.7177, "num_input_tokens_seen": 24352040, "step": 41960 }, { "epoch": 6.250372356270479, "grad_norm": 1.1708509922027588, "learning_rate": 4.343127482304612e-05, "loss": 0.7988, "num_input_tokens_seen": 24355176, "step": 41965 }, { "epoch": 6.251117068811439, "grad_norm": 1.3629833459854126, "learning_rate": 4.342907929663299e-05, "loss": 0.6475, "num_input_tokens_seen": 24358056, "step": 41970 }, { "epoch": 6.251861781352398, "grad_norm": 0.7360957860946655, "learning_rate": 4.342688345887929e-05, "loss": 0.6811, "num_input_tokens_seen": 24361192, "step": 41975 }, { "epoch": 6.252606493893357, "grad_norm": 1.3366482257843018, "learning_rate": 4.342468730982212e-05, "loss": 0.7338, "num_input_tokens_seen": 24364168, "step": 41980 }, { "epoch": 6.253351206434316, "grad_norm": 1.6189899444580078, "learning_rate": 4.342249084949859e-05, "loss": 0.6581, "num_input_tokens_seen": 24367144, "step": 41985 }, { "epoch": 6.254095918975276, "grad_norm": 2.01023530960083, "learning_rate": 4.34202940779458e-05, "loss": 0.6288, "num_input_tokens_seen": 24369992, "step": 41990 }, { "epoch": 6.254840631516235, "grad_norm": 0.9349178075790405, "learning_rate": 4.341809699520086e-05, "loss": 0.6937, "num_input_tokens_seen": 24373480, "step": 41995 }, { "epoch": 6.255585344057194, "grad_norm": 1.0218485593795776, "learning_rate": 4.34158996013009e-05, "loss": 0.6153, "num_input_tokens_seen": 24376520, "step": 42000 }, { "epoch": 6.256330056598153, "grad_norm": 0.7711340188980103, "learning_rate": 4.3413701896283024e-05, "loss": 0.7732, "num_input_tokens_seen": 24379752, "step": 42005 }, { "epoch": 6.257074769139113, "grad_norm": 0.6334992051124573, "learning_rate": 4.341150388018437e-05, "loss": 0.6293, "num_input_tokens_seen": 24382728, "step": 42010 }, { "epoch": 6.257819481680071, "grad_norm": 1.3079198598861694, "learning_rate": 4.340930555304208e-05, "loss": 0.5951, "num_input_tokens_seen": 24385704, "step": 42015 }, { "epoch": 6.258564194221031, "grad_norm": 1.0001354217529297, "learning_rate": 4.340710691489327e-05, "loss": 0.5842, "num_input_tokens_seen": 24388232, "step": 42020 }, { "epoch": 6.25930890676199, "grad_norm": 1.0759600400924683, "learning_rate": 4.340490796577511e-05, "loss": 0.5926, "num_input_tokens_seen": 24391112, "step": 42025 }, { "epoch": 6.2600536193029495, "grad_norm": 0.9150886535644531, "learning_rate": 4.340270870572472e-05, "loss": 0.4956, "num_input_tokens_seen": 24393928, "step": 42030 }, { "epoch": 6.260798331843908, "grad_norm": 1.0035778284072876, "learning_rate": 4.340050913477928e-05, "loss": 0.638, "num_input_tokens_seen": 24397000, "step": 42035 }, { "epoch": 6.261543044384867, "grad_norm": 0.8667522668838501, "learning_rate": 4.339830925297594e-05, "loss": 0.6557, "num_input_tokens_seen": 24399880, "step": 42040 }, { "epoch": 6.262287756925827, "grad_norm": 1.2081377506256104, "learning_rate": 4.3396109060351864e-05, "loss": 0.6416, "num_input_tokens_seen": 24402664, "step": 42045 }, { "epoch": 6.263032469466786, "grad_norm": 1.2945457696914673, "learning_rate": 4.339390855694422e-05, "loss": 0.595, "num_input_tokens_seen": 24405640, "step": 42050 }, { "epoch": 6.263777182007745, "grad_norm": 0.8239720463752747, "learning_rate": 4.339170774279019e-05, "loss": 0.5205, "num_input_tokens_seen": 24409128, "step": 42055 }, { "epoch": 6.264521894548704, "grad_norm": 1.8662794828414917, "learning_rate": 4.3389506617926945e-05, "loss": 0.6438, "num_input_tokens_seen": 24411816, "step": 42060 }, { "epoch": 6.265266607089663, "grad_norm": 1.1449083089828491, "learning_rate": 4.3387305182391677e-05, "loss": 0.6401, "num_input_tokens_seen": 24414536, "step": 42065 }, { "epoch": 6.266011319630622, "grad_norm": 1.0464329719543457, "learning_rate": 4.3385103436221575e-05, "loss": 0.8124, "num_input_tokens_seen": 24417192, "step": 42070 }, { "epoch": 6.266756032171582, "grad_norm": 1.1301822662353516, "learning_rate": 4.338290137945384e-05, "loss": 0.7688, "num_input_tokens_seen": 24419816, "step": 42075 }, { "epoch": 6.267500744712541, "grad_norm": 1.0673372745513916, "learning_rate": 4.338069901212567e-05, "loss": 0.8548, "num_input_tokens_seen": 24422728, "step": 42080 }, { "epoch": 6.2682454572535, "grad_norm": 0.8951984643936157, "learning_rate": 4.337849633427427e-05, "loss": 0.4671, "num_input_tokens_seen": 24425448, "step": 42085 }, { "epoch": 6.268990169794459, "grad_norm": 3.134641408920288, "learning_rate": 4.337629334593685e-05, "loss": 0.7324, "num_input_tokens_seen": 24428488, "step": 42090 }, { "epoch": 6.269734882335419, "grad_norm": 0.8008860945701599, "learning_rate": 4.337409004715063e-05, "loss": 0.6818, "num_input_tokens_seen": 24431720, "step": 42095 }, { "epoch": 6.270479594876377, "grad_norm": 2.6178879737854004, "learning_rate": 4.337188643795284e-05, "loss": 0.7036, "num_input_tokens_seen": 24434440, "step": 42100 }, { "epoch": 6.271224307417337, "grad_norm": 1.5474194288253784, "learning_rate": 4.33696825183807e-05, "loss": 0.4225, "num_input_tokens_seen": 24437352, "step": 42105 }, { "epoch": 6.271969019958296, "grad_norm": 1.59037184715271, "learning_rate": 4.3367478288471444e-05, "loss": 0.6371, "num_input_tokens_seen": 24440136, "step": 42110 }, { "epoch": 6.2727137324992555, "grad_norm": 1.174957275390625, "learning_rate": 4.33652737482623e-05, "loss": 0.6643, "num_input_tokens_seen": 24443144, "step": 42115 }, { "epoch": 6.273458445040214, "grad_norm": 1.6918928623199463, "learning_rate": 4.336306889779054e-05, "loss": 0.7755, "num_input_tokens_seen": 24445896, "step": 42120 }, { "epoch": 6.274203157581174, "grad_norm": 0.9295151233673096, "learning_rate": 4.3360863737093375e-05, "loss": 0.6649, "num_input_tokens_seen": 24449160, "step": 42125 }, { "epoch": 6.274947870122133, "grad_norm": 1.1483854055404663, "learning_rate": 4.335865826620809e-05, "loss": 0.5993, "num_input_tokens_seen": 24452168, "step": 42130 }, { "epoch": 6.275692582663092, "grad_norm": 1.1959983110427856, "learning_rate": 4.335645248517193e-05, "loss": 0.589, "num_input_tokens_seen": 24455048, "step": 42135 }, { "epoch": 6.276437295204051, "grad_norm": 1.1036877632141113, "learning_rate": 4.335424639402216e-05, "loss": 0.6666, "num_input_tokens_seen": 24457800, "step": 42140 }, { "epoch": 6.277182007745011, "grad_norm": 1.5653444528579712, "learning_rate": 4.3352039992796056e-05, "loss": 0.865, "num_input_tokens_seen": 24460520, "step": 42145 }, { "epoch": 6.277926720285969, "grad_norm": 1.7999564409255981, "learning_rate": 4.334983328153088e-05, "loss": 0.7057, "num_input_tokens_seen": 24463208, "step": 42150 }, { "epoch": 6.278671432826929, "grad_norm": 1.289085030555725, "learning_rate": 4.334762626026393e-05, "loss": 0.6579, "num_input_tokens_seen": 24466184, "step": 42155 }, { "epoch": 6.279416145367888, "grad_norm": 1.2262067794799805, "learning_rate": 4.3345418929032475e-05, "loss": 0.6471, "num_input_tokens_seen": 24469256, "step": 42160 }, { "epoch": 6.2801608579088475, "grad_norm": 1.4526081085205078, "learning_rate": 4.334321128787382e-05, "loss": 0.7188, "num_input_tokens_seen": 24472104, "step": 42165 }, { "epoch": 6.280905570449806, "grad_norm": 1.2359447479248047, "learning_rate": 4.3341003336825246e-05, "loss": 0.6311, "num_input_tokens_seen": 24474728, "step": 42170 }, { "epoch": 6.281650282990766, "grad_norm": 1.2111310958862305, "learning_rate": 4.333879507592407e-05, "loss": 0.7911, "num_input_tokens_seen": 24477480, "step": 42175 }, { "epoch": 6.282394995531725, "grad_norm": 0.9412544965744019, "learning_rate": 4.3336586505207587e-05, "loss": 0.7419, "num_input_tokens_seen": 24480424, "step": 42180 }, { "epoch": 6.283139708072684, "grad_norm": 1.0512890815734863, "learning_rate": 4.3334377624713104e-05, "loss": 0.7077, "num_input_tokens_seen": 24483560, "step": 42185 }, { "epoch": 6.283884420613643, "grad_norm": 1.5389398336410522, "learning_rate": 4.333216843447795e-05, "loss": 0.7704, "num_input_tokens_seen": 24486504, "step": 42190 }, { "epoch": 6.284629133154603, "grad_norm": 1.0432853698730469, "learning_rate": 4.332995893453945e-05, "loss": 0.5526, "num_input_tokens_seen": 24489544, "step": 42195 }, { "epoch": 6.2853738456955615, "grad_norm": 0.802147388458252, "learning_rate": 4.3327749124934916e-05, "loss": 0.4981, "num_input_tokens_seen": 24492456, "step": 42200 }, { "epoch": 6.286118558236521, "grad_norm": 0.9844998121261597, "learning_rate": 4.332553900570169e-05, "loss": 0.7523, "num_input_tokens_seen": 24495368, "step": 42205 }, { "epoch": 6.28686327077748, "grad_norm": 1.2200409173965454, "learning_rate": 4.3323328576877104e-05, "loss": 0.7235, "num_input_tokens_seen": 24498344, "step": 42210 }, { "epoch": 6.2876079833184395, "grad_norm": 2.13942813873291, "learning_rate": 4.33211178384985e-05, "loss": 0.5272, "num_input_tokens_seen": 24501480, "step": 42215 }, { "epoch": 6.288352695859398, "grad_norm": 1.397416591644287, "learning_rate": 4.331890679060324e-05, "loss": 0.7353, "num_input_tokens_seen": 24504456, "step": 42220 }, { "epoch": 6.289097408400357, "grad_norm": 0.5880866050720215, "learning_rate": 4.331669543322867e-05, "loss": 0.5366, "num_input_tokens_seen": 24507272, "step": 42225 }, { "epoch": 6.289842120941317, "grad_norm": 1.139837384223938, "learning_rate": 4.331448376641214e-05, "loss": 0.6652, "num_input_tokens_seen": 24510216, "step": 42230 }, { "epoch": 6.290586833482275, "grad_norm": 1.3524881601333618, "learning_rate": 4.331227179019103e-05, "loss": 0.5462, "num_input_tokens_seen": 24512872, "step": 42235 }, { "epoch": 6.291331546023235, "grad_norm": 0.7646045684814453, "learning_rate": 4.3310059504602685e-05, "loss": 0.5026, "num_input_tokens_seen": 24515592, "step": 42240 }, { "epoch": 6.292076258564194, "grad_norm": 0.8626927137374878, "learning_rate": 4.330784690968451e-05, "loss": 0.5683, "num_input_tokens_seen": 24518504, "step": 42245 }, { "epoch": 6.2928209711051535, "grad_norm": 1.134803295135498, "learning_rate": 4.330563400547386e-05, "loss": 0.465, "num_input_tokens_seen": 24521480, "step": 42250 }, { "epoch": 6.293565683646112, "grad_norm": 1.0130877494812012, "learning_rate": 4.330342079200813e-05, "loss": 0.4631, "num_input_tokens_seen": 24524328, "step": 42255 }, { "epoch": 6.294310396187072, "grad_norm": 1.4253216981887817, "learning_rate": 4.330120726932471e-05, "loss": 0.6151, "num_input_tokens_seen": 24527144, "step": 42260 }, { "epoch": 6.295055108728031, "grad_norm": 0.9060672521591187, "learning_rate": 4.329899343746099e-05, "loss": 0.5897, "num_input_tokens_seen": 24530344, "step": 42265 }, { "epoch": 6.29579982126899, "grad_norm": 1.802558183670044, "learning_rate": 4.3296779296454374e-05, "loss": 0.7231, "num_input_tokens_seen": 24533160, "step": 42270 }, { "epoch": 6.296544533809949, "grad_norm": 0.6920989751815796, "learning_rate": 4.3294564846342275e-05, "loss": 0.8092, "num_input_tokens_seen": 24536328, "step": 42275 }, { "epoch": 6.297289246350909, "grad_norm": 0.915952205657959, "learning_rate": 4.329235008716209e-05, "loss": 0.7567, "num_input_tokens_seen": 24539144, "step": 42280 }, { "epoch": 6.2980339588918675, "grad_norm": 0.9854223132133484, "learning_rate": 4.329013501895125e-05, "loss": 0.6417, "num_input_tokens_seen": 24541928, "step": 42285 }, { "epoch": 6.298778671432827, "grad_norm": 2.0919816493988037, "learning_rate": 4.3287919641747155e-05, "loss": 0.656, "num_input_tokens_seen": 24544744, "step": 42290 }, { "epoch": 6.299523383973786, "grad_norm": 0.7643349766731262, "learning_rate": 4.328570395558725e-05, "loss": 0.5924, "num_input_tokens_seen": 24547592, "step": 42295 }, { "epoch": 6.3002680965147455, "grad_norm": 0.8584951758384705, "learning_rate": 4.328348796050896e-05, "loss": 0.6597, "num_input_tokens_seen": 24550440, "step": 42300 }, { "epoch": 6.301012809055704, "grad_norm": 0.7987756133079529, "learning_rate": 4.3281271656549734e-05, "loss": 0.5562, "num_input_tokens_seen": 24553544, "step": 42305 }, { "epoch": 6.301757521596664, "grad_norm": 1.235396385192871, "learning_rate": 4.3279055043746996e-05, "loss": 0.5973, "num_input_tokens_seen": 24556360, "step": 42310 }, { "epoch": 6.302502234137623, "grad_norm": 0.9473291039466858, "learning_rate": 4.3276838122138196e-05, "loss": 0.707, "num_input_tokens_seen": 24559272, "step": 42315 }, { "epoch": 6.303246946678582, "grad_norm": 1.9626518487930298, "learning_rate": 4.3274620891760795e-05, "loss": 0.6046, "num_input_tokens_seen": 24562216, "step": 42320 }, { "epoch": 6.303991659219541, "grad_norm": 1.2109109163284302, "learning_rate": 4.327240335265226e-05, "loss": 0.6581, "num_input_tokens_seen": 24565288, "step": 42325 }, { "epoch": 6.304736371760501, "grad_norm": 1.7869541645050049, "learning_rate": 4.3270185504850024e-05, "loss": 0.7026, "num_input_tokens_seen": 24568296, "step": 42330 }, { "epoch": 6.3054810843014595, "grad_norm": 0.8179959058761597, "learning_rate": 4.326796734839158e-05, "loss": 0.6448, "num_input_tokens_seen": 24571176, "step": 42335 }, { "epoch": 6.306225796842419, "grad_norm": 0.818250834941864, "learning_rate": 4.32657488833144e-05, "loss": 0.6869, "num_input_tokens_seen": 24574376, "step": 42340 }, { "epoch": 6.306970509383378, "grad_norm": 1.3131208419799805, "learning_rate": 4.326353010965595e-05, "loss": 0.6784, "num_input_tokens_seen": 24577256, "step": 42345 }, { "epoch": 6.3077152219243375, "grad_norm": 2.0638997554779053, "learning_rate": 4.326131102745372e-05, "loss": 0.587, "num_input_tokens_seen": 24580136, "step": 42350 }, { "epoch": 6.308459934465296, "grad_norm": 1.2349525690078735, "learning_rate": 4.3259091636745196e-05, "loss": 0.5646, "num_input_tokens_seen": 24583272, "step": 42355 }, { "epoch": 6.309204647006256, "grad_norm": 0.9009618163108826, "learning_rate": 4.325687193756789e-05, "loss": 0.5578, "num_input_tokens_seen": 24586376, "step": 42360 }, { "epoch": 6.309949359547215, "grad_norm": 1.043316125869751, "learning_rate": 4.325465192995928e-05, "loss": 0.55, "num_input_tokens_seen": 24589320, "step": 42365 }, { "epoch": 6.310694072088174, "grad_norm": 0.9906989336013794, "learning_rate": 4.325243161395688e-05, "loss": 0.6774, "num_input_tokens_seen": 24592072, "step": 42370 }, { "epoch": 6.311438784629133, "grad_norm": 1.2344728708267212, "learning_rate": 4.3250210989598196e-05, "loss": 0.5766, "num_input_tokens_seen": 24594792, "step": 42375 }, { "epoch": 6.312183497170093, "grad_norm": 0.8121830821037292, "learning_rate": 4.324799005692075e-05, "loss": 0.5573, "num_input_tokens_seen": 24597576, "step": 42380 }, { "epoch": 6.3129282097110515, "grad_norm": 0.9389997720718384, "learning_rate": 4.3245768815962055e-05, "loss": 0.6293, "num_input_tokens_seen": 24600840, "step": 42385 }, { "epoch": 6.31367292225201, "grad_norm": 0.9197441339492798, "learning_rate": 4.3243547266759646e-05, "loss": 0.703, "num_input_tokens_seen": 24603784, "step": 42390 }, { "epoch": 6.31441763479297, "grad_norm": 0.7932912111282349, "learning_rate": 4.3241325409351044e-05, "loss": 0.6048, "num_input_tokens_seen": 24606952, "step": 42395 }, { "epoch": 6.31516234733393, "grad_norm": 2.418530225753784, "learning_rate": 4.323910324377379e-05, "loss": 0.8958, "num_input_tokens_seen": 24609736, "step": 42400 }, { "epoch": 6.315907059874888, "grad_norm": 1.0977683067321777, "learning_rate": 4.3236880770065426e-05, "loss": 0.5924, "num_input_tokens_seen": 24612648, "step": 42405 }, { "epoch": 6.316651772415847, "grad_norm": 0.9036684632301331, "learning_rate": 4.323465798826349e-05, "loss": 0.567, "num_input_tokens_seen": 24615304, "step": 42410 }, { "epoch": 6.317396484956807, "grad_norm": 2.3592894077301025, "learning_rate": 4.323243489840554e-05, "loss": 0.7984, "num_input_tokens_seen": 24618408, "step": 42415 }, { "epoch": 6.3181411974977655, "grad_norm": 0.8078729510307312, "learning_rate": 4.323021150052914e-05, "loss": 0.7366, "num_input_tokens_seen": 24621672, "step": 42420 }, { "epoch": 6.318885910038725, "grad_norm": 1.1755119562149048, "learning_rate": 4.322798779467184e-05, "loss": 0.5797, "num_input_tokens_seen": 24624488, "step": 42425 }, { "epoch": 6.319630622579684, "grad_norm": 1.553568720817566, "learning_rate": 4.322576378087121e-05, "loss": 0.5645, "num_input_tokens_seen": 24627528, "step": 42430 }, { "epoch": 6.3203753351206435, "grad_norm": 2.87797474861145, "learning_rate": 4.322353945916483e-05, "loss": 0.8512, "num_input_tokens_seen": 24630216, "step": 42435 }, { "epoch": 6.321120047661602, "grad_norm": 1.4654384851455688, "learning_rate": 4.322131482959027e-05, "loss": 0.6669, "num_input_tokens_seen": 24633320, "step": 42440 }, { "epoch": 6.321864760202562, "grad_norm": 0.7820535898208618, "learning_rate": 4.321908989218512e-05, "loss": 0.6948, "num_input_tokens_seen": 24636040, "step": 42445 }, { "epoch": 6.322609472743521, "grad_norm": 1.0740342140197754, "learning_rate": 4.321686464698696e-05, "loss": 0.6999, "num_input_tokens_seen": 24639272, "step": 42450 }, { "epoch": 6.32335418528448, "grad_norm": 0.9948267936706543, "learning_rate": 4.321463909403338e-05, "loss": 0.7625, "num_input_tokens_seen": 24642312, "step": 42455 }, { "epoch": 6.324098897825439, "grad_norm": 1.2884843349456787, "learning_rate": 4.3212413233362e-05, "loss": 0.6856, "num_input_tokens_seen": 24645448, "step": 42460 }, { "epoch": 6.324843610366399, "grad_norm": 0.879160463809967, "learning_rate": 4.32101870650104e-05, "loss": 0.5102, "num_input_tokens_seen": 24648264, "step": 42465 }, { "epoch": 6.3255883229073575, "grad_norm": 2.197178602218628, "learning_rate": 4.3207960589016196e-05, "loss": 0.5748, "num_input_tokens_seen": 24651144, "step": 42470 }, { "epoch": 6.326333035448317, "grad_norm": 0.3470894992351532, "learning_rate": 4.3205733805417e-05, "loss": 0.4739, "num_input_tokens_seen": 24653928, "step": 42475 }, { "epoch": 6.327077747989276, "grad_norm": 1.050793170928955, "learning_rate": 4.320350671425044e-05, "loss": 0.5893, "num_input_tokens_seen": 24656840, "step": 42480 }, { "epoch": 6.327822460530236, "grad_norm": 1.067046880722046, "learning_rate": 4.320127931555415e-05, "loss": 0.6877, "num_input_tokens_seen": 24659656, "step": 42485 }, { "epoch": 6.328567173071194, "grad_norm": 0.9684887528419495, "learning_rate": 4.319905160936572e-05, "loss": 0.7125, "num_input_tokens_seen": 24662440, "step": 42490 }, { "epoch": 6.329311885612154, "grad_norm": 1.1377838850021362, "learning_rate": 4.319682359572282e-05, "loss": 0.7482, "num_input_tokens_seen": 24665224, "step": 42495 }, { "epoch": 6.330056598153113, "grad_norm": 0.8369470834732056, "learning_rate": 4.319459527466308e-05, "loss": 0.6989, "num_input_tokens_seen": 24668168, "step": 42500 }, { "epoch": 6.330801310694072, "grad_norm": 1.7956005334854126, "learning_rate": 4.3192366646224146e-05, "loss": 0.4721, "num_input_tokens_seen": 24670760, "step": 42505 }, { "epoch": 6.331546023235031, "grad_norm": 1.0497840642929077, "learning_rate": 4.3190137710443666e-05, "loss": 0.5649, "num_input_tokens_seen": 24673832, "step": 42510 }, { "epoch": 6.332290735775991, "grad_norm": 0.6414803266525269, "learning_rate": 4.3187908467359294e-05, "loss": 0.4607, "num_input_tokens_seen": 24676776, "step": 42515 }, { "epoch": 6.3330354483169495, "grad_norm": 1.251996397972107, "learning_rate": 4.31856789170087e-05, "loss": 0.6878, "num_input_tokens_seen": 24679688, "step": 42520 }, { "epoch": 6.333780160857909, "grad_norm": 1.5619144439697266, "learning_rate": 4.318344905942954e-05, "loss": 0.4341, "num_input_tokens_seen": 24682408, "step": 42525 }, { "epoch": 6.334524873398868, "grad_norm": 0.9576510190963745, "learning_rate": 4.318121889465949e-05, "loss": 0.6693, "num_input_tokens_seen": 24685256, "step": 42530 }, { "epoch": 6.335269585939828, "grad_norm": 1.8337700366973877, "learning_rate": 4.317898842273622e-05, "loss": 0.5403, "num_input_tokens_seen": 24688008, "step": 42535 }, { "epoch": 6.336014298480786, "grad_norm": 2.5990893840789795, "learning_rate": 4.317675764369743e-05, "loss": 0.8241, "num_input_tokens_seen": 24690824, "step": 42540 }, { "epoch": 6.336759011021746, "grad_norm": 0.802590012550354, "learning_rate": 4.3174526557580785e-05, "loss": 0.6804, "num_input_tokens_seen": 24693832, "step": 42545 }, { "epoch": 6.337503723562705, "grad_norm": 0.8096094131469727, "learning_rate": 4.317229516442398e-05, "loss": 0.658, "num_input_tokens_seen": 24696872, "step": 42550 }, { "epoch": 6.338248436103664, "grad_norm": 1.1276205778121948, "learning_rate": 4.317006346426473e-05, "loss": 0.5169, "num_input_tokens_seen": 24699784, "step": 42555 }, { "epoch": 6.338993148644623, "grad_norm": 1.0723731517791748, "learning_rate": 4.3167831457140715e-05, "loss": 0.6324, "num_input_tokens_seen": 24702920, "step": 42560 }, { "epoch": 6.339737861185583, "grad_norm": 1.5938950777053833, "learning_rate": 4.316559914308966e-05, "loss": 0.5393, "num_input_tokens_seen": 24705928, "step": 42565 }, { "epoch": 6.340482573726542, "grad_norm": 0.5901293754577637, "learning_rate": 4.316336652214926e-05, "loss": 0.6088, "num_input_tokens_seen": 24708808, "step": 42570 }, { "epoch": 6.3412272862675, "grad_norm": 1.1688997745513916, "learning_rate": 4.316113359435725e-05, "loss": 0.6629, "num_input_tokens_seen": 24711624, "step": 42575 }, { "epoch": 6.34197199880846, "grad_norm": 1.1057630777359009, "learning_rate": 4.315890035975135e-05, "loss": 0.6726, "num_input_tokens_seen": 24714728, "step": 42580 }, { "epoch": 6.342716711349419, "grad_norm": 1.6970289945602417, "learning_rate": 4.315666681836928e-05, "loss": 0.7897, "num_input_tokens_seen": 24717384, "step": 42585 }, { "epoch": 6.343461423890378, "grad_norm": 1.6793911457061768, "learning_rate": 4.315443297024878e-05, "loss": 0.6723, "num_input_tokens_seen": 24720296, "step": 42590 }, { "epoch": 6.344206136431337, "grad_norm": 0.8153544068336487, "learning_rate": 4.315219881542758e-05, "loss": 0.3964, "num_input_tokens_seen": 24723272, "step": 42595 }, { "epoch": 6.344950848972297, "grad_norm": 1.0355125665664673, "learning_rate": 4.314996435394344e-05, "loss": 0.4416, "num_input_tokens_seen": 24726152, "step": 42600 }, { "epoch": 6.3456955615132555, "grad_norm": 0.9747783541679382, "learning_rate": 4.314772958583408e-05, "loss": 0.6374, "num_input_tokens_seen": 24729000, "step": 42605 }, { "epoch": 6.346440274054215, "grad_norm": 2.018892765045166, "learning_rate": 4.3145494511137294e-05, "loss": 0.7164, "num_input_tokens_seen": 24731816, "step": 42610 }, { "epoch": 6.347184986595174, "grad_norm": 0.6553953886032104, "learning_rate": 4.3143259129890814e-05, "loss": 0.8547, "num_input_tokens_seen": 24734568, "step": 42615 }, { "epoch": 6.347929699136134, "grad_norm": 2.2449827194213867, "learning_rate": 4.314102344213241e-05, "loss": 0.6513, "num_input_tokens_seen": 24737672, "step": 42620 }, { "epoch": 6.348674411677092, "grad_norm": 1.290540099143982, "learning_rate": 4.3138787447899854e-05, "loss": 0.6775, "num_input_tokens_seen": 24740424, "step": 42625 }, { "epoch": 6.349419124218052, "grad_norm": 0.6527166366577148, "learning_rate": 4.313655114723092e-05, "loss": 0.5882, "num_input_tokens_seen": 24743240, "step": 42630 }, { "epoch": 6.350163836759011, "grad_norm": 0.753660261631012, "learning_rate": 4.3134314540163376e-05, "loss": 0.6096, "num_input_tokens_seen": 24746120, "step": 42635 }, { "epoch": 6.35090854929997, "grad_norm": 0.9850696325302124, "learning_rate": 4.3132077626735036e-05, "loss": 0.5694, "num_input_tokens_seen": 24749160, "step": 42640 }, { "epoch": 6.351653261840929, "grad_norm": 0.7169673442840576, "learning_rate": 4.312984040698366e-05, "loss": 0.6141, "num_input_tokens_seen": 24752008, "step": 42645 }, { "epoch": 6.352397974381889, "grad_norm": 1.4042004346847534, "learning_rate": 4.3127602880947065e-05, "loss": 0.6132, "num_input_tokens_seen": 24754824, "step": 42650 }, { "epoch": 6.353142686922848, "grad_norm": 1.8177757263183594, "learning_rate": 4.3125365048663035e-05, "loss": 0.7012, "num_input_tokens_seen": 24757576, "step": 42655 }, { "epoch": 6.353887399463807, "grad_norm": 0.917485237121582, "learning_rate": 4.31231269101694e-05, "loss": 0.6388, "num_input_tokens_seen": 24760168, "step": 42660 }, { "epoch": 6.354632112004766, "grad_norm": 1.4858537912368774, "learning_rate": 4.312088846550394e-05, "loss": 0.7221, "num_input_tokens_seen": 24763240, "step": 42665 }, { "epoch": 6.355376824545726, "grad_norm": 0.8707340359687805, "learning_rate": 4.311864971470449e-05, "loss": 0.6356, "num_input_tokens_seen": 24765960, "step": 42670 }, { "epoch": 6.356121537086684, "grad_norm": 1.3368862867355347, "learning_rate": 4.311641065780887e-05, "loss": 0.7857, "num_input_tokens_seen": 24768808, "step": 42675 }, { "epoch": 6.356866249627644, "grad_norm": 0.7524826526641846, "learning_rate": 4.31141712948549e-05, "loss": 0.5596, "num_input_tokens_seen": 24771912, "step": 42680 }, { "epoch": 6.357610962168603, "grad_norm": 0.9893682599067688, "learning_rate": 4.311193162588043e-05, "loss": 0.5615, "num_input_tokens_seen": 24774952, "step": 42685 }, { "epoch": 6.358355674709562, "grad_norm": 0.9784823060035706, "learning_rate": 4.3109691650923265e-05, "loss": 0.628, "num_input_tokens_seen": 24777832, "step": 42690 }, { "epoch": 6.359100387250521, "grad_norm": 0.8350444436073303, "learning_rate": 4.310745137002128e-05, "loss": 0.4855, "num_input_tokens_seen": 24781160, "step": 42695 }, { "epoch": 6.359845099791481, "grad_norm": 1.2406622171401978, "learning_rate": 4.3105210783212304e-05, "loss": 0.635, "num_input_tokens_seen": 24784104, "step": 42700 }, { "epoch": 6.36058981233244, "grad_norm": 0.9512200355529785, "learning_rate": 4.310296989053419e-05, "loss": 0.6234, "num_input_tokens_seen": 24786632, "step": 42705 }, { "epoch": 6.361334524873399, "grad_norm": 1.2028357982635498, "learning_rate": 4.31007286920248e-05, "loss": 0.9125, "num_input_tokens_seen": 24789544, "step": 42710 }, { "epoch": 6.362079237414358, "grad_norm": 1.2989170551300049, "learning_rate": 4.3098487187721995e-05, "loss": 0.8355, "num_input_tokens_seen": 24792488, "step": 42715 }, { "epoch": 6.362823949955318, "grad_norm": 1.3118624687194824, "learning_rate": 4.3096245377663645e-05, "loss": 0.56, "num_input_tokens_seen": 24795272, "step": 42720 }, { "epoch": 6.363568662496276, "grad_norm": 1.45601487159729, "learning_rate": 4.3094003261887625e-05, "loss": 0.7949, "num_input_tokens_seen": 24798152, "step": 42725 }, { "epoch": 6.364313375037236, "grad_norm": 1.8100409507751465, "learning_rate": 4.30917608404318e-05, "loss": 0.4782, "num_input_tokens_seen": 24801000, "step": 42730 }, { "epoch": 6.365058087578195, "grad_norm": 1.1580474376678467, "learning_rate": 4.308951811333407e-05, "loss": 0.6019, "num_input_tokens_seen": 24803816, "step": 42735 }, { "epoch": 6.365802800119154, "grad_norm": 0.6533370018005371, "learning_rate": 4.3087275080632314e-05, "loss": 0.4761, "num_input_tokens_seen": 24806856, "step": 42740 }, { "epoch": 6.366547512660113, "grad_norm": 1.1202720403671265, "learning_rate": 4.308503174236443e-05, "loss": 0.6866, "num_input_tokens_seen": 24809608, "step": 42745 }, { "epoch": 6.367292225201073, "grad_norm": 1.0212115049362183, "learning_rate": 4.308278809856832e-05, "loss": 0.4833, "num_input_tokens_seen": 24812584, "step": 42750 }, { "epoch": 6.368036937742032, "grad_norm": 1.4128919839859009, "learning_rate": 4.3080544149281875e-05, "loss": 0.6042, "num_input_tokens_seen": 24815272, "step": 42755 }, { "epoch": 6.36878165028299, "grad_norm": 1.2986806631088257, "learning_rate": 4.307829989454302e-05, "loss": 0.7467, "num_input_tokens_seen": 24818376, "step": 42760 }, { "epoch": 6.36952636282395, "grad_norm": 0.9159225821495056, "learning_rate": 4.307605533438965e-05, "loss": 0.5889, "num_input_tokens_seen": 24821160, "step": 42765 }, { "epoch": 6.370271075364909, "grad_norm": 1.0216020345687866, "learning_rate": 4.307381046885971e-05, "loss": 0.6479, "num_input_tokens_seen": 24824008, "step": 42770 }, { "epoch": 6.371015787905868, "grad_norm": 1.08042573928833, "learning_rate": 4.307156529799111e-05, "loss": 0.5686, "num_input_tokens_seen": 24826696, "step": 42775 }, { "epoch": 6.371760500446827, "grad_norm": 1.6762572526931763, "learning_rate": 4.306931982182178e-05, "loss": 0.7494, "num_input_tokens_seen": 24829384, "step": 42780 }, { "epoch": 6.372505212987787, "grad_norm": 0.8275713920593262, "learning_rate": 4.306707404038966e-05, "loss": 0.6319, "num_input_tokens_seen": 24832168, "step": 42785 }, { "epoch": 6.373249925528746, "grad_norm": 1.0809365510940552, "learning_rate": 4.306482795373268e-05, "loss": 0.6152, "num_input_tokens_seen": 24835208, "step": 42790 }, { "epoch": 6.373994638069705, "grad_norm": 0.9030414819717407, "learning_rate": 4.306258156188879e-05, "loss": 0.6427, "num_input_tokens_seen": 24837896, "step": 42795 }, { "epoch": 6.374739350610664, "grad_norm": 1.2699710130691528, "learning_rate": 4.306033486489595e-05, "loss": 0.4972, "num_input_tokens_seen": 24840936, "step": 42800 }, { "epoch": 6.375484063151624, "grad_norm": 1.0803425312042236, "learning_rate": 4.30580878627921e-05, "loss": 0.5383, "num_input_tokens_seen": 24843816, "step": 42805 }, { "epoch": 6.376228775692582, "grad_norm": 1.3515219688415527, "learning_rate": 4.305584055561522e-05, "loss": 0.7071, "num_input_tokens_seen": 24846440, "step": 42810 }, { "epoch": 6.376973488233542, "grad_norm": 1.4662829637527466, "learning_rate": 4.3053592943403256e-05, "loss": 0.7532, "num_input_tokens_seen": 24849480, "step": 42815 }, { "epoch": 6.377718200774501, "grad_norm": 1.0471148490905762, "learning_rate": 4.305134502619419e-05, "loss": 0.6077, "num_input_tokens_seen": 24852360, "step": 42820 }, { "epoch": 6.3784629133154604, "grad_norm": 0.9152579307556152, "learning_rate": 4.3049096804026e-05, "loss": 0.7298, "num_input_tokens_seen": 24855464, "step": 42825 }, { "epoch": 6.379207625856419, "grad_norm": 1.177631139755249, "learning_rate": 4.304684827693666e-05, "loss": 0.7762, "num_input_tokens_seen": 24858344, "step": 42830 }, { "epoch": 6.379952338397379, "grad_norm": 1.2239216566085815, "learning_rate": 4.304459944496416e-05, "loss": 0.4482, "num_input_tokens_seen": 24861352, "step": 42835 }, { "epoch": 6.380697050938338, "grad_norm": 2.2365996837615967, "learning_rate": 4.3042350308146496e-05, "loss": 0.75, "num_input_tokens_seen": 24864264, "step": 42840 }, { "epoch": 6.381441763479297, "grad_norm": 1.3144927024841309, "learning_rate": 4.304010086652165e-05, "loss": 0.5867, "num_input_tokens_seen": 24867656, "step": 42845 }, { "epoch": 6.382186476020256, "grad_norm": 0.7240957617759705, "learning_rate": 4.3037851120127645e-05, "loss": 0.6727, "num_input_tokens_seen": 24870344, "step": 42850 }, { "epoch": 6.382931188561216, "grad_norm": 1.3464146852493286, "learning_rate": 4.3035601069002476e-05, "loss": 0.5971, "num_input_tokens_seen": 24873224, "step": 42855 }, { "epoch": 6.383675901102174, "grad_norm": 0.9348472356796265, "learning_rate": 4.303335071318416e-05, "loss": 0.6883, "num_input_tokens_seen": 24875976, "step": 42860 }, { "epoch": 6.384420613643134, "grad_norm": 1.0854665040969849, "learning_rate": 4.303110005271071e-05, "loss": 0.688, "num_input_tokens_seen": 24879144, "step": 42865 }, { "epoch": 6.385165326184093, "grad_norm": 1.1886470317840576, "learning_rate": 4.302884908762015e-05, "loss": 0.594, "num_input_tokens_seen": 24881960, "step": 42870 }, { "epoch": 6.3859100387250525, "grad_norm": 1.3786357641220093, "learning_rate": 4.302659781795051e-05, "loss": 0.4861, "num_input_tokens_seen": 24884840, "step": 42875 }, { "epoch": 6.386654751266011, "grad_norm": 1.9088644981384277, "learning_rate": 4.302434624373982e-05, "loss": 0.7014, "num_input_tokens_seen": 24887464, "step": 42880 }, { "epoch": 6.387399463806971, "grad_norm": 0.9603760838508606, "learning_rate": 4.3022094365026124e-05, "loss": 0.6529, "num_input_tokens_seen": 24890408, "step": 42885 }, { "epoch": 6.38814417634793, "grad_norm": 1.281812071800232, "learning_rate": 4.3019842181847456e-05, "loss": 0.57, "num_input_tokens_seen": 24893608, "step": 42890 }, { "epoch": 6.388888888888889, "grad_norm": 1.733079433441162, "learning_rate": 4.301758969424187e-05, "loss": 0.4991, "num_input_tokens_seen": 24896136, "step": 42895 }, { "epoch": 6.389633601429848, "grad_norm": 0.93185955286026, "learning_rate": 4.301533690224741e-05, "loss": 0.6135, "num_input_tokens_seen": 24899208, "step": 42900 }, { "epoch": 6.390378313970807, "grad_norm": 1.7179362773895264, "learning_rate": 4.3013083805902156e-05, "loss": 0.6562, "num_input_tokens_seen": 24902120, "step": 42905 }, { "epoch": 6.3911230265117664, "grad_norm": 1.5447088479995728, "learning_rate": 4.301083040524415e-05, "loss": 0.7078, "num_input_tokens_seen": 24904776, "step": 42910 }, { "epoch": 6.391867739052726, "grad_norm": 1.3232156038284302, "learning_rate": 4.3008576700311473e-05, "loss": 0.6322, "num_input_tokens_seen": 24907688, "step": 42915 }, { "epoch": 6.392612451593685, "grad_norm": 1.1037222146987915, "learning_rate": 4.30063226911422e-05, "loss": 0.6391, "num_input_tokens_seen": 24910792, "step": 42920 }, { "epoch": 6.393357164134644, "grad_norm": 1.1010634899139404, "learning_rate": 4.30040683777744e-05, "loss": 0.5581, "num_input_tokens_seen": 24913736, "step": 42925 }, { "epoch": 6.394101876675603, "grad_norm": 1.1207243204116821, "learning_rate": 4.300181376024616e-05, "loss": 0.7372, "num_input_tokens_seen": 24916712, "step": 42930 }, { "epoch": 6.394846589216562, "grad_norm": 1.6598485708236694, "learning_rate": 4.299955883859558e-05, "loss": 0.773, "num_input_tokens_seen": 24919432, "step": 42935 }, { "epoch": 6.395591301757522, "grad_norm": 1.6977232694625854, "learning_rate": 4.2997303612860746e-05, "loss": 0.533, "num_input_tokens_seen": 24922184, "step": 42940 }, { "epoch": 6.39633601429848, "grad_norm": 2.048574686050415, "learning_rate": 4.299504808307976e-05, "loss": 0.8818, "num_input_tokens_seen": 24925000, "step": 42945 }, { "epoch": 6.39708072683944, "grad_norm": 1.0487024784088135, "learning_rate": 4.299279224929072e-05, "loss": 0.6647, "num_input_tokens_seen": 24928232, "step": 42950 }, { "epoch": 6.397825439380399, "grad_norm": 1.8606079816818237, "learning_rate": 4.299053611153175e-05, "loss": 0.645, "num_input_tokens_seen": 24930952, "step": 42955 }, { "epoch": 6.3985701519213585, "grad_norm": 2.274972438812256, "learning_rate": 4.2988279669840945e-05, "loss": 0.7115, "num_input_tokens_seen": 24933864, "step": 42960 }, { "epoch": 6.399314864462317, "grad_norm": 0.5986725091934204, "learning_rate": 4.298602292425645e-05, "loss": 0.6733, "num_input_tokens_seen": 24936648, "step": 42965 }, { "epoch": 6.400059577003277, "grad_norm": 1.5179102420806885, "learning_rate": 4.298376587481637e-05, "loss": 0.6114, "num_input_tokens_seen": 24939688, "step": 42970 }, { "epoch": 6.400804289544236, "grad_norm": 1.5822975635528564, "learning_rate": 4.2981508521558854e-05, "loss": 0.6156, "num_input_tokens_seen": 24942376, "step": 42975 }, { "epoch": 6.401549002085195, "grad_norm": 1.008614420890808, "learning_rate": 4.2979250864522016e-05, "loss": 0.401, "num_input_tokens_seen": 24945000, "step": 42980 }, { "epoch": 6.402293714626154, "grad_norm": 1.6254197359085083, "learning_rate": 4.297699290374401e-05, "loss": 0.785, "num_input_tokens_seen": 24947848, "step": 42985 }, { "epoch": 6.403038427167114, "grad_norm": 1.3676191568374634, "learning_rate": 4.297473463926299e-05, "loss": 0.7149, "num_input_tokens_seen": 24950696, "step": 42990 }, { "epoch": 6.4037831397080724, "grad_norm": 1.261542558670044, "learning_rate": 4.2972476071117086e-05, "loss": 0.6739, "num_input_tokens_seen": 24953736, "step": 42995 }, { "epoch": 6.404527852249032, "grad_norm": 1.5832351446151733, "learning_rate": 4.2970217199344465e-05, "loss": 0.7157, "num_input_tokens_seen": 24956616, "step": 43000 }, { "epoch": 6.405272564789991, "grad_norm": 1.9113538265228271, "learning_rate": 4.296795802398329e-05, "loss": 0.8562, "num_input_tokens_seen": 24959656, "step": 43005 }, { "epoch": 6.4060172773309505, "grad_norm": 2.120500087738037, "learning_rate": 4.296569854507173e-05, "loss": 0.6357, "num_input_tokens_seen": 24962504, "step": 43010 }, { "epoch": 6.406761989871909, "grad_norm": 2.1008822917938232, "learning_rate": 4.2963438762647954e-05, "loss": 0.7276, "num_input_tokens_seen": 24965288, "step": 43015 }, { "epoch": 6.407506702412869, "grad_norm": 1.0208395719528198, "learning_rate": 4.2961178676750124e-05, "loss": 0.5581, "num_input_tokens_seen": 24968168, "step": 43020 }, { "epoch": 6.408251414953828, "grad_norm": 1.0788264274597168, "learning_rate": 4.295891828741645e-05, "loss": 0.5881, "num_input_tokens_seen": 24970856, "step": 43025 }, { "epoch": 6.408996127494787, "grad_norm": 1.2280523777008057, "learning_rate": 4.29566575946851e-05, "loss": 0.6076, "num_input_tokens_seen": 24973736, "step": 43030 }, { "epoch": 6.409740840035746, "grad_norm": 1.1934281587600708, "learning_rate": 4.295439659859427e-05, "loss": 0.6295, "num_input_tokens_seen": 24976648, "step": 43035 }, { "epoch": 6.410485552576706, "grad_norm": 1.534443736076355, "learning_rate": 4.2952135299182155e-05, "loss": 0.7681, "num_input_tokens_seen": 24979496, "step": 43040 }, { "epoch": 6.4112302651176645, "grad_norm": 2.031705856323242, "learning_rate": 4.294987369648696e-05, "loss": 0.6334, "num_input_tokens_seen": 24982248, "step": 43045 }, { "epoch": 6.411974977658624, "grad_norm": 1.1032527685165405, "learning_rate": 4.2947611790546894e-05, "loss": 0.565, "num_input_tokens_seen": 24984936, "step": 43050 }, { "epoch": 6.412719690199583, "grad_norm": 1.5206598043441772, "learning_rate": 4.2945349581400174e-05, "loss": 0.6091, "num_input_tokens_seen": 24988040, "step": 43055 }, { "epoch": 6.4134644027405425, "grad_norm": 0.9587695598602295, "learning_rate": 4.2943087069085e-05, "loss": 0.6599, "num_input_tokens_seen": 24990824, "step": 43060 }, { "epoch": 6.414209115281501, "grad_norm": 1.3107610940933228, "learning_rate": 4.294082425363961e-05, "loss": 0.5138, "num_input_tokens_seen": 24993736, "step": 43065 }, { "epoch": 6.414953827822461, "grad_norm": 1.032385230064392, "learning_rate": 4.293856113510223e-05, "loss": 0.7178, "num_input_tokens_seen": 24996520, "step": 43070 }, { "epoch": 6.41569854036342, "grad_norm": 2.125718355178833, "learning_rate": 4.29362977135111e-05, "loss": 0.5944, "num_input_tokens_seen": 24999240, "step": 43075 }, { "epoch": 6.416443252904379, "grad_norm": 1.6134345531463623, "learning_rate": 4.2934033988904437e-05, "loss": 0.8556, "num_input_tokens_seen": 25002216, "step": 43080 }, { "epoch": 6.417187965445338, "grad_norm": 1.0451580286026, "learning_rate": 4.2931769961320504e-05, "loss": 0.6284, "num_input_tokens_seen": 25004872, "step": 43085 }, { "epoch": 6.417932677986297, "grad_norm": 1.128495693206787, "learning_rate": 4.292950563079754e-05, "loss": 0.6305, "num_input_tokens_seen": 25008136, "step": 43090 }, { "epoch": 6.4186773905272565, "grad_norm": 1.0956284999847412, "learning_rate": 4.2927240997373795e-05, "loss": 0.6764, "num_input_tokens_seen": 25011048, "step": 43095 }, { "epoch": 6.419422103068215, "grad_norm": 1.7080973386764526, "learning_rate": 4.292497606108754e-05, "loss": 0.6234, "num_input_tokens_seen": 25013896, "step": 43100 }, { "epoch": 6.420166815609175, "grad_norm": 0.7688011527061462, "learning_rate": 4.2922710821977044e-05, "loss": 0.5711, "num_input_tokens_seen": 25016648, "step": 43105 }, { "epoch": 6.420911528150134, "grad_norm": 1.2663546800613403, "learning_rate": 4.2920445280080544e-05, "loss": 0.7637, "num_input_tokens_seen": 25019880, "step": 43110 }, { "epoch": 6.421656240691093, "grad_norm": 1.8423373699188232, "learning_rate": 4.291817943543634e-05, "loss": 0.6505, "num_input_tokens_seen": 25022760, "step": 43115 }, { "epoch": 6.422400953232052, "grad_norm": 0.8441342711448669, "learning_rate": 4.291591328808272e-05, "loss": 0.6794, "num_input_tokens_seen": 25025608, "step": 43120 }, { "epoch": 6.423145665773012, "grad_norm": 1.0299732685089111, "learning_rate": 4.291364683805794e-05, "loss": 0.5503, "num_input_tokens_seen": 25028648, "step": 43125 }, { "epoch": 6.4238903783139705, "grad_norm": 1.277341604232788, "learning_rate": 4.291138008540031e-05, "loss": 0.7359, "num_input_tokens_seen": 25031784, "step": 43130 }, { "epoch": 6.42463509085493, "grad_norm": 1.3231093883514404, "learning_rate": 4.2909113030148106e-05, "loss": 0.6173, "num_input_tokens_seen": 25034664, "step": 43135 }, { "epoch": 6.425379803395889, "grad_norm": 1.5041824579238892, "learning_rate": 4.290684567233965e-05, "loss": 0.6437, "num_input_tokens_seen": 25037576, "step": 43140 }, { "epoch": 6.4261245159368485, "grad_norm": 1.279778003692627, "learning_rate": 4.2904578012013233e-05, "loss": 0.653, "num_input_tokens_seen": 25040296, "step": 43145 }, { "epoch": 6.426869228477807, "grad_norm": 0.6374139785766602, "learning_rate": 4.290231004920717e-05, "loss": 0.5029, "num_input_tokens_seen": 25043016, "step": 43150 }, { "epoch": 6.427613941018767, "grad_norm": 1.3653502464294434, "learning_rate": 4.2900041783959775e-05, "loss": 0.6449, "num_input_tokens_seen": 25046184, "step": 43155 }, { "epoch": 6.428358653559726, "grad_norm": 0.7197962999343872, "learning_rate": 4.2897773216309366e-05, "loss": 0.708, "num_input_tokens_seen": 25049064, "step": 43160 }, { "epoch": 6.429103366100685, "grad_norm": 1.692360281944275, "learning_rate": 4.289550434629426e-05, "loss": 0.7799, "num_input_tokens_seen": 25051848, "step": 43165 }, { "epoch": 6.429848078641644, "grad_norm": 1.2619119882583618, "learning_rate": 4.2893235173952805e-05, "loss": 0.6409, "num_input_tokens_seen": 25054696, "step": 43170 }, { "epoch": 6.430592791182604, "grad_norm": 1.0267034769058228, "learning_rate": 4.2890965699323335e-05, "loss": 0.631, "num_input_tokens_seen": 25057544, "step": 43175 }, { "epoch": 6.4313375037235625, "grad_norm": 1.0820475816726685, "learning_rate": 4.288869592244417e-05, "loss": 0.5436, "num_input_tokens_seen": 25060648, "step": 43180 }, { "epoch": 6.432082216264522, "grad_norm": 1.552742838859558, "learning_rate": 4.288642584335367e-05, "loss": 0.7009, "num_input_tokens_seen": 25063432, "step": 43185 }, { "epoch": 6.432826928805481, "grad_norm": 0.8061830401420593, "learning_rate": 4.2884155462090194e-05, "loss": 0.5735, "num_input_tokens_seen": 25066056, "step": 43190 }, { "epoch": 6.4335716413464406, "grad_norm": 0.8833125233650208, "learning_rate": 4.2881884778692076e-05, "loss": 0.7016, "num_input_tokens_seen": 25069384, "step": 43195 }, { "epoch": 6.434316353887399, "grad_norm": 2.112496852874756, "learning_rate": 4.287961379319769e-05, "loss": 0.6514, "num_input_tokens_seen": 25072168, "step": 43200 }, { "epoch": 6.435061066428359, "grad_norm": 0.7004398107528687, "learning_rate": 4.287734250564541e-05, "loss": 0.5321, "num_input_tokens_seen": 25075016, "step": 43205 }, { "epoch": 6.435805778969318, "grad_norm": 1.650680661201477, "learning_rate": 4.28750709160736e-05, "loss": 0.6332, "num_input_tokens_seen": 25077768, "step": 43210 }, { "epoch": 6.436550491510277, "grad_norm": 2.1941165924072266, "learning_rate": 4.2872799024520626e-05, "loss": 0.6787, "num_input_tokens_seen": 25080360, "step": 43215 }, { "epoch": 6.437295204051236, "grad_norm": 1.4356204271316528, "learning_rate": 4.287052683102488e-05, "loss": 0.8435, "num_input_tokens_seen": 25083496, "step": 43220 }, { "epoch": 6.438039916592196, "grad_norm": 2.292980194091797, "learning_rate": 4.286825433562474e-05, "loss": 0.686, "num_input_tokens_seen": 25086536, "step": 43225 }, { "epoch": 6.4387846291331545, "grad_norm": 1.0970221757888794, "learning_rate": 4.286598153835861e-05, "loss": 0.7496, "num_input_tokens_seen": 25089448, "step": 43230 }, { "epoch": 6.439529341674114, "grad_norm": 1.2929388284683228, "learning_rate": 4.2863708439264886e-05, "loss": 0.523, "num_input_tokens_seen": 25092296, "step": 43235 }, { "epoch": 6.440274054215073, "grad_norm": 1.1920732259750366, "learning_rate": 4.286143503838195e-05, "loss": 0.715, "num_input_tokens_seen": 25095176, "step": 43240 }, { "epoch": 6.441018766756033, "grad_norm": 0.9234002828598022, "learning_rate": 4.285916133574823e-05, "loss": 0.553, "num_input_tokens_seen": 25097992, "step": 43245 }, { "epoch": 6.441763479296991, "grad_norm": 1.35030198097229, "learning_rate": 4.2856887331402126e-05, "loss": 0.6284, "num_input_tokens_seen": 25100776, "step": 43250 }, { "epoch": 6.44250819183795, "grad_norm": 1.1844017505645752, "learning_rate": 4.285461302538207e-05, "loss": 0.6354, "num_input_tokens_seen": 25103880, "step": 43255 }, { "epoch": 6.44325290437891, "grad_norm": 1.256052851676941, "learning_rate": 4.285233841772647e-05, "loss": 0.5646, "num_input_tokens_seen": 25106760, "step": 43260 }, { "epoch": 6.443997616919869, "grad_norm": 1.2515074014663696, "learning_rate": 4.2850063508473746e-05, "loss": 0.5975, "num_input_tokens_seen": 25109672, "step": 43265 }, { "epoch": 6.444742329460828, "grad_norm": 1.063567042350769, "learning_rate": 4.284778829766235e-05, "loss": 0.4017, "num_input_tokens_seen": 25112424, "step": 43270 }, { "epoch": 6.445487042001787, "grad_norm": 1.1906551122665405, "learning_rate": 4.284551278533071e-05, "loss": 0.5458, "num_input_tokens_seen": 25115112, "step": 43275 }, { "epoch": 6.4462317545427466, "grad_norm": 0.6591653823852539, "learning_rate": 4.284323697151726e-05, "loss": 0.5667, "num_input_tokens_seen": 25118056, "step": 43280 }, { "epoch": 6.446976467083705, "grad_norm": 1.0579694509506226, "learning_rate": 4.284096085626047e-05, "loss": 0.5768, "num_input_tokens_seen": 25121064, "step": 43285 }, { "epoch": 6.447721179624665, "grad_norm": 2.05232834815979, "learning_rate": 4.283868443959877e-05, "loss": 0.6983, "num_input_tokens_seen": 25124040, "step": 43290 }, { "epoch": 6.448465892165624, "grad_norm": 1.8041080236434937, "learning_rate": 4.283640772157064e-05, "loss": 0.6777, "num_input_tokens_seen": 25126856, "step": 43295 }, { "epoch": 6.449210604706583, "grad_norm": 0.8603767156600952, "learning_rate": 4.283413070221452e-05, "loss": 0.6045, "num_input_tokens_seen": 25129704, "step": 43300 }, { "epoch": 6.449955317247542, "grad_norm": 1.289858102798462, "learning_rate": 4.283185338156888e-05, "loss": 0.5245, "num_input_tokens_seen": 25132520, "step": 43305 }, { "epoch": 6.450700029788502, "grad_norm": 1.8913486003875732, "learning_rate": 4.282957575967221e-05, "loss": 0.66, "num_input_tokens_seen": 25135272, "step": 43310 }, { "epoch": 6.4514447423294605, "grad_norm": 2.3695194721221924, "learning_rate": 4.282729783656298e-05, "loss": 0.6765, "num_input_tokens_seen": 25138440, "step": 43315 }, { "epoch": 6.45218945487042, "grad_norm": 1.1776812076568604, "learning_rate": 4.2825019612279666e-05, "loss": 0.6232, "num_input_tokens_seen": 25141576, "step": 43320 }, { "epoch": 6.452934167411379, "grad_norm": 2.1322062015533447, "learning_rate": 4.282274108686076e-05, "loss": 0.5951, "num_input_tokens_seen": 25144264, "step": 43325 }, { "epoch": 6.453678879952339, "grad_norm": 1.1194684505462646, "learning_rate": 4.282046226034476e-05, "loss": 0.5672, "num_input_tokens_seen": 25147400, "step": 43330 }, { "epoch": 6.454423592493297, "grad_norm": 1.1631516218185425, "learning_rate": 4.2818183132770175e-05, "loss": 0.7988, "num_input_tokens_seen": 25150472, "step": 43335 }, { "epoch": 6.455168305034257, "grad_norm": 0.9586103558540344, "learning_rate": 4.281590370417548e-05, "loss": 0.5262, "num_input_tokens_seen": 25153192, "step": 43340 }, { "epoch": 6.455913017575216, "grad_norm": 0.7944055795669556, "learning_rate": 4.28136239745992e-05, "loss": 0.5342, "num_input_tokens_seen": 25156200, "step": 43345 }, { "epoch": 6.456657730116175, "grad_norm": 1.914814829826355, "learning_rate": 4.2811343944079855e-05, "loss": 0.5345, "num_input_tokens_seen": 25158984, "step": 43350 }, { "epoch": 6.457402442657134, "grad_norm": 1.843024730682373, "learning_rate": 4.280906361265595e-05, "loss": 0.5033, "num_input_tokens_seen": 25161640, "step": 43355 }, { "epoch": 6.458147155198094, "grad_norm": 0.9283320903778076, "learning_rate": 4.2806782980366025e-05, "loss": 0.5358, "num_input_tokens_seen": 25164360, "step": 43360 }, { "epoch": 6.4588918677390526, "grad_norm": 0.9724192023277283, "learning_rate": 4.2804502047248594e-05, "loss": 0.5553, "num_input_tokens_seen": 25167112, "step": 43365 }, { "epoch": 6.459636580280012, "grad_norm": 1.1247543096542358, "learning_rate": 4.2802220813342194e-05, "loss": 0.6777, "num_input_tokens_seen": 25169928, "step": 43370 }, { "epoch": 6.460381292820971, "grad_norm": 1.7116053104400635, "learning_rate": 4.2799939278685376e-05, "loss": 0.6719, "num_input_tokens_seen": 25172712, "step": 43375 }, { "epoch": 6.461126005361931, "grad_norm": 2.2815377712249756, "learning_rate": 4.279765744331666e-05, "loss": 0.7895, "num_input_tokens_seen": 25175592, "step": 43380 }, { "epoch": 6.461870717902889, "grad_norm": 0.8669072985649109, "learning_rate": 4.2795375307274624e-05, "loss": 0.5989, "num_input_tokens_seen": 25178376, "step": 43385 }, { "epoch": 6.462615430443849, "grad_norm": 1.2102489471435547, "learning_rate": 4.2793092870597804e-05, "loss": 0.7667, "num_input_tokens_seen": 25181064, "step": 43390 }, { "epoch": 6.463360142984808, "grad_norm": 2.2667605876922607, "learning_rate": 4.279081013332476e-05, "loss": 0.5845, "num_input_tokens_seen": 25184104, "step": 43395 }, { "epoch": 6.464104855525767, "grad_norm": 2.101179361343384, "learning_rate": 4.278852709549406e-05, "loss": 0.5384, "num_input_tokens_seen": 25186920, "step": 43400 }, { "epoch": 6.464849568066726, "grad_norm": 1.1108158826828003, "learning_rate": 4.2786243757144284e-05, "loss": 0.7084, "num_input_tokens_seen": 25189928, "step": 43405 }, { "epoch": 6.465594280607686, "grad_norm": 1.062208890914917, "learning_rate": 4.278396011831399e-05, "loss": 0.8055, "num_input_tokens_seen": 25192872, "step": 43410 }, { "epoch": 6.466338993148645, "grad_norm": 0.928074300289154, "learning_rate": 4.2781676179041764e-05, "loss": 0.6115, "num_input_tokens_seen": 25195496, "step": 43415 }, { "epoch": 6.467083705689604, "grad_norm": 1.0701065063476562, "learning_rate": 4.2779391939366194e-05, "loss": 0.594, "num_input_tokens_seen": 25198088, "step": 43420 }, { "epoch": 6.467828418230563, "grad_norm": 1.2491745948791504, "learning_rate": 4.277710739932586e-05, "loss": 0.5736, "num_input_tokens_seen": 25201224, "step": 43425 }, { "epoch": 6.468573130771523, "grad_norm": 1.4793139696121216, "learning_rate": 4.277482255895937e-05, "loss": 0.785, "num_input_tokens_seen": 25204488, "step": 43430 }, { "epoch": 6.469317843312481, "grad_norm": 1.0979979038238525, "learning_rate": 4.277253741830532e-05, "loss": 0.5592, "num_input_tokens_seen": 25207752, "step": 43435 }, { "epoch": 6.47006255585344, "grad_norm": 1.0405662059783936, "learning_rate": 4.2770251977402314e-05, "loss": 0.5749, "num_input_tokens_seen": 25210664, "step": 43440 }, { "epoch": 6.4708072683944, "grad_norm": 0.9653668999671936, "learning_rate": 4.2767966236288956e-05, "loss": 0.6236, "num_input_tokens_seen": 25213512, "step": 43445 }, { "epoch": 6.4715519809353586, "grad_norm": 2.150813341140747, "learning_rate": 4.276568019500388e-05, "loss": 0.6296, "num_input_tokens_seen": 25216456, "step": 43450 }, { "epoch": 6.472296693476318, "grad_norm": 0.7181011438369751, "learning_rate": 4.276339385358568e-05, "loss": 0.6453, "num_input_tokens_seen": 25219144, "step": 43455 }, { "epoch": 6.473041406017277, "grad_norm": 0.8926382660865784, "learning_rate": 4.2761107212073e-05, "loss": 0.6667, "num_input_tokens_seen": 25222024, "step": 43460 }, { "epoch": 6.473786118558237, "grad_norm": 1.182931900024414, "learning_rate": 4.275882027050446e-05, "loss": 0.7585, "num_input_tokens_seen": 25224616, "step": 43465 }, { "epoch": 6.474530831099195, "grad_norm": 1.5761585235595703, "learning_rate": 4.275653302891871e-05, "loss": 0.678, "num_input_tokens_seen": 25227400, "step": 43470 }, { "epoch": 6.475275543640155, "grad_norm": 1.3395593166351318, "learning_rate": 4.275424548735437e-05, "loss": 0.7178, "num_input_tokens_seen": 25230088, "step": 43475 }, { "epoch": 6.476020256181114, "grad_norm": 1.2144432067871094, "learning_rate": 4.27519576458501e-05, "loss": 0.5792, "num_input_tokens_seen": 25233032, "step": 43480 }, { "epoch": 6.476764968722073, "grad_norm": 1.0335609912872314, "learning_rate": 4.274966950444456e-05, "loss": 0.7453, "num_input_tokens_seen": 25235848, "step": 43485 }, { "epoch": 6.477509681263032, "grad_norm": 1.5611969232559204, "learning_rate": 4.2747381063176384e-05, "loss": 0.6511, "num_input_tokens_seen": 25239080, "step": 43490 }, { "epoch": 6.478254393803992, "grad_norm": 0.8102494478225708, "learning_rate": 4.274509232208425e-05, "loss": 0.6984, "num_input_tokens_seen": 25242120, "step": 43495 }, { "epoch": 6.478999106344951, "grad_norm": 1.8287988901138306, "learning_rate": 4.274280328120681e-05, "loss": 0.7548, "num_input_tokens_seen": 25245032, "step": 43500 }, { "epoch": 6.47974381888591, "grad_norm": 0.7754233479499817, "learning_rate": 4.274051394058274e-05, "loss": 0.5398, "num_input_tokens_seen": 25247688, "step": 43505 }, { "epoch": 6.480488531426869, "grad_norm": 1.0852805376052856, "learning_rate": 4.273822430025072e-05, "loss": 0.5141, "num_input_tokens_seen": 25250728, "step": 43510 }, { "epoch": 6.481233243967829, "grad_norm": 1.1405718326568604, "learning_rate": 4.2735934360249426e-05, "loss": 0.711, "num_input_tokens_seen": 25253544, "step": 43515 }, { "epoch": 6.481977956508787, "grad_norm": 1.1612645387649536, "learning_rate": 4.2733644120617547e-05, "loss": 0.7277, "num_input_tokens_seen": 25256424, "step": 43520 }, { "epoch": 6.482722669049747, "grad_norm": 0.8364174365997314, "learning_rate": 4.273135358139377e-05, "loss": 0.6437, "num_input_tokens_seen": 25259432, "step": 43525 }, { "epoch": 6.483467381590706, "grad_norm": 1.3464772701263428, "learning_rate": 4.272906274261681e-05, "loss": 0.5772, "num_input_tokens_seen": 25262152, "step": 43530 }, { "epoch": 6.484212094131665, "grad_norm": 1.1538631916046143, "learning_rate": 4.2726771604325346e-05, "loss": 0.6267, "num_input_tokens_seen": 25264936, "step": 43535 }, { "epoch": 6.484956806672624, "grad_norm": 1.4096009731292725, "learning_rate": 4.272448016655809e-05, "loss": 0.6775, "num_input_tokens_seen": 25267656, "step": 43540 }, { "epoch": 6.485701519213584, "grad_norm": 0.7132601141929626, "learning_rate": 4.272218842935376e-05, "loss": 0.637, "num_input_tokens_seen": 25270536, "step": 43545 }, { "epoch": 6.486446231754543, "grad_norm": 1.4790385961532593, "learning_rate": 4.271989639275107e-05, "loss": 0.6469, "num_input_tokens_seen": 25273448, "step": 43550 }, { "epoch": 6.487190944295502, "grad_norm": 1.4398884773254395, "learning_rate": 4.271760405678874e-05, "loss": 0.6298, "num_input_tokens_seen": 25276200, "step": 43555 }, { "epoch": 6.487935656836461, "grad_norm": 1.2234464883804321, "learning_rate": 4.2715311421505486e-05, "loss": 0.5392, "num_input_tokens_seen": 25279048, "step": 43560 }, { "epoch": 6.488680369377421, "grad_norm": 1.784175157546997, "learning_rate": 4.271301848694006e-05, "loss": 0.8069, "num_input_tokens_seen": 25281832, "step": 43565 }, { "epoch": 6.489425081918379, "grad_norm": 0.688471257686615, "learning_rate": 4.271072525313119e-05, "loss": 0.7939, "num_input_tokens_seen": 25284872, "step": 43570 }, { "epoch": 6.490169794459339, "grad_norm": 1.7177085876464844, "learning_rate": 4.2708431720117614e-05, "loss": 0.6889, "num_input_tokens_seen": 25288072, "step": 43575 }, { "epoch": 6.490914507000298, "grad_norm": 1.4701493978500366, "learning_rate": 4.270613788793808e-05, "loss": 0.6091, "num_input_tokens_seen": 25290984, "step": 43580 }, { "epoch": 6.4916592195412575, "grad_norm": 1.0651254653930664, "learning_rate": 4.2703843756631344e-05, "loss": 0.542, "num_input_tokens_seen": 25293704, "step": 43585 }, { "epoch": 6.492403932082216, "grad_norm": 0.9771426320075989, "learning_rate": 4.270154932623617e-05, "loss": 0.7368, "num_input_tokens_seen": 25296904, "step": 43590 }, { "epoch": 6.493148644623176, "grad_norm": 1.2092809677124023, "learning_rate": 4.26992545967913e-05, "loss": 0.5573, "num_input_tokens_seen": 25299816, "step": 43595 }, { "epoch": 6.493893357164135, "grad_norm": 1.2348812818527222, "learning_rate": 4.2696959568335515e-05, "loss": 0.6185, "num_input_tokens_seen": 25302504, "step": 43600 }, { "epoch": 6.494638069705093, "grad_norm": 2.0143959522247314, "learning_rate": 4.2694664240907586e-05, "loss": 0.6545, "num_input_tokens_seen": 25305384, "step": 43605 }, { "epoch": 6.495382782246053, "grad_norm": 1.3023805618286133, "learning_rate": 4.269236861454629e-05, "loss": 0.6334, "num_input_tokens_seen": 25308488, "step": 43610 }, { "epoch": 6.496127494787013, "grad_norm": 1.1985142230987549, "learning_rate": 4.2690072689290405e-05, "loss": 0.6197, "num_input_tokens_seen": 25311112, "step": 43615 }, { "epoch": 6.496872207327971, "grad_norm": 1.4416847229003906, "learning_rate": 4.268777646517872e-05, "loss": 0.6132, "num_input_tokens_seen": 25313832, "step": 43620 }, { "epoch": 6.49761691986893, "grad_norm": 1.6780277490615845, "learning_rate": 4.268547994225003e-05, "loss": 0.6983, "num_input_tokens_seen": 25316808, "step": 43625 }, { "epoch": 6.49836163240989, "grad_norm": 1.400780200958252, "learning_rate": 4.2683183120543134e-05, "loss": 0.5769, "num_input_tokens_seen": 25319752, "step": 43630 }, { "epoch": 6.499106344950849, "grad_norm": 0.7209802269935608, "learning_rate": 4.2680886000096834e-05, "loss": 0.5752, "num_input_tokens_seen": 25322632, "step": 43635 }, { "epoch": 6.499851057491808, "grad_norm": 1.3416529893875122, "learning_rate": 4.267858858094993e-05, "loss": 0.7523, "num_input_tokens_seen": 25325448, "step": 43640 }, { "epoch": 6.500595770032767, "grad_norm": 2.9720919132232666, "learning_rate": 4.267629086314123e-05, "loss": 0.6565, "num_input_tokens_seen": 25328104, "step": 43645 }, { "epoch": 6.501340482573727, "grad_norm": 1.826894760131836, "learning_rate": 4.2673992846709574e-05, "loss": 0.6798, "num_input_tokens_seen": 25331304, "step": 43650 }, { "epoch": 6.502085195114685, "grad_norm": 1.034429669380188, "learning_rate": 4.267169453169377e-05, "loss": 0.6875, "num_input_tokens_seen": 25334216, "step": 43655 }, { "epoch": 6.502829907655645, "grad_norm": 1.2595316171646118, "learning_rate": 4.266939591813265e-05, "loss": 0.6613, "num_input_tokens_seen": 25337064, "step": 43660 }, { "epoch": 6.503574620196604, "grad_norm": 0.9696934819221497, "learning_rate": 4.266709700606504e-05, "loss": 0.7426, "num_input_tokens_seen": 25340296, "step": 43665 }, { "epoch": 6.5043193327375635, "grad_norm": 1.0160754919052124, "learning_rate": 4.266479779552979e-05, "loss": 0.6399, "num_input_tokens_seen": 25343496, "step": 43670 }, { "epoch": 6.505064045278522, "grad_norm": 1.281972885131836, "learning_rate": 4.266249828656572e-05, "loss": 0.7638, "num_input_tokens_seen": 25346312, "step": 43675 }, { "epoch": 6.505808757819482, "grad_norm": 0.8107725381851196, "learning_rate": 4.2660198479211705e-05, "loss": 0.4917, "num_input_tokens_seen": 25349192, "step": 43680 }, { "epoch": 6.506553470360441, "grad_norm": 0.9768154621124268, "learning_rate": 4.265789837350658e-05, "loss": 0.6457, "num_input_tokens_seen": 25352264, "step": 43685 }, { "epoch": 6.5072981829014, "grad_norm": 1.6277129650115967, "learning_rate": 4.2655597969489216e-05, "loss": 0.6158, "num_input_tokens_seen": 25355656, "step": 43690 }, { "epoch": 6.508042895442359, "grad_norm": 0.7829680442810059, "learning_rate": 4.265329726719845e-05, "loss": 0.4736, "num_input_tokens_seen": 25358440, "step": 43695 }, { "epoch": 6.508787607983319, "grad_norm": 0.8207727670669556, "learning_rate": 4.2650996266673197e-05, "loss": 0.6802, "num_input_tokens_seen": 25361160, "step": 43700 }, { "epoch": 6.509532320524277, "grad_norm": 1.171351432800293, "learning_rate": 4.264869496795229e-05, "loss": 0.7939, "num_input_tokens_seen": 25364200, "step": 43705 }, { "epoch": 6.510277033065237, "grad_norm": 1.2847908735275269, "learning_rate": 4.264639337107461e-05, "loss": 0.5916, "num_input_tokens_seen": 25367016, "step": 43710 }, { "epoch": 6.511021745606196, "grad_norm": 0.5950351357460022, "learning_rate": 4.264409147607905e-05, "loss": 0.6575, "num_input_tokens_seen": 25369864, "step": 43715 }, { "epoch": 6.5117664581471555, "grad_norm": 0.9974787831306458, "learning_rate": 4.264178928300451e-05, "loss": 0.5404, "num_input_tokens_seen": 25372904, "step": 43720 }, { "epoch": 6.512511170688114, "grad_norm": 1.2447631359100342, "learning_rate": 4.263948679188986e-05, "loss": 0.534, "num_input_tokens_seen": 25375784, "step": 43725 }, { "epoch": 6.513255883229074, "grad_norm": 0.9421852827072144, "learning_rate": 4.263718400277401e-05, "loss": 0.5238, "num_input_tokens_seen": 25378664, "step": 43730 }, { "epoch": 6.514000595770033, "grad_norm": 2.3743762969970703, "learning_rate": 4.263488091569586e-05, "loss": 0.5192, "num_input_tokens_seen": 25381448, "step": 43735 }, { "epoch": 6.514745308310992, "grad_norm": 2.340231418609619, "learning_rate": 4.263257753069432e-05, "loss": 0.5698, "num_input_tokens_seen": 25384296, "step": 43740 }, { "epoch": 6.515490020851951, "grad_norm": 1.334471583366394, "learning_rate": 4.263027384780831e-05, "loss": 0.7355, "num_input_tokens_seen": 25387272, "step": 43745 }, { "epoch": 6.516234733392911, "grad_norm": 0.9191936254501343, "learning_rate": 4.2627969867076736e-05, "loss": 0.5995, "num_input_tokens_seen": 25390696, "step": 43750 }, { "epoch": 6.5169794459338695, "grad_norm": 1.032613754272461, "learning_rate": 4.2625665588538534e-05, "loss": 0.6744, "num_input_tokens_seen": 25393640, "step": 43755 }, { "epoch": 6.517724158474829, "grad_norm": 1.615950345993042, "learning_rate": 4.262336101223262e-05, "loss": 0.6573, "num_input_tokens_seen": 25396552, "step": 43760 }, { "epoch": 6.518468871015788, "grad_norm": 0.9574558734893799, "learning_rate": 4.2621056138197936e-05, "loss": 0.5961, "num_input_tokens_seen": 25399560, "step": 43765 }, { "epoch": 6.519213583556747, "grad_norm": 0.808682918548584, "learning_rate": 4.261875096647341e-05, "loss": 0.4605, "num_input_tokens_seen": 25402568, "step": 43770 }, { "epoch": 6.519958296097706, "grad_norm": 1.1573444604873657, "learning_rate": 4.2616445497098e-05, "loss": 0.7269, "num_input_tokens_seen": 25405576, "step": 43775 }, { "epoch": 6.520703008638666, "grad_norm": 1.0035184621810913, "learning_rate": 4.261413973011065e-05, "loss": 0.7141, "num_input_tokens_seen": 25408520, "step": 43780 }, { "epoch": 6.521447721179625, "grad_norm": 0.6183497905731201, "learning_rate": 4.261183366555032e-05, "loss": 0.3344, "num_input_tokens_seen": 25411176, "step": 43785 }, { "epoch": 6.522192433720583, "grad_norm": 1.2774378061294556, "learning_rate": 4.260952730345594e-05, "loss": 0.6122, "num_input_tokens_seen": 25413992, "step": 43790 }, { "epoch": 6.522937146261543, "grad_norm": 1.234331488609314, "learning_rate": 4.260722064386651e-05, "loss": 0.5736, "num_input_tokens_seen": 25417480, "step": 43795 }, { "epoch": 6.523681858802503, "grad_norm": 0.9623258709907532, "learning_rate": 4.2604913686820966e-05, "loss": 0.7426, "num_input_tokens_seen": 25420264, "step": 43800 }, { "epoch": 6.5244265713434615, "grad_norm": 1.43426513671875, "learning_rate": 4.260260643235831e-05, "loss": 0.6364, "num_input_tokens_seen": 25423464, "step": 43805 }, { "epoch": 6.52517128388442, "grad_norm": 1.6487616300582886, "learning_rate": 4.260029888051751e-05, "loss": 0.6812, "num_input_tokens_seen": 25426120, "step": 43810 }, { "epoch": 6.52591599642538, "grad_norm": 1.689238429069519, "learning_rate": 4.259799103133754e-05, "loss": 0.7147, "num_input_tokens_seen": 25429192, "step": 43815 }, { "epoch": 6.526660708966339, "grad_norm": 0.6993674039840698, "learning_rate": 4.25956828848574e-05, "loss": 0.5013, "num_input_tokens_seen": 25431944, "step": 43820 }, { "epoch": 6.527405421507298, "grad_norm": 1.2612978219985962, "learning_rate": 4.259337444111609e-05, "loss": 0.7514, "num_input_tokens_seen": 25434728, "step": 43825 }, { "epoch": 6.528150134048257, "grad_norm": 0.8323398232460022, "learning_rate": 4.259106570015259e-05, "loss": 0.5015, "num_input_tokens_seen": 25437640, "step": 43830 }, { "epoch": 6.528894846589217, "grad_norm": 1.0997402667999268, "learning_rate": 4.2588756662005926e-05, "loss": 0.4946, "num_input_tokens_seen": 25440520, "step": 43835 }, { "epoch": 6.5296395591301755, "grad_norm": 0.5915467143058777, "learning_rate": 4.258644732671508e-05, "loss": 0.5693, "num_input_tokens_seen": 25443336, "step": 43840 }, { "epoch": 6.530384271671135, "grad_norm": 1.1645816564559937, "learning_rate": 4.258413769431908e-05, "loss": 0.7062, "num_input_tokens_seen": 25446376, "step": 43845 }, { "epoch": 6.531128984212094, "grad_norm": 2.0736420154571533, "learning_rate": 4.2581827764856955e-05, "loss": 0.7402, "num_input_tokens_seen": 25449224, "step": 43850 }, { "epoch": 6.5318736967530535, "grad_norm": 1.0375347137451172, "learning_rate": 4.257951753836772e-05, "loss": 0.7018, "num_input_tokens_seen": 25452072, "step": 43855 }, { "epoch": 6.532618409294012, "grad_norm": 0.7865891456604004, "learning_rate": 4.2577207014890394e-05, "loss": 0.6169, "num_input_tokens_seen": 25454792, "step": 43860 }, { "epoch": 6.533363121834972, "grad_norm": 0.9547924995422363, "learning_rate": 4.2574896194464033e-05, "loss": 0.5913, "num_input_tokens_seen": 25457736, "step": 43865 }, { "epoch": 6.534107834375931, "grad_norm": 1.668371558189392, "learning_rate": 4.2572585077127654e-05, "loss": 0.6468, "num_input_tokens_seen": 25460584, "step": 43870 }, { "epoch": 6.53485254691689, "grad_norm": 1.6765121221542358, "learning_rate": 4.2570273662920315e-05, "loss": 0.6043, "num_input_tokens_seen": 25463272, "step": 43875 }, { "epoch": 6.535597259457849, "grad_norm": 2.699211359024048, "learning_rate": 4.2567961951881053e-05, "loss": 0.7732, "num_input_tokens_seen": 25465960, "step": 43880 }, { "epoch": 6.536341971998809, "grad_norm": 1.6574417352676392, "learning_rate": 4.256564994404893e-05, "loss": 0.5908, "num_input_tokens_seen": 25468904, "step": 43885 }, { "epoch": 6.5370866845397675, "grad_norm": 0.941874086856842, "learning_rate": 4.2563337639463005e-05, "loss": 0.6678, "num_input_tokens_seen": 25471848, "step": 43890 }, { "epoch": 6.537831397080727, "grad_norm": 1.1159210205078125, "learning_rate": 4.256102503816234e-05, "loss": 0.84, "num_input_tokens_seen": 25474696, "step": 43895 }, { "epoch": 6.538576109621686, "grad_norm": 1.420439600944519, "learning_rate": 4.255871214018601e-05, "loss": 0.5313, "num_input_tokens_seen": 25477608, "step": 43900 }, { "epoch": 6.5393208221626455, "grad_norm": 0.8999513387680054, "learning_rate": 4.255639894557309e-05, "loss": 0.6591, "num_input_tokens_seen": 25480808, "step": 43905 }, { "epoch": 6.540065534703604, "grad_norm": 1.9432252645492554, "learning_rate": 4.255408545436264e-05, "loss": 0.6254, "num_input_tokens_seen": 25483784, "step": 43910 }, { "epoch": 6.540810247244564, "grad_norm": 1.2643415927886963, "learning_rate": 4.255177166659376e-05, "loss": 0.6299, "num_input_tokens_seen": 25486856, "step": 43915 }, { "epoch": 6.541554959785523, "grad_norm": 1.0136040449142456, "learning_rate": 4.254945758230554e-05, "loss": 0.5998, "num_input_tokens_seen": 25489544, "step": 43920 }, { "epoch": 6.542299672326482, "grad_norm": 1.0544047355651855, "learning_rate": 4.254714320153708e-05, "loss": 0.5983, "num_input_tokens_seen": 25492488, "step": 43925 }, { "epoch": 6.543044384867441, "grad_norm": 0.9717723727226257, "learning_rate": 4.254482852432745e-05, "loss": 0.7325, "num_input_tokens_seen": 25495400, "step": 43930 }, { "epoch": 6.5437890974084, "grad_norm": 1.5493258237838745, "learning_rate": 4.254251355071579e-05, "loss": 0.652, "num_input_tokens_seen": 25498344, "step": 43935 }, { "epoch": 6.5445338099493595, "grad_norm": 1.365370273590088, "learning_rate": 4.254019828074118e-05, "loss": 0.5084, "num_input_tokens_seen": 25501128, "step": 43940 }, { "epoch": 6.545278522490319, "grad_norm": 3.8209316730499268, "learning_rate": 4.2537882714442756e-05, "loss": 0.8993, "num_input_tokens_seen": 25504168, "step": 43945 }, { "epoch": 6.546023235031278, "grad_norm": 1.15399968624115, "learning_rate": 4.253556685185963e-05, "loss": 0.4996, "num_input_tokens_seen": 25506952, "step": 43950 }, { "epoch": 6.546767947572237, "grad_norm": 2.04109787940979, "learning_rate": 4.2533250693030924e-05, "loss": 0.7664, "num_input_tokens_seen": 25509768, "step": 43955 }, { "epoch": 6.547512660113196, "grad_norm": 1.3634417057037354, "learning_rate": 4.2530934237995756e-05, "loss": 1.0447, "num_input_tokens_seen": 25512936, "step": 43960 }, { "epoch": 6.548257372654156, "grad_norm": 1.4887876510620117, "learning_rate": 4.252861748679329e-05, "loss": 0.5997, "num_input_tokens_seen": 25515944, "step": 43965 }, { "epoch": 6.549002085195115, "grad_norm": 0.9077139496803284, "learning_rate": 4.252630043946263e-05, "loss": 0.5638, "num_input_tokens_seen": 25518888, "step": 43970 }, { "epoch": 6.5497467977360735, "grad_norm": 1.4786123037338257, "learning_rate": 4.252398309604294e-05, "loss": 0.5932, "num_input_tokens_seen": 25521544, "step": 43975 }, { "epoch": 6.550491510277033, "grad_norm": 1.04623281955719, "learning_rate": 4.252166545657337e-05, "loss": 0.8128, "num_input_tokens_seen": 25524424, "step": 43980 }, { "epoch": 6.551236222817992, "grad_norm": 1.1833356618881226, "learning_rate": 4.2519347521093077e-05, "loss": 0.6724, "num_input_tokens_seen": 25527208, "step": 43985 }, { "epoch": 6.5519809353589515, "grad_norm": 1.3965668678283691, "learning_rate": 4.251702928964121e-05, "loss": 0.6061, "num_input_tokens_seen": 25530088, "step": 43990 }, { "epoch": 6.55272564789991, "grad_norm": 1.3964216709136963, "learning_rate": 4.2514710762256925e-05, "loss": 0.7515, "num_input_tokens_seen": 25532744, "step": 43995 }, { "epoch": 6.55347036044087, "grad_norm": 1.0639549493789673, "learning_rate": 4.2512391938979416e-05, "loss": 0.7286, "num_input_tokens_seen": 25535496, "step": 44000 }, { "epoch": 6.554215072981829, "grad_norm": 1.1832005977630615, "learning_rate": 4.251007281984783e-05, "loss": 0.6918, "num_input_tokens_seen": 25538600, "step": 44005 }, { "epoch": 6.554959785522788, "grad_norm": 1.3425042629241943, "learning_rate": 4.250775340490137e-05, "loss": 0.6514, "num_input_tokens_seen": 25541384, "step": 44010 }, { "epoch": 6.555704498063747, "grad_norm": 1.7017987966537476, "learning_rate": 4.2505433694179216e-05, "loss": 0.5926, "num_input_tokens_seen": 25544168, "step": 44015 }, { "epoch": 6.556449210604707, "grad_norm": 0.8290008306503296, "learning_rate": 4.250311368772054e-05, "loss": 0.7206, "num_input_tokens_seen": 25547048, "step": 44020 }, { "epoch": 6.5571939231456655, "grad_norm": 1.1809277534484863, "learning_rate": 4.250079338556455e-05, "loss": 0.7363, "num_input_tokens_seen": 25549768, "step": 44025 }, { "epoch": 6.557938635686625, "grad_norm": 0.9196441769599915, "learning_rate": 4.2498472787750456e-05, "loss": 0.4799, "num_input_tokens_seen": 25553128, "step": 44030 }, { "epoch": 6.558683348227584, "grad_norm": 1.4582875967025757, "learning_rate": 4.249615189431744e-05, "loss": 0.7196, "num_input_tokens_seen": 25555976, "step": 44035 }, { "epoch": 6.559428060768544, "grad_norm": 1.6385301351547241, "learning_rate": 4.2493830705304716e-05, "loss": 0.6246, "num_input_tokens_seen": 25558728, "step": 44040 }, { "epoch": 6.560172773309502, "grad_norm": 1.086236834526062, "learning_rate": 4.24915092207515e-05, "loss": 0.6775, "num_input_tokens_seen": 25561608, "step": 44045 }, { "epoch": 6.560917485850462, "grad_norm": 0.7376397848129272, "learning_rate": 4.248918744069702e-05, "loss": 0.5048, "num_input_tokens_seen": 25564616, "step": 44050 }, { "epoch": 6.561662198391421, "grad_norm": 2.041267156600952, "learning_rate": 4.2486865365180494e-05, "loss": 0.7183, "num_input_tokens_seen": 25567592, "step": 44055 }, { "epoch": 6.56240691093238, "grad_norm": 0.6699554920196533, "learning_rate": 4.2484542994241145e-05, "loss": 0.6245, "num_input_tokens_seen": 25570376, "step": 44060 }, { "epoch": 6.563151623473339, "grad_norm": 0.5296568870544434, "learning_rate": 4.2482220327918214e-05, "loss": 0.6452, "num_input_tokens_seen": 25573352, "step": 44065 }, { "epoch": 6.563896336014299, "grad_norm": 1.4210821390151978, "learning_rate": 4.2479897366250946e-05, "loss": 0.7166, "num_input_tokens_seen": 25576232, "step": 44070 }, { "epoch": 6.5646410485552575, "grad_norm": 1.2125753164291382, "learning_rate": 4.247757410927857e-05, "loss": 0.5487, "num_input_tokens_seen": 25579016, "step": 44075 }, { "epoch": 6.565385761096217, "grad_norm": 1.7637126445770264, "learning_rate": 4.247525055704034e-05, "loss": 0.675, "num_input_tokens_seen": 25581896, "step": 44080 }, { "epoch": 6.566130473637176, "grad_norm": 1.9065312147140503, "learning_rate": 4.247292670957552e-05, "loss": 0.7483, "num_input_tokens_seen": 25584616, "step": 44085 }, { "epoch": 6.566875186178136, "grad_norm": 1.1907058954238892, "learning_rate": 4.247060256692336e-05, "loss": 0.7184, "num_input_tokens_seen": 25587688, "step": 44090 }, { "epoch": 6.567619898719094, "grad_norm": 0.8280228972434998, "learning_rate": 4.246827812912313e-05, "loss": 0.706, "num_input_tokens_seen": 25590760, "step": 44095 }, { "epoch": 6.568364611260054, "grad_norm": 1.1167278289794922, "learning_rate": 4.246595339621409e-05, "loss": 0.7747, "num_input_tokens_seen": 25593448, "step": 44100 }, { "epoch": 6.569109323801013, "grad_norm": 1.0186846256256104, "learning_rate": 4.246362836823551e-05, "loss": 0.436, "num_input_tokens_seen": 25596264, "step": 44105 }, { "epoch": 6.569854036341972, "grad_norm": 0.9431222081184387, "learning_rate": 4.2461303045226695e-05, "loss": 0.5134, "num_input_tokens_seen": 25599080, "step": 44110 }, { "epoch": 6.570598748882931, "grad_norm": 1.775424599647522, "learning_rate": 4.24589774272269e-05, "loss": 0.7474, "num_input_tokens_seen": 25602088, "step": 44115 }, { "epoch": 6.57134346142389, "grad_norm": 0.9600276947021484, "learning_rate": 4.245665151427544e-05, "loss": 0.606, "num_input_tokens_seen": 25604968, "step": 44120 }, { "epoch": 6.57208817396485, "grad_norm": 1.6184790134429932, "learning_rate": 4.245432530641158e-05, "loss": 0.7123, "num_input_tokens_seen": 25607912, "step": 44125 }, { "epoch": 6.572832886505809, "grad_norm": 0.9513404369354248, "learning_rate": 4.245199880367464e-05, "loss": 0.621, "num_input_tokens_seen": 25611144, "step": 44130 }, { "epoch": 6.573577599046768, "grad_norm": 0.7572778463363647, "learning_rate": 4.2449672006103914e-05, "loss": 0.5147, "num_input_tokens_seen": 25613992, "step": 44135 }, { "epoch": 6.574322311587727, "grad_norm": 1.1532114744186401, "learning_rate": 4.244734491373872e-05, "loss": 0.6356, "num_input_tokens_seen": 25616776, "step": 44140 }, { "epoch": 6.575067024128686, "grad_norm": 1.5051897764205933, "learning_rate": 4.244501752661836e-05, "loss": 0.5174, "num_input_tokens_seen": 25619592, "step": 44145 }, { "epoch": 6.575811736669645, "grad_norm": 1.850229263305664, "learning_rate": 4.244268984478216e-05, "loss": 0.636, "num_input_tokens_seen": 25622440, "step": 44150 }, { "epoch": 6.576556449210605, "grad_norm": 1.1344143152236938, "learning_rate": 4.2440361868269453e-05, "loss": 0.6197, "num_input_tokens_seen": 25625672, "step": 44155 }, { "epoch": 6.5773011617515635, "grad_norm": 0.8673353791236877, "learning_rate": 4.243803359711954e-05, "loss": 0.5345, "num_input_tokens_seen": 25628616, "step": 44160 }, { "epoch": 6.578045874292523, "grad_norm": 1.026810646057129, "learning_rate": 4.243570503137179e-05, "loss": 0.712, "num_input_tokens_seen": 25631432, "step": 44165 }, { "epoch": 6.578790586833482, "grad_norm": 1.6891086101531982, "learning_rate": 4.2433376171065514e-05, "loss": 0.486, "num_input_tokens_seen": 25634728, "step": 44170 }, { "epoch": 6.579535299374442, "grad_norm": 1.5751198530197144, "learning_rate": 4.2431047016240064e-05, "loss": 0.6781, "num_input_tokens_seen": 25638024, "step": 44175 }, { "epoch": 6.5802800119154, "grad_norm": 2.444385528564453, "learning_rate": 4.242871756693481e-05, "loss": 0.8075, "num_input_tokens_seen": 25641000, "step": 44180 }, { "epoch": 6.58102472445636, "grad_norm": 1.2381689548492432, "learning_rate": 4.242638782318906e-05, "loss": 0.6954, "num_input_tokens_seen": 25643848, "step": 44185 }, { "epoch": 6.581769436997319, "grad_norm": 1.6189159154891968, "learning_rate": 4.242405778504221e-05, "loss": 0.6677, "num_input_tokens_seen": 25646472, "step": 44190 }, { "epoch": 6.582514149538278, "grad_norm": 1.0834436416625977, "learning_rate": 4.242172745253362e-05, "loss": 0.5435, "num_input_tokens_seen": 25649384, "step": 44195 }, { "epoch": 6.583258862079237, "grad_norm": 0.7404927015304565, "learning_rate": 4.241939682570265e-05, "loss": 0.6037, "num_input_tokens_seen": 25652520, "step": 44200 }, { "epoch": 6.584003574620197, "grad_norm": 2.480712652206421, "learning_rate": 4.241706590458867e-05, "loss": 0.8729, "num_input_tokens_seen": 25655560, "step": 44205 }, { "epoch": 6.584748287161156, "grad_norm": 1.1294294595718384, "learning_rate": 4.241473468923106e-05, "loss": 0.7558, "num_input_tokens_seen": 25658632, "step": 44210 }, { "epoch": 6.585492999702115, "grad_norm": 1.1091068983078003, "learning_rate": 4.2412403179669216e-05, "loss": 0.4872, "num_input_tokens_seen": 25661864, "step": 44215 }, { "epoch": 6.586237712243074, "grad_norm": 1.4244712591171265, "learning_rate": 4.2410071375942505e-05, "loss": 0.6797, "num_input_tokens_seen": 25665096, "step": 44220 }, { "epoch": 6.586982424784034, "grad_norm": 0.7193768620491028, "learning_rate": 4.240773927809034e-05, "loss": 0.5293, "num_input_tokens_seen": 25668008, "step": 44225 }, { "epoch": 6.587727137324992, "grad_norm": 0.8344939351081848, "learning_rate": 4.240540688615212e-05, "loss": 0.5424, "num_input_tokens_seen": 25670728, "step": 44230 }, { "epoch": 6.588471849865952, "grad_norm": 1.0558714866638184, "learning_rate": 4.240307420016724e-05, "loss": 0.7502, "num_input_tokens_seen": 25673672, "step": 44235 }, { "epoch": 6.589216562406911, "grad_norm": 1.439125418663025, "learning_rate": 4.24007412201751e-05, "loss": 0.5185, "num_input_tokens_seen": 25676328, "step": 44240 }, { "epoch": 6.58996127494787, "grad_norm": 1.2524405717849731, "learning_rate": 4.239840794621512e-05, "loss": 0.6315, "num_input_tokens_seen": 25679144, "step": 44245 }, { "epoch": 6.590705987488829, "grad_norm": 0.9307566285133362, "learning_rate": 4.2396074378326725e-05, "loss": 0.7394, "num_input_tokens_seen": 25681896, "step": 44250 }, { "epoch": 6.591450700029789, "grad_norm": 1.1427043676376343, "learning_rate": 4.239374051654934e-05, "loss": 0.5998, "num_input_tokens_seen": 25684584, "step": 44255 }, { "epoch": 6.592195412570748, "grad_norm": 1.1569818258285522, "learning_rate": 4.239140636092238e-05, "loss": 0.671, "num_input_tokens_seen": 25687528, "step": 44260 }, { "epoch": 6.592940125111707, "grad_norm": 0.7926586270332336, "learning_rate": 4.238907191148528e-05, "loss": 0.5053, "num_input_tokens_seen": 25690536, "step": 44265 }, { "epoch": 6.593684837652666, "grad_norm": 0.7366265058517456, "learning_rate": 4.23867371682775e-05, "loss": 0.6098, "num_input_tokens_seen": 25693544, "step": 44270 }, { "epoch": 6.594429550193626, "grad_norm": 1.1577931642532349, "learning_rate": 4.2384402131338455e-05, "loss": 0.571, "num_input_tokens_seen": 25696104, "step": 44275 }, { "epoch": 6.595174262734584, "grad_norm": 1.2547293901443481, "learning_rate": 4.2382066800707606e-05, "loss": 0.5571, "num_input_tokens_seen": 25699080, "step": 44280 }, { "epoch": 6.595918975275543, "grad_norm": 1.07752525806427, "learning_rate": 4.237973117642441e-05, "loss": 0.7911, "num_input_tokens_seen": 25701896, "step": 44285 }, { "epoch": 6.596663687816503, "grad_norm": 1.0482714176177979, "learning_rate": 4.237739525852831e-05, "loss": 0.5607, "num_input_tokens_seen": 25704968, "step": 44290 }, { "epoch": 6.5974084003574625, "grad_norm": 2.3627891540527344, "learning_rate": 4.237505904705879e-05, "loss": 0.8381, "num_input_tokens_seen": 25707912, "step": 44295 }, { "epoch": 6.598153112898421, "grad_norm": 1.7840564250946045, "learning_rate": 4.23727225420553e-05, "loss": 0.6111, "num_input_tokens_seen": 25710824, "step": 44300 }, { "epoch": 6.59889782543938, "grad_norm": 0.7176195979118347, "learning_rate": 4.237038574355732e-05, "loss": 0.6478, "num_input_tokens_seen": 25714408, "step": 44305 }, { "epoch": 6.59964253798034, "grad_norm": 1.0920323133468628, "learning_rate": 4.236804865160433e-05, "loss": 0.7378, "num_input_tokens_seen": 25717064, "step": 44310 }, { "epoch": 6.600387250521299, "grad_norm": 1.35280179977417, "learning_rate": 4.236571126623581e-05, "loss": 0.7303, "num_input_tokens_seen": 25719816, "step": 44315 }, { "epoch": 6.601131963062258, "grad_norm": 1.4126322269439697, "learning_rate": 4.236337358749124e-05, "loss": 0.6573, "num_input_tokens_seen": 25722760, "step": 44320 }, { "epoch": 6.601876675603217, "grad_norm": 1.367340326309204, "learning_rate": 4.2361035615410127e-05, "loss": 0.593, "num_input_tokens_seen": 25725960, "step": 44325 }, { "epoch": 6.602621388144176, "grad_norm": 1.3229820728302002, "learning_rate": 4.2358697350031964e-05, "loss": 0.4906, "num_input_tokens_seen": 25729096, "step": 44330 }, { "epoch": 6.603366100685135, "grad_norm": 1.2292596101760864, "learning_rate": 4.2356358791396244e-05, "loss": 0.5434, "num_input_tokens_seen": 25731816, "step": 44335 }, { "epoch": 6.604110813226095, "grad_norm": 1.1315381526947021, "learning_rate": 4.235401993954249e-05, "loss": 0.467, "num_input_tokens_seen": 25734824, "step": 44340 }, { "epoch": 6.604855525767054, "grad_norm": 1.1410226821899414, "learning_rate": 4.2351680794510205e-05, "loss": 0.6724, "num_input_tokens_seen": 25737704, "step": 44345 }, { "epoch": 6.605600238308013, "grad_norm": 1.2056277990341187, "learning_rate": 4.234934135633891e-05, "loss": 0.6992, "num_input_tokens_seen": 25740488, "step": 44350 }, { "epoch": 6.606344950848972, "grad_norm": 0.8895420432090759, "learning_rate": 4.234700162506813e-05, "loss": 0.5011, "num_input_tokens_seen": 25743144, "step": 44355 }, { "epoch": 6.607089663389932, "grad_norm": 0.9969406127929688, "learning_rate": 4.234466160073738e-05, "loss": 0.6056, "num_input_tokens_seen": 25746280, "step": 44360 }, { "epoch": 6.60783437593089, "grad_norm": 0.9065267443656921, "learning_rate": 4.23423212833862e-05, "loss": 0.5665, "num_input_tokens_seen": 25749096, "step": 44365 }, { "epoch": 6.60857908847185, "grad_norm": 1.3313958644866943, "learning_rate": 4.233998067305413e-05, "loss": 0.5901, "num_input_tokens_seen": 25752328, "step": 44370 }, { "epoch": 6.609323801012809, "grad_norm": 1.10009765625, "learning_rate": 4.233763976978071e-05, "loss": 0.5739, "num_input_tokens_seen": 25755112, "step": 44375 }, { "epoch": 6.6100685135537685, "grad_norm": 1.1053357124328613, "learning_rate": 4.233529857360549e-05, "loss": 0.7477, "num_input_tokens_seen": 25757832, "step": 44380 }, { "epoch": 6.610813226094727, "grad_norm": 2.2326278686523438, "learning_rate": 4.233295708456801e-05, "loss": 0.6117, "num_input_tokens_seen": 25760904, "step": 44385 }, { "epoch": 6.611557938635687, "grad_norm": 2.2185258865356445, "learning_rate": 4.2330615302707856e-05, "loss": 0.8835, "num_input_tokens_seen": 25763880, "step": 44390 }, { "epoch": 6.612302651176646, "grad_norm": 1.102083444595337, "learning_rate": 4.2328273228064555e-05, "loss": 0.6877, "num_input_tokens_seen": 25766920, "step": 44395 }, { "epoch": 6.613047363717605, "grad_norm": 1.456099510192871, "learning_rate": 4.2325930860677695e-05, "loss": 0.8127, "num_input_tokens_seen": 25769800, "step": 44400 }, { "epoch": 6.613792076258564, "grad_norm": 0.7822834849357605, "learning_rate": 4.232358820058684e-05, "loss": 0.6987, "num_input_tokens_seen": 25772744, "step": 44405 }, { "epoch": 6.614536788799524, "grad_norm": 1.3008313179016113, "learning_rate": 4.232124524783157e-05, "loss": 0.6906, "num_input_tokens_seen": 25775688, "step": 44410 }, { "epoch": 6.615281501340482, "grad_norm": 1.0771293640136719, "learning_rate": 4.231890200245147e-05, "loss": 0.7237, "num_input_tokens_seen": 25778312, "step": 44415 }, { "epoch": 6.616026213881442, "grad_norm": 0.902858555316925, "learning_rate": 4.231655846448611e-05, "loss": 0.6095, "num_input_tokens_seen": 25781000, "step": 44420 }, { "epoch": 6.616770926422401, "grad_norm": 1.6974681615829468, "learning_rate": 4.2314214633975105e-05, "loss": 0.5886, "num_input_tokens_seen": 25783816, "step": 44425 }, { "epoch": 6.6175156389633605, "grad_norm": 1.7278008460998535, "learning_rate": 4.231187051095804e-05, "loss": 0.654, "num_input_tokens_seen": 25786760, "step": 44430 }, { "epoch": 6.618260351504319, "grad_norm": 1.1558088064193726, "learning_rate": 4.2309526095474514e-05, "loss": 0.6252, "num_input_tokens_seen": 25789512, "step": 44435 }, { "epoch": 6.619005064045279, "grad_norm": 0.7844818830490112, "learning_rate": 4.230718138756414e-05, "loss": 0.7498, "num_input_tokens_seen": 25792200, "step": 44440 }, { "epoch": 6.619749776586238, "grad_norm": 1.747541069984436, "learning_rate": 4.2304836387266534e-05, "loss": 0.6004, "num_input_tokens_seen": 25795048, "step": 44445 }, { "epoch": 6.620494489127196, "grad_norm": 0.9175692796707153, "learning_rate": 4.230249109462129e-05, "loss": 0.4782, "num_input_tokens_seen": 25798312, "step": 44450 }, { "epoch": 6.621239201668156, "grad_norm": 0.9254588484764099, "learning_rate": 4.2300145509668054e-05, "loss": 0.6429, "num_input_tokens_seen": 25801096, "step": 44455 }, { "epoch": 6.621983914209116, "grad_norm": 0.7773246765136719, "learning_rate": 4.2297799632446444e-05, "loss": 0.6986, "num_input_tokens_seen": 25803912, "step": 44460 }, { "epoch": 6.6227286267500745, "grad_norm": 1.4693816900253296, "learning_rate": 4.229545346299609e-05, "loss": 0.661, "num_input_tokens_seen": 25807016, "step": 44465 }, { "epoch": 6.623473339291033, "grad_norm": 1.922505259513855, "learning_rate": 4.2293107001356624e-05, "loss": 0.7335, "num_input_tokens_seen": 25809864, "step": 44470 }, { "epoch": 6.624218051831993, "grad_norm": 1.2711974382400513, "learning_rate": 4.2290760247567695e-05, "loss": 0.5872, "num_input_tokens_seen": 25812552, "step": 44475 }, { "epoch": 6.6249627643729525, "grad_norm": 1.7655528783798218, "learning_rate": 4.2288413201668945e-05, "loss": 0.5929, "num_input_tokens_seen": 25815432, "step": 44480 }, { "epoch": 6.625707476913911, "grad_norm": 1.0981295108795166, "learning_rate": 4.228606586370002e-05, "loss": 0.6801, "num_input_tokens_seen": 25818376, "step": 44485 }, { "epoch": 6.62645218945487, "grad_norm": 1.3336788415908813, "learning_rate": 4.228371823370058e-05, "loss": 0.675, "num_input_tokens_seen": 25821160, "step": 44490 }, { "epoch": 6.62719690199583, "grad_norm": 1.3855735063552856, "learning_rate": 4.228137031171029e-05, "loss": 0.4772, "num_input_tokens_seen": 25823976, "step": 44495 }, { "epoch": 6.627941614536788, "grad_norm": 1.3511708974838257, "learning_rate": 4.227902209776881e-05, "loss": 0.6577, "num_input_tokens_seen": 25827176, "step": 44500 }, { "epoch": 6.628686327077748, "grad_norm": 1.981472373008728, "learning_rate": 4.227667359191582e-05, "loss": 0.78, "num_input_tokens_seen": 25830248, "step": 44505 }, { "epoch": 6.629431039618707, "grad_norm": 1.7950676679611206, "learning_rate": 4.227432479419099e-05, "loss": 0.7004, "num_input_tokens_seen": 25833288, "step": 44510 }, { "epoch": 6.6301757521596665, "grad_norm": 0.9428337216377258, "learning_rate": 4.227197570463399e-05, "loss": 0.7184, "num_input_tokens_seen": 25836296, "step": 44515 }, { "epoch": 6.630920464700625, "grad_norm": 1.139369010925293, "learning_rate": 4.2269626323284514e-05, "loss": 0.584, "num_input_tokens_seen": 25839112, "step": 44520 }, { "epoch": 6.631665177241585, "grad_norm": 0.9506381154060364, "learning_rate": 4.226727665018226e-05, "loss": 0.6974, "num_input_tokens_seen": 25841992, "step": 44525 }, { "epoch": 6.632409889782544, "grad_norm": 1.3769419193267822, "learning_rate": 4.2264926685366925e-05, "loss": 0.6586, "num_input_tokens_seen": 25844872, "step": 44530 }, { "epoch": 6.633154602323503, "grad_norm": 0.9022054076194763, "learning_rate": 4.2262576428878184e-05, "loss": 0.5869, "num_input_tokens_seen": 25847752, "step": 44535 }, { "epoch": 6.633899314864462, "grad_norm": 1.4308767318725586, "learning_rate": 4.226022588075577e-05, "loss": 0.7118, "num_input_tokens_seen": 25850664, "step": 44540 }, { "epoch": 6.634644027405422, "grad_norm": 1.1338036060333252, "learning_rate": 4.2257875041039375e-05, "loss": 0.5005, "num_input_tokens_seen": 25853928, "step": 44545 }, { "epoch": 6.6353887399463805, "grad_norm": 1.2588235139846802, "learning_rate": 4.225552390976873e-05, "loss": 0.5625, "num_input_tokens_seen": 25856840, "step": 44550 }, { "epoch": 6.63613345248734, "grad_norm": 1.5086499452590942, "learning_rate": 4.225317248698354e-05, "loss": 0.5618, "num_input_tokens_seen": 25859752, "step": 44555 }, { "epoch": 6.636878165028299, "grad_norm": 1.9589262008666992, "learning_rate": 4.225082077272354e-05, "loss": 0.7374, "num_input_tokens_seen": 25862888, "step": 44560 }, { "epoch": 6.6376228775692585, "grad_norm": 0.8482190370559692, "learning_rate": 4.224846876702845e-05, "loss": 0.7294, "num_input_tokens_seen": 25865640, "step": 44565 }, { "epoch": 6.638367590110217, "grad_norm": 2.5960850715637207, "learning_rate": 4.224611646993801e-05, "loss": 0.7073, "num_input_tokens_seen": 25868424, "step": 44570 }, { "epoch": 6.639112302651177, "grad_norm": 1.2215901613235474, "learning_rate": 4.224376388149197e-05, "loss": 0.6797, "num_input_tokens_seen": 25871368, "step": 44575 }, { "epoch": 6.639857015192136, "grad_norm": 1.0412368774414062, "learning_rate": 4.2241411001730057e-05, "loss": 0.7408, "num_input_tokens_seen": 25874088, "step": 44580 }, { "epoch": 6.640601727733095, "grad_norm": 1.9335412979125977, "learning_rate": 4.223905783069203e-05, "loss": 0.6009, "num_input_tokens_seen": 25877000, "step": 44585 }, { "epoch": 6.641346440274054, "grad_norm": 0.7991006970405579, "learning_rate": 4.2236704368417644e-05, "loss": 0.6868, "num_input_tokens_seen": 25879912, "step": 44590 }, { "epoch": 6.642091152815014, "grad_norm": 0.7858550548553467, "learning_rate": 4.223435061494666e-05, "loss": 0.7335, "num_input_tokens_seen": 25883176, "step": 44595 }, { "epoch": 6.6428358653559725, "grad_norm": 0.8773645162582397, "learning_rate": 4.223199657031883e-05, "loss": 0.5898, "num_input_tokens_seen": 25886216, "step": 44600 }, { "epoch": 6.643580577896932, "grad_norm": 1.5328116416931152, "learning_rate": 4.222964223457394e-05, "loss": 0.6698, "num_input_tokens_seen": 25888904, "step": 44605 }, { "epoch": 6.644325290437891, "grad_norm": 1.2581783533096313, "learning_rate": 4.2227287607751756e-05, "loss": 0.6304, "num_input_tokens_seen": 25891816, "step": 44610 }, { "epoch": 6.6450700029788505, "grad_norm": 1.4241780042648315, "learning_rate": 4.222493268989205e-05, "loss": 0.6548, "num_input_tokens_seen": 25894984, "step": 44615 }, { "epoch": 6.645814715519809, "grad_norm": 1.583776593208313, "learning_rate": 4.222257748103461e-05, "loss": 0.4672, "num_input_tokens_seen": 25897832, "step": 44620 }, { "epoch": 6.646559428060769, "grad_norm": 1.0556151866912842, "learning_rate": 4.222022198121923e-05, "loss": 0.5373, "num_input_tokens_seen": 25900712, "step": 44625 }, { "epoch": 6.647304140601728, "grad_norm": 1.152806043624878, "learning_rate": 4.221786619048571e-05, "loss": 0.6191, "num_input_tokens_seen": 25903432, "step": 44630 }, { "epoch": 6.6480488531426865, "grad_norm": 1.205756425857544, "learning_rate": 4.221551010887384e-05, "loss": 0.5212, "num_input_tokens_seen": 25906216, "step": 44635 }, { "epoch": 6.648793565683646, "grad_norm": 1.3090732097625732, "learning_rate": 4.2213153736423417e-05, "loss": 0.4833, "num_input_tokens_seen": 25908872, "step": 44640 }, { "epoch": 6.649538278224606, "grad_norm": 1.0704126358032227, "learning_rate": 4.221079707317426e-05, "loss": 0.6024, "num_input_tokens_seen": 25912008, "step": 44645 }, { "epoch": 6.6502829907655645, "grad_norm": 1.625365138053894, "learning_rate": 4.220844011916617e-05, "loss": 0.6576, "num_input_tokens_seen": 25914856, "step": 44650 }, { "epoch": 6.651027703306523, "grad_norm": 1.4934207201004028, "learning_rate": 4.2206082874438976e-05, "loss": 0.5805, "num_input_tokens_seen": 25918056, "step": 44655 }, { "epoch": 6.651772415847483, "grad_norm": 0.9816672205924988, "learning_rate": 4.2203725339032505e-05, "loss": 0.5159, "num_input_tokens_seen": 25920936, "step": 44660 }, { "epoch": 6.652517128388443, "grad_norm": 1.5134385824203491, "learning_rate": 4.220136751298659e-05, "loss": 0.7387, "num_input_tokens_seen": 25923816, "step": 44665 }, { "epoch": 6.653261840929401, "grad_norm": 1.1785507202148438, "learning_rate": 4.219900939634103e-05, "loss": 0.6676, "num_input_tokens_seen": 25926696, "step": 44670 }, { "epoch": 6.65400655347036, "grad_norm": 0.8719085454940796, "learning_rate": 4.2196650989135706e-05, "loss": 0.5861, "num_input_tokens_seen": 25929736, "step": 44675 }, { "epoch": 6.65475126601132, "grad_norm": 1.6918169260025024, "learning_rate": 4.219429229141043e-05, "loss": 0.5328, "num_input_tokens_seen": 25932744, "step": 44680 }, { "epoch": 6.6554959785522785, "grad_norm": 1.1556304693222046, "learning_rate": 4.219193330320507e-05, "loss": 0.5194, "num_input_tokens_seen": 25935752, "step": 44685 }, { "epoch": 6.656240691093238, "grad_norm": 1.453993797302246, "learning_rate": 4.2189574024559465e-05, "loss": 0.737, "num_input_tokens_seen": 25938440, "step": 44690 }, { "epoch": 6.656985403634197, "grad_norm": 0.9086483716964722, "learning_rate": 4.218721445551348e-05, "loss": 0.5979, "num_input_tokens_seen": 25941448, "step": 44695 }, { "epoch": 6.6577301161751565, "grad_norm": 1.0226631164550781, "learning_rate": 4.218485459610697e-05, "loss": 0.6236, "num_input_tokens_seen": 25944136, "step": 44700 }, { "epoch": 6.658474828716115, "grad_norm": 1.0741441249847412, "learning_rate": 4.2182494446379805e-05, "loss": 0.5613, "num_input_tokens_seen": 25946984, "step": 44705 }, { "epoch": 6.659219541257075, "grad_norm": 1.0191879272460938, "learning_rate": 4.218013400637187e-05, "loss": 0.5952, "num_input_tokens_seen": 25950280, "step": 44710 }, { "epoch": 6.659964253798034, "grad_norm": 1.234656810760498, "learning_rate": 4.217777327612303e-05, "loss": 0.7432, "num_input_tokens_seen": 25953288, "step": 44715 }, { "epoch": 6.660708966338993, "grad_norm": 1.302255630493164, "learning_rate": 4.2175412255673164e-05, "loss": 0.4931, "num_input_tokens_seen": 25956072, "step": 44720 }, { "epoch": 6.661453678879952, "grad_norm": 1.7437001466751099, "learning_rate": 4.2173050945062165e-05, "loss": 0.6369, "num_input_tokens_seen": 25958792, "step": 44725 }, { "epoch": 6.662198391420912, "grad_norm": 2.037914752960205, "learning_rate": 4.217068934432993e-05, "loss": 0.8732, "num_input_tokens_seen": 25961800, "step": 44730 }, { "epoch": 6.6629431039618705, "grad_norm": 1.4474085569381714, "learning_rate": 4.216832745351634e-05, "loss": 0.6841, "num_input_tokens_seen": 25964744, "step": 44735 }, { "epoch": 6.66368781650283, "grad_norm": 1.2291350364685059, "learning_rate": 4.2165965272661315e-05, "loss": 0.6108, "num_input_tokens_seen": 25967496, "step": 44740 }, { "epoch": 6.664432529043789, "grad_norm": 2.004345655441284, "learning_rate": 4.2163602801804745e-05, "loss": 0.7715, "num_input_tokens_seen": 25970728, "step": 44745 }, { "epoch": 6.665177241584749, "grad_norm": 1.2904773950576782, "learning_rate": 4.216124004098656e-05, "loss": 0.4143, "num_input_tokens_seen": 25973608, "step": 44750 }, { "epoch": 6.665921954125707, "grad_norm": 0.9213007688522339, "learning_rate": 4.2158876990246664e-05, "loss": 0.5787, "num_input_tokens_seen": 25976616, "step": 44755 }, { "epoch": 6.666666666666667, "grad_norm": 1.1839808225631714, "learning_rate": 4.215651364962498e-05, "loss": 0.4907, "num_input_tokens_seen": 25979656, "step": 44760 }, { "epoch": 6.667411379207626, "grad_norm": 1.0467894077301025, "learning_rate": 4.215415001916144e-05, "loss": 0.6672, "num_input_tokens_seen": 25982504, "step": 44765 }, { "epoch": 6.668156091748585, "grad_norm": 1.021592140197754, "learning_rate": 4.215178609889596e-05, "loss": 0.5741, "num_input_tokens_seen": 25986152, "step": 44770 }, { "epoch": 6.668900804289544, "grad_norm": 0.8397430777549744, "learning_rate": 4.214942188886849e-05, "loss": 0.6577, "num_input_tokens_seen": 25989672, "step": 44775 }, { "epoch": 6.669645516830504, "grad_norm": 1.6737874746322632, "learning_rate": 4.2147057389118964e-05, "loss": 0.669, "num_input_tokens_seen": 25992584, "step": 44780 }, { "epoch": 6.6703902293714625, "grad_norm": 1.3395565748214722, "learning_rate": 4.2144692599687334e-05, "loss": 0.7264, "num_input_tokens_seen": 25995624, "step": 44785 }, { "epoch": 6.671134941912422, "grad_norm": 1.6650705337524414, "learning_rate": 4.214232752061355e-05, "loss": 0.823, "num_input_tokens_seen": 25998536, "step": 44790 }, { "epoch": 6.671879654453381, "grad_norm": 1.1168529987335205, "learning_rate": 4.213996215193756e-05, "loss": 0.5742, "num_input_tokens_seen": 26001288, "step": 44795 }, { "epoch": 6.67262436699434, "grad_norm": 2.8479137420654297, "learning_rate": 4.213759649369934e-05, "loss": 0.7309, "num_input_tokens_seen": 26004232, "step": 44800 }, { "epoch": 6.673369079535299, "grad_norm": 1.0643506050109863, "learning_rate": 4.2135230545938835e-05, "loss": 0.6692, "num_input_tokens_seen": 26007272, "step": 44805 }, { "epoch": 6.674113792076259, "grad_norm": 1.3089529275894165, "learning_rate": 4.213286430869603e-05, "loss": 0.7718, "num_input_tokens_seen": 26010024, "step": 44810 }, { "epoch": 6.674858504617218, "grad_norm": 0.6349654793739319, "learning_rate": 4.2130497782010894e-05, "loss": 0.6233, "num_input_tokens_seen": 26012904, "step": 44815 }, { "epoch": 6.6756032171581765, "grad_norm": 1.4347975254058838, "learning_rate": 4.212813096592341e-05, "loss": 0.7787, "num_input_tokens_seen": 26015624, "step": 44820 }, { "epoch": 6.676347929699136, "grad_norm": 1.0687267780303955, "learning_rate": 4.212576386047356e-05, "loss": 0.6067, "num_input_tokens_seen": 26018600, "step": 44825 }, { "epoch": 6.677092642240096, "grad_norm": 1.0518391132354736, "learning_rate": 4.2123396465701336e-05, "loss": 0.4817, "num_input_tokens_seen": 26021576, "step": 44830 }, { "epoch": 6.677837354781055, "grad_norm": 1.4163585901260376, "learning_rate": 4.212102878164673e-05, "loss": 0.5463, "num_input_tokens_seen": 26024456, "step": 44835 }, { "epoch": 6.678582067322013, "grad_norm": 1.331889033317566, "learning_rate": 4.211866080834975e-05, "loss": 0.5623, "num_input_tokens_seen": 26027400, "step": 44840 }, { "epoch": 6.679326779862973, "grad_norm": 1.7515984773635864, "learning_rate": 4.2116292545850386e-05, "loss": 0.7772, "num_input_tokens_seen": 26030056, "step": 44845 }, { "epoch": 6.680071492403932, "grad_norm": 1.0016168355941772, "learning_rate": 4.2113923994188665e-05, "loss": 0.7604, "num_input_tokens_seen": 26033224, "step": 44850 }, { "epoch": 6.680816204944891, "grad_norm": 1.1805622577667236, "learning_rate": 4.211155515340458e-05, "loss": 0.6602, "num_input_tokens_seen": 26036136, "step": 44855 }, { "epoch": 6.68156091748585, "grad_norm": 1.1038440465927124, "learning_rate": 4.210918602353817e-05, "loss": 0.5872, "num_input_tokens_seen": 26039368, "step": 44860 }, { "epoch": 6.68230563002681, "grad_norm": 0.5995155572891235, "learning_rate": 4.210681660462945e-05, "loss": 0.7568, "num_input_tokens_seen": 26042600, "step": 44865 }, { "epoch": 6.6830503425677685, "grad_norm": 2.862409830093384, "learning_rate": 4.210444689671845e-05, "loss": 0.8126, "num_input_tokens_seen": 26045416, "step": 44870 }, { "epoch": 6.683795055108728, "grad_norm": 1.2361432313919067, "learning_rate": 4.2102076899845207e-05, "loss": 0.7553, "num_input_tokens_seen": 26048296, "step": 44875 }, { "epoch": 6.684539767649687, "grad_norm": 1.5078154802322388, "learning_rate": 4.209970661404975e-05, "loss": 0.8026, "num_input_tokens_seen": 26051048, "step": 44880 }, { "epoch": 6.685284480190647, "grad_norm": 1.1374757289886475, "learning_rate": 4.209733603937214e-05, "loss": 0.7379, "num_input_tokens_seen": 26054120, "step": 44885 }, { "epoch": 6.686029192731605, "grad_norm": 0.9709582328796387, "learning_rate": 4.2094965175852395e-05, "loss": 0.5561, "num_input_tokens_seen": 26057128, "step": 44890 }, { "epoch": 6.686773905272565, "grad_norm": 0.8463758230209351, "learning_rate": 4.209259402353061e-05, "loss": 0.6718, "num_input_tokens_seen": 26059880, "step": 44895 }, { "epoch": 6.687518617813524, "grad_norm": 0.761501669883728, "learning_rate": 4.20902225824468e-05, "loss": 0.6004, "num_input_tokens_seen": 26062920, "step": 44900 }, { "epoch": 6.688263330354483, "grad_norm": 0.7829854488372803, "learning_rate": 4.208785085264106e-05, "loss": 0.5509, "num_input_tokens_seen": 26065544, "step": 44905 }, { "epoch": 6.689008042895442, "grad_norm": 1.2640390396118164, "learning_rate": 4.2085478834153454e-05, "loss": 0.5041, "num_input_tokens_seen": 26068168, "step": 44910 }, { "epoch": 6.689752755436402, "grad_norm": 1.2384721040725708, "learning_rate": 4.208310652702404e-05, "loss": 0.7524, "num_input_tokens_seen": 26071048, "step": 44915 }, { "epoch": 6.690497467977361, "grad_norm": 1.2935717105865479, "learning_rate": 4.208073393129291e-05, "loss": 0.6638, "num_input_tokens_seen": 26074344, "step": 44920 }, { "epoch": 6.69124218051832, "grad_norm": 0.859571099281311, "learning_rate": 4.207836104700013e-05, "loss": 0.5655, "num_input_tokens_seen": 26077480, "step": 44925 }, { "epoch": 6.691986893059279, "grad_norm": 1.2998719215393066, "learning_rate": 4.207598787418581e-05, "loss": 0.4068, "num_input_tokens_seen": 26080552, "step": 44930 }, { "epoch": 6.692731605600239, "grad_norm": 1.2970943450927734, "learning_rate": 4.207361441289002e-05, "loss": 0.7497, "num_input_tokens_seen": 26083528, "step": 44935 }, { "epoch": 6.693476318141197, "grad_norm": 3.904053211212158, "learning_rate": 4.207124066315287e-05, "loss": 0.7022, "num_input_tokens_seen": 26086568, "step": 44940 }, { "epoch": 6.694221030682157, "grad_norm": 0.8094388246536255, "learning_rate": 4.206886662501446e-05, "loss": 0.5523, "num_input_tokens_seen": 26089480, "step": 44945 }, { "epoch": 6.694965743223116, "grad_norm": 1.093841791152954, "learning_rate": 4.2066492298514895e-05, "loss": 0.6991, "num_input_tokens_seen": 26092136, "step": 44950 }, { "epoch": 6.695710455764075, "grad_norm": 1.8317919969558716, "learning_rate": 4.2064117683694294e-05, "loss": 0.715, "num_input_tokens_seen": 26095144, "step": 44955 }, { "epoch": 6.696455168305034, "grad_norm": 1.7760779857635498, "learning_rate": 4.206174278059276e-05, "loss": 0.6926, "num_input_tokens_seen": 26098184, "step": 44960 }, { "epoch": 6.697199880845994, "grad_norm": 1.8440223932266235, "learning_rate": 4.205936758925043e-05, "loss": 0.8753, "num_input_tokens_seen": 26101096, "step": 44965 }, { "epoch": 6.697944593386953, "grad_norm": 1.1707987785339355, "learning_rate": 4.2056992109707415e-05, "loss": 0.7373, "num_input_tokens_seen": 26103976, "step": 44970 }, { "epoch": 6.698689305927912, "grad_norm": 1.6884855031967163, "learning_rate": 4.205461634200386e-05, "loss": 0.5096, "num_input_tokens_seen": 26107016, "step": 44975 }, { "epoch": 6.699434018468871, "grad_norm": 0.8212472796440125, "learning_rate": 4.2052240286179886e-05, "loss": 0.5822, "num_input_tokens_seen": 26109768, "step": 44980 }, { "epoch": 6.70017873100983, "grad_norm": 2.303333282470703, "learning_rate": 4.204986394227566e-05, "loss": 0.7903, "num_input_tokens_seen": 26112808, "step": 44985 }, { "epoch": 6.700923443550789, "grad_norm": 1.6895477771759033, "learning_rate": 4.20474873103313e-05, "loss": 0.5573, "num_input_tokens_seen": 26115688, "step": 44990 }, { "epoch": 6.701668156091749, "grad_norm": 1.0288796424865723, "learning_rate": 4.204511039038697e-05, "loss": 0.5635, "num_input_tokens_seen": 26118696, "step": 44995 }, { "epoch": 6.702412868632708, "grad_norm": 1.757056474685669, "learning_rate": 4.204273318248283e-05, "loss": 0.7729, "num_input_tokens_seen": 26121768, "step": 45000 }, { "epoch": 6.703157581173667, "grad_norm": 2.0183465480804443, "learning_rate": 4.204035568665903e-05, "loss": 0.6555, "num_input_tokens_seen": 26124360, "step": 45005 }, { "epoch": 6.703902293714626, "grad_norm": 0.9191532135009766, "learning_rate": 4.203797790295574e-05, "loss": 0.6685, "num_input_tokens_seen": 26127400, "step": 45010 }, { "epoch": 6.704647006255585, "grad_norm": 1.3538857698440552, "learning_rate": 4.203559983141312e-05, "loss": 0.7928, "num_input_tokens_seen": 26130184, "step": 45015 }, { "epoch": 6.705391718796545, "grad_norm": 1.347449779510498, "learning_rate": 4.2033221472071364e-05, "loss": 0.5571, "num_input_tokens_seen": 26133224, "step": 45020 }, { "epoch": 6.706136431337503, "grad_norm": 0.9737610816955566, "learning_rate": 4.2030842824970645e-05, "loss": 0.4717, "num_input_tokens_seen": 26136264, "step": 45025 }, { "epoch": 6.706881143878463, "grad_norm": 0.8651151061058044, "learning_rate": 4.2028463890151144e-05, "loss": 0.6753, "num_input_tokens_seen": 26138888, "step": 45030 }, { "epoch": 6.707625856419422, "grad_norm": 1.1823617219924927, "learning_rate": 4.202608466765306e-05, "loss": 0.6189, "num_input_tokens_seen": 26141640, "step": 45035 }, { "epoch": 6.708370568960381, "grad_norm": 1.4755284786224365, "learning_rate": 4.202370515751657e-05, "loss": 0.7682, "num_input_tokens_seen": 26144904, "step": 45040 }, { "epoch": 6.70911528150134, "grad_norm": 1.8923566341400146, "learning_rate": 4.2021325359781885e-05, "loss": 0.6846, "num_input_tokens_seen": 26147464, "step": 45045 }, { "epoch": 6.7098599940423, "grad_norm": 1.2695651054382324, "learning_rate": 4.201894527448921e-05, "loss": 0.5794, "num_input_tokens_seen": 26150824, "step": 45050 }, { "epoch": 6.710604706583259, "grad_norm": 0.835304319858551, "learning_rate": 4.2016564901678744e-05, "loss": 0.6398, "num_input_tokens_seen": 26153608, "step": 45055 }, { "epoch": 6.711349419124218, "grad_norm": 1.1598848104476929, "learning_rate": 4.201418424139072e-05, "loss": 0.7377, "num_input_tokens_seen": 26156552, "step": 45060 }, { "epoch": 6.712094131665177, "grad_norm": 1.2800334692001343, "learning_rate": 4.201180329366534e-05, "loss": 0.5554, "num_input_tokens_seen": 26159272, "step": 45065 }, { "epoch": 6.712838844206137, "grad_norm": 1.0232396125793457, "learning_rate": 4.200942205854282e-05, "loss": 0.5949, "num_input_tokens_seen": 26162024, "step": 45070 }, { "epoch": 6.713583556747095, "grad_norm": 0.9065468311309814, "learning_rate": 4.2007040536063424e-05, "loss": 0.6157, "num_input_tokens_seen": 26164840, "step": 45075 }, { "epoch": 6.714328269288055, "grad_norm": 1.3545089960098267, "learning_rate": 4.200465872626736e-05, "loss": 0.5655, "num_input_tokens_seen": 26167688, "step": 45080 }, { "epoch": 6.715072981829014, "grad_norm": 1.6280343532562256, "learning_rate": 4.200227662919487e-05, "loss": 0.6221, "num_input_tokens_seen": 26170760, "step": 45085 }, { "epoch": 6.7158176943699734, "grad_norm": 1.2896848917007446, "learning_rate": 4.1999894244886184e-05, "loss": 0.7327, "num_input_tokens_seen": 26173896, "step": 45090 }, { "epoch": 6.716562406910932, "grad_norm": 0.8752096891403198, "learning_rate": 4.1997511573381575e-05, "loss": 0.716, "num_input_tokens_seen": 26176488, "step": 45095 }, { "epoch": 6.717307119451892, "grad_norm": 1.4259175062179565, "learning_rate": 4.199512861472128e-05, "loss": 0.7243, "num_input_tokens_seen": 26179432, "step": 45100 }, { "epoch": 6.718051831992851, "grad_norm": 1.1682521104812622, "learning_rate": 4.1992745368945554e-05, "loss": 0.8813, "num_input_tokens_seen": 26182696, "step": 45105 }, { "epoch": 6.71879654453381, "grad_norm": 1.665777325630188, "learning_rate": 4.199036183609467e-05, "loss": 0.7294, "num_input_tokens_seen": 26185768, "step": 45110 }, { "epoch": 6.719541257074769, "grad_norm": 1.0326273441314697, "learning_rate": 4.1987978016208895e-05, "loss": 0.6042, "num_input_tokens_seen": 26189064, "step": 45115 }, { "epoch": 6.720285969615729, "grad_norm": 1.2524185180664062, "learning_rate": 4.1985593909328494e-05, "loss": 0.7035, "num_input_tokens_seen": 26191880, "step": 45120 }, { "epoch": 6.721030682156687, "grad_norm": 0.8530955910682678, "learning_rate": 4.198320951549375e-05, "loss": 0.689, "num_input_tokens_seen": 26194728, "step": 45125 }, { "epoch": 6.721775394697647, "grad_norm": 1.3345725536346436, "learning_rate": 4.1980824834744934e-05, "loss": 0.74, "num_input_tokens_seen": 26197384, "step": 45130 }, { "epoch": 6.722520107238606, "grad_norm": 1.2836122512817383, "learning_rate": 4.1978439867122344e-05, "loss": 0.6307, "num_input_tokens_seen": 26200360, "step": 45135 }, { "epoch": 6.7232648197795655, "grad_norm": 1.0957027673721313, "learning_rate": 4.197605461266627e-05, "loss": 0.5594, "num_input_tokens_seen": 26203400, "step": 45140 }, { "epoch": 6.724009532320524, "grad_norm": 1.0852950811386108, "learning_rate": 4.197366907141701e-05, "loss": 0.7201, "num_input_tokens_seen": 26206152, "step": 45145 }, { "epoch": 6.724754244861483, "grad_norm": 1.1203542947769165, "learning_rate": 4.197128324341486e-05, "loss": 0.5209, "num_input_tokens_seen": 26209064, "step": 45150 }, { "epoch": 6.725498957402443, "grad_norm": 1.089137077331543, "learning_rate": 4.196889712870013e-05, "loss": 0.5475, "num_input_tokens_seen": 26211816, "step": 45155 }, { "epoch": 6.726243669943402, "grad_norm": 1.4299278259277344, "learning_rate": 4.196651072731313e-05, "loss": 0.6714, "num_input_tokens_seen": 26214888, "step": 45160 }, { "epoch": 6.726988382484361, "grad_norm": 1.2226864099502563, "learning_rate": 4.196412403929417e-05, "loss": 0.7049, "num_input_tokens_seen": 26217672, "step": 45165 }, { "epoch": 6.72773309502532, "grad_norm": 1.4609811305999756, "learning_rate": 4.196173706468358e-05, "loss": 0.7957, "num_input_tokens_seen": 26220840, "step": 45170 }, { "epoch": 6.7284778075662794, "grad_norm": 0.7032378911972046, "learning_rate": 4.195934980352169e-05, "loss": 0.5719, "num_input_tokens_seen": 26223944, "step": 45175 }, { "epoch": 6.729222520107239, "grad_norm": 0.8913888335227966, "learning_rate": 4.195696225584881e-05, "loss": 0.6179, "num_input_tokens_seen": 26226760, "step": 45180 }, { "epoch": 6.729967232648198, "grad_norm": 2.369478940963745, "learning_rate": 4.195457442170528e-05, "loss": 0.7377, "num_input_tokens_seen": 26229736, "step": 45185 }, { "epoch": 6.730711945189157, "grad_norm": 1.0915135145187378, "learning_rate": 4.195218630113146e-05, "loss": 0.5523, "num_input_tokens_seen": 26232552, "step": 45190 }, { "epoch": 6.731456657730116, "grad_norm": 1.120539665222168, "learning_rate": 4.1949797894167676e-05, "loss": 0.5195, "num_input_tokens_seen": 26235176, "step": 45195 }, { "epoch": 6.732201370271075, "grad_norm": 1.4638434648513794, "learning_rate": 4.1947409200854296e-05, "loss": 0.6423, "num_input_tokens_seen": 26238216, "step": 45200 }, { "epoch": 6.732946082812035, "grad_norm": 1.045182704925537, "learning_rate": 4.1945020221231643e-05, "loss": 0.6646, "num_input_tokens_seen": 26240936, "step": 45205 }, { "epoch": 6.733690795352993, "grad_norm": 1.4001449346542358, "learning_rate": 4.194263095534011e-05, "loss": 0.6367, "num_input_tokens_seen": 26243752, "step": 45210 }, { "epoch": 6.734435507893953, "grad_norm": 0.8335675597190857, "learning_rate": 4.194024140322004e-05, "loss": 0.557, "num_input_tokens_seen": 26246696, "step": 45215 }, { "epoch": 6.735180220434912, "grad_norm": 0.9363516569137573, "learning_rate": 4.193785156491181e-05, "loss": 0.625, "num_input_tokens_seen": 26249832, "step": 45220 }, { "epoch": 6.7359249329758715, "grad_norm": 1.8969413042068481, "learning_rate": 4.193546144045579e-05, "loss": 0.6837, "num_input_tokens_seen": 26253032, "step": 45225 }, { "epoch": 6.73666964551683, "grad_norm": 1.1385993957519531, "learning_rate": 4.193307102989237e-05, "loss": 0.5814, "num_input_tokens_seen": 26255944, "step": 45230 }, { "epoch": 6.73741435805779, "grad_norm": 0.8303695321083069, "learning_rate": 4.1930680333261915e-05, "loss": 0.5422, "num_input_tokens_seen": 26258440, "step": 45235 }, { "epoch": 6.738159070598749, "grad_norm": 1.1464930772781372, "learning_rate": 4.1928289350604826e-05, "loss": 0.6054, "num_input_tokens_seen": 26261224, "step": 45240 }, { "epoch": 6.738903783139708, "grad_norm": 0.7639818787574768, "learning_rate": 4.19258980819615e-05, "loss": 0.648, "num_input_tokens_seen": 26264584, "step": 45245 }, { "epoch": 6.739648495680667, "grad_norm": 1.1409302949905396, "learning_rate": 4.192350652737232e-05, "loss": 0.4127, "num_input_tokens_seen": 26267208, "step": 45250 }, { "epoch": 6.740393208221627, "grad_norm": 1.4507302045822144, "learning_rate": 4.19211146868777e-05, "loss": 0.6683, "num_input_tokens_seen": 26270088, "step": 45255 }, { "epoch": 6.7411379207625854, "grad_norm": 2.46026873588562, "learning_rate": 4.1918722560518045e-05, "loss": 0.6277, "num_input_tokens_seen": 26273000, "step": 45260 }, { "epoch": 6.741882633303545, "grad_norm": 1.1652554273605347, "learning_rate": 4.191633014833377e-05, "loss": 0.5815, "num_input_tokens_seen": 26275656, "step": 45265 }, { "epoch": 6.742627345844504, "grad_norm": 0.8499526977539062, "learning_rate": 4.191393745036529e-05, "loss": 0.6049, "num_input_tokens_seen": 26278536, "step": 45270 }, { "epoch": 6.7433720583854635, "grad_norm": 0.8974575400352478, "learning_rate": 4.191154446665303e-05, "loss": 0.4611, "num_input_tokens_seen": 26281384, "step": 45275 }, { "epoch": 6.744116770926422, "grad_norm": 0.9898697733879089, "learning_rate": 4.19091511972374e-05, "loss": 0.7359, "num_input_tokens_seen": 26284392, "step": 45280 }, { "epoch": 6.744861483467382, "grad_norm": 0.9186433553695679, "learning_rate": 4.1906757642158865e-05, "loss": 0.6419, "num_input_tokens_seen": 26287080, "step": 45285 }, { "epoch": 6.745606196008341, "grad_norm": 1.1868228912353516, "learning_rate": 4.1904363801457835e-05, "loss": 0.6177, "num_input_tokens_seen": 26289928, "step": 45290 }, { "epoch": 6.7463509085493, "grad_norm": 1.674676537513733, "learning_rate": 4.1901969675174755e-05, "loss": 0.5092, "num_input_tokens_seen": 26292776, "step": 45295 }, { "epoch": 6.747095621090259, "grad_norm": 1.4374765157699585, "learning_rate": 4.189957526335009e-05, "loss": 0.6675, "num_input_tokens_seen": 26295688, "step": 45300 }, { "epoch": 6.747840333631219, "grad_norm": 1.7578715085983276, "learning_rate": 4.1897180566024266e-05, "loss": 0.5607, "num_input_tokens_seen": 26298600, "step": 45305 }, { "epoch": 6.7485850461721775, "grad_norm": 1.1742758750915527, "learning_rate": 4.189478558323775e-05, "loss": 0.7691, "num_input_tokens_seen": 26301448, "step": 45310 }, { "epoch": 6.749329758713137, "grad_norm": 1.061680555343628, "learning_rate": 4.1892390315031e-05, "loss": 0.6624, "num_input_tokens_seen": 26304520, "step": 45315 }, { "epoch": 6.750074471254096, "grad_norm": 1.7897295951843262, "learning_rate": 4.188999476144449e-05, "loss": 0.7674, "num_input_tokens_seen": 26307944, "step": 45320 }, { "epoch": 6.7508191837950555, "grad_norm": 1.3582708835601807, "learning_rate": 4.188759892251868e-05, "loss": 0.6863, "num_input_tokens_seen": 26310856, "step": 45325 }, { "epoch": 6.751563896336014, "grad_norm": 1.770407795906067, "learning_rate": 4.188520279829406e-05, "loss": 0.9334, "num_input_tokens_seen": 26313832, "step": 45330 }, { "epoch": 6.752308608876973, "grad_norm": 1.6233851909637451, "learning_rate": 4.188280638881109e-05, "loss": 0.5145, "num_input_tokens_seen": 26316744, "step": 45335 }, { "epoch": 6.753053321417933, "grad_norm": 2.20275616645813, "learning_rate": 4.188040969411027e-05, "loss": 0.6915, "num_input_tokens_seen": 26319784, "step": 45340 }, { "epoch": 6.753798033958892, "grad_norm": 1.1223773956298828, "learning_rate": 4.187801271423207e-05, "loss": 0.6753, "num_input_tokens_seen": 26322728, "step": 45345 }, { "epoch": 6.754542746499851, "grad_norm": 1.1647475957870483, "learning_rate": 4.187561544921702e-05, "loss": 0.6078, "num_input_tokens_seen": 26325736, "step": 45350 }, { "epoch": 6.75528745904081, "grad_norm": 0.7978506088256836, "learning_rate": 4.187321789910559e-05, "loss": 0.5883, "num_input_tokens_seen": 26328744, "step": 45355 }, { "epoch": 6.7560321715817695, "grad_norm": 1.4777361154556274, "learning_rate": 4.1870820063938296e-05, "loss": 0.5758, "num_input_tokens_seen": 26331784, "step": 45360 }, { "epoch": 6.756776884122728, "grad_norm": 1.0053819417953491, "learning_rate": 4.186842194375564e-05, "loss": 0.6859, "num_input_tokens_seen": 26334696, "step": 45365 }, { "epoch": 6.757521596663688, "grad_norm": 1.1921287775039673, "learning_rate": 4.1866023538598136e-05, "loss": 0.6591, "num_input_tokens_seen": 26337512, "step": 45370 }, { "epoch": 6.758266309204647, "grad_norm": 0.9573994874954224, "learning_rate": 4.186362484850631e-05, "loss": 0.4892, "num_input_tokens_seen": 26340456, "step": 45375 }, { "epoch": 6.759011021745606, "grad_norm": 1.2782742977142334, "learning_rate": 4.1861225873520684e-05, "loss": 0.5893, "num_input_tokens_seen": 26343336, "step": 45380 }, { "epoch": 6.759755734286565, "grad_norm": 0.9765234589576721, "learning_rate": 4.185882661368178e-05, "loss": 0.6145, "num_input_tokens_seen": 26346120, "step": 45385 }, { "epoch": 6.760500446827525, "grad_norm": 1.8846052885055542, "learning_rate": 4.185642706903014e-05, "loss": 0.6196, "num_input_tokens_seen": 26348904, "step": 45390 }, { "epoch": 6.7612451593684835, "grad_norm": 0.9360989332199097, "learning_rate": 4.185402723960629e-05, "loss": 0.7751, "num_input_tokens_seen": 26351816, "step": 45395 }, { "epoch": 6.761989871909443, "grad_norm": 0.9899221062660217, "learning_rate": 4.185162712545079e-05, "loss": 0.4737, "num_input_tokens_seen": 26354536, "step": 45400 }, { "epoch": 6.762734584450402, "grad_norm": 1.023546576499939, "learning_rate": 4.1849226726604165e-05, "loss": 0.6643, "num_input_tokens_seen": 26357288, "step": 45405 }, { "epoch": 6.7634792969913615, "grad_norm": 1.787955403327942, "learning_rate": 4.184682604310698e-05, "loss": 0.6117, "num_input_tokens_seen": 26360168, "step": 45410 }, { "epoch": 6.76422400953232, "grad_norm": 1.5273016691207886, "learning_rate": 4.18444250749998e-05, "loss": 0.6506, "num_input_tokens_seen": 26362728, "step": 45415 }, { "epoch": 6.76496872207328, "grad_norm": 1.2784061431884766, "learning_rate": 4.184202382232317e-05, "loss": 0.7029, "num_input_tokens_seen": 26365768, "step": 45420 }, { "epoch": 6.765713434614239, "grad_norm": 1.0027250051498413, "learning_rate": 4.183962228511767e-05, "loss": 0.5781, "num_input_tokens_seen": 26368680, "step": 45425 }, { "epoch": 6.766458147155198, "grad_norm": 0.8836030960083008, "learning_rate": 4.183722046342386e-05, "loss": 0.6702, "num_input_tokens_seen": 26371880, "step": 45430 }, { "epoch": 6.767202859696157, "grad_norm": 0.6780269145965576, "learning_rate": 4.1834818357282336e-05, "loss": 0.6368, "num_input_tokens_seen": 26374856, "step": 45435 }, { "epoch": 6.767947572237117, "grad_norm": 1.7719188928604126, "learning_rate": 4.183241596673366e-05, "loss": 0.6785, "num_input_tokens_seen": 26377768, "step": 45440 }, { "epoch": 6.7686922847780755, "grad_norm": 0.9532424211502075, "learning_rate": 4.183001329181843e-05, "loss": 0.7267, "num_input_tokens_seen": 26380712, "step": 45445 }, { "epoch": 6.769436997319035, "grad_norm": 0.6304320096969604, "learning_rate": 4.1827610332577214e-05, "loss": 0.5592, "num_input_tokens_seen": 26383624, "step": 45450 }, { "epoch": 6.770181709859994, "grad_norm": 1.1877014636993408, "learning_rate": 4.1825207089050634e-05, "loss": 0.7236, "num_input_tokens_seen": 26386568, "step": 45455 }, { "epoch": 6.7709264224009535, "grad_norm": 1.592250943183899, "learning_rate": 4.182280356127928e-05, "loss": 0.5336, "num_input_tokens_seen": 26389160, "step": 45460 }, { "epoch": 6.771671134941912, "grad_norm": 0.7637169361114502, "learning_rate": 4.182039974930376e-05, "loss": 0.6061, "num_input_tokens_seen": 26391880, "step": 45465 }, { "epoch": 6.772415847482872, "grad_norm": 0.9251262545585632, "learning_rate": 4.1817995653164675e-05, "loss": 0.7248, "num_input_tokens_seen": 26394664, "step": 45470 }, { "epoch": 6.773160560023831, "grad_norm": 1.3894447088241577, "learning_rate": 4.1815591272902654e-05, "loss": 0.7143, "num_input_tokens_seen": 26397512, "step": 45475 }, { "epoch": 6.77390527256479, "grad_norm": 0.9542392492294312, "learning_rate": 4.1813186608558305e-05, "loss": 0.6633, "num_input_tokens_seen": 26400328, "step": 45480 }, { "epoch": 6.774649985105749, "grad_norm": 1.881173014640808, "learning_rate": 4.181078166017226e-05, "loss": 0.7213, "num_input_tokens_seen": 26402952, "step": 45485 }, { "epoch": 6.775394697646709, "grad_norm": 1.1590522527694702, "learning_rate": 4.180837642778513e-05, "loss": 0.6177, "num_input_tokens_seen": 26405768, "step": 45490 }, { "epoch": 6.7761394101876675, "grad_norm": 1.6824744939804077, "learning_rate": 4.180597091143759e-05, "loss": 0.6972, "num_input_tokens_seen": 26408552, "step": 45495 }, { "epoch": 6.776884122728626, "grad_norm": 0.9645102024078369, "learning_rate": 4.1803565111170227e-05, "loss": 0.6803, "num_input_tokens_seen": 26411176, "step": 45500 }, { "epoch": 6.777628835269586, "grad_norm": 0.55876225233078, "learning_rate": 4.180115902702372e-05, "loss": 0.4008, "num_input_tokens_seen": 26413960, "step": 45505 }, { "epoch": 6.778373547810546, "grad_norm": 0.8162534832954407, "learning_rate": 4.179875265903871e-05, "loss": 0.6349, "num_input_tokens_seen": 26416936, "step": 45510 }, { "epoch": 6.779118260351504, "grad_norm": 0.9780896902084351, "learning_rate": 4.1796346007255844e-05, "loss": 0.5194, "num_input_tokens_seen": 26419944, "step": 45515 }, { "epoch": 6.779862972892463, "grad_norm": 1.0029609203338623, "learning_rate": 4.1793939071715786e-05, "loss": 0.5333, "num_input_tokens_seen": 26422824, "step": 45520 }, { "epoch": 6.780607685433423, "grad_norm": 0.9378268122673035, "learning_rate": 4.1791531852459196e-05, "loss": 0.3985, "num_input_tokens_seen": 26425736, "step": 45525 }, { "epoch": 6.781352397974382, "grad_norm": 2.0113704204559326, "learning_rate": 4.1789124349526745e-05, "loss": 0.7188, "num_input_tokens_seen": 26428776, "step": 45530 }, { "epoch": 6.782097110515341, "grad_norm": 2.7039759159088135, "learning_rate": 4.178671656295909e-05, "loss": 0.5149, "num_input_tokens_seen": 26431496, "step": 45535 }, { "epoch": 6.7828418230563, "grad_norm": 1.2638330459594727, "learning_rate": 4.1784308492796926e-05, "loss": 0.6167, "num_input_tokens_seen": 26434600, "step": 45540 }, { "epoch": 6.7835865355972595, "grad_norm": 2.486582040786743, "learning_rate": 4.1781900139080933e-05, "loss": 0.7283, "num_input_tokens_seen": 26437448, "step": 45545 }, { "epoch": 6.784331248138218, "grad_norm": 1.1134939193725586, "learning_rate": 4.1779491501851786e-05, "loss": 0.71, "num_input_tokens_seen": 26440328, "step": 45550 }, { "epoch": 6.785075960679178, "grad_norm": 1.1418874263763428, "learning_rate": 4.177708258115019e-05, "loss": 0.678, "num_input_tokens_seen": 26443240, "step": 45555 }, { "epoch": 6.785820673220137, "grad_norm": 1.8931818008422852, "learning_rate": 4.177467337701683e-05, "loss": 0.5241, "num_input_tokens_seen": 26445928, "step": 45560 }, { "epoch": 6.786565385761096, "grad_norm": 2.0427873134613037, "learning_rate": 4.177226388949241e-05, "loss": 0.7273, "num_input_tokens_seen": 26448872, "step": 45565 }, { "epoch": 6.787310098302055, "grad_norm": 0.9865609407424927, "learning_rate": 4.176985411861765e-05, "loss": 0.5245, "num_input_tokens_seen": 26451912, "step": 45570 }, { "epoch": 6.788054810843015, "grad_norm": 1.6820727586746216, "learning_rate": 4.1767444064433244e-05, "loss": 0.6733, "num_input_tokens_seen": 26454664, "step": 45575 }, { "epoch": 6.7887995233839735, "grad_norm": 1.9177783727645874, "learning_rate": 4.1765033726979906e-05, "loss": 0.7851, "num_input_tokens_seen": 26457448, "step": 45580 }, { "epoch": 6.789544235924933, "grad_norm": 2.1837220191955566, "learning_rate": 4.176262310629837e-05, "loss": 0.7728, "num_input_tokens_seen": 26460456, "step": 45585 }, { "epoch": 6.790288948465892, "grad_norm": 2.7232816219329834, "learning_rate": 4.176021220242935e-05, "loss": 0.6221, "num_input_tokens_seen": 26463528, "step": 45590 }, { "epoch": 6.791033661006852, "grad_norm": 1.9010511636734009, "learning_rate": 4.175780101541358e-05, "loss": 0.5933, "num_input_tokens_seen": 26466440, "step": 45595 }, { "epoch": 6.79177837354781, "grad_norm": 1.5774314403533936, "learning_rate": 4.175538954529179e-05, "loss": 0.6633, "num_input_tokens_seen": 26469160, "step": 45600 }, { "epoch": 6.79252308608877, "grad_norm": 2.4382665157318115, "learning_rate": 4.175297779210473e-05, "loss": 0.6962, "num_input_tokens_seen": 26471688, "step": 45605 }, { "epoch": 6.793267798629729, "grad_norm": 0.8897668719291687, "learning_rate": 4.1750565755893134e-05, "loss": 0.6236, "num_input_tokens_seen": 26474504, "step": 45610 }, { "epoch": 6.794012511170688, "grad_norm": 1.1594918966293335, "learning_rate": 4.174815343669775e-05, "loss": 0.5675, "num_input_tokens_seen": 26477480, "step": 45615 }, { "epoch": 6.794757223711647, "grad_norm": 0.6036248803138733, "learning_rate": 4.1745740834559335e-05, "loss": 0.413, "num_input_tokens_seen": 26480200, "step": 45620 }, { "epoch": 6.795501936252607, "grad_norm": 1.165998101234436, "learning_rate": 4.174332794951866e-05, "loss": 0.6493, "num_input_tokens_seen": 26483048, "step": 45625 }, { "epoch": 6.7962466487935655, "grad_norm": 1.6138774156570435, "learning_rate": 4.174091478161646e-05, "loss": 0.5807, "num_input_tokens_seen": 26485768, "step": 45630 }, { "epoch": 6.796991361334525, "grad_norm": 1.1550315618515015, "learning_rate": 4.173850133089353e-05, "loss": 0.6934, "num_input_tokens_seen": 26488872, "step": 45635 }, { "epoch": 6.797736073875484, "grad_norm": 1.2411857843399048, "learning_rate": 4.173608759739063e-05, "loss": 0.7116, "num_input_tokens_seen": 26491784, "step": 45640 }, { "epoch": 6.798480786416444, "grad_norm": 0.6791526675224304, "learning_rate": 4.173367358114855e-05, "loss": 0.5045, "num_input_tokens_seen": 26494696, "step": 45645 }, { "epoch": 6.799225498957402, "grad_norm": 1.328257441520691, "learning_rate": 4.1731259282208047e-05, "loss": 0.7638, "num_input_tokens_seen": 26497384, "step": 45650 }, { "epoch": 6.799970211498362, "grad_norm": 0.9839432835578918, "learning_rate": 4.1728844700609926e-05, "loss": 0.4516, "num_input_tokens_seen": 26500392, "step": 45655 }, { "epoch": 6.800714924039321, "grad_norm": 1.1564292907714844, "learning_rate": 4.172642983639498e-05, "loss": 0.473, "num_input_tokens_seen": 26503176, "step": 45660 }, { "epoch": 6.8014596365802795, "grad_norm": 1.2288095951080322, "learning_rate": 4.1724014689604e-05, "loss": 0.5794, "num_input_tokens_seen": 26505992, "step": 45665 }, { "epoch": 6.802204349121239, "grad_norm": 0.7508024573326111, "learning_rate": 4.1721599260277796e-05, "loss": 0.4799, "num_input_tokens_seen": 26509064, "step": 45670 }, { "epoch": 6.802949061662199, "grad_norm": 1.100738525390625, "learning_rate": 4.171918354845716e-05, "loss": 0.8403, "num_input_tokens_seen": 26512168, "step": 45675 }, { "epoch": 6.803693774203158, "grad_norm": 2.067216157913208, "learning_rate": 4.171676755418291e-05, "loss": 0.7897, "num_input_tokens_seen": 26514984, "step": 45680 }, { "epoch": 6.804438486744116, "grad_norm": 0.8128723502159119, "learning_rate": 4.171435127749587e-05, "loss": 0.7265, "num_input_tokens_seen": 26518088, "step": 45685 }, { "epoch": 6.805183199285076, "grad_norm": 1.1435301303863525, "learning_rate": 4.171193471843685e-05, "loss": 0.6061, "num_input_tokens_seen": 26520904, "step": 45690 }, { "epoch": 6.805927911826036, "grad_norm": 1.1885418891906738, "learning_rate": 4.170951787704667e-05, "loss": 0.8332, "num_input_tokens_seen": 26523688, "step": 45695 }, { "epoch": 6.806672624366994, "grad_norm": 1.135934829711914, "learning_rate": 4.170710075336617e-05, "loss": 0.7444, "num_input_tokens_seen": 26526568, "step": 45700 }, { "epoch": 6.807417336907953, "grad_norm": 1.141530990600586, "learning_rate": 4.170468334743619e-05, "loss": 0.5989, "num_input_tokens_seen": 26529544, "step": 45705 }, { "epoch": 6.808162049448913, "grad_norm": 0.9373657703399658, "learning_rate": 4.1702265659297554e-05, "loss": 0.7287, "num_input_tokens_seen": 26532488, "step": 45710 }, { "epoch": 6.8089067619898715, "grad_norm": 1.0715007781982422, "learning_rate": 4.169984768899112e-05, "loss": 0.6468, "num_input_tokens_seen": 26535464, "step": 45715 }, { "epoch": 6.809651474530831, "grad_norm": 1.7965348958969116, "learning_rate": 4.169742943655774e-05, "loss": 0.7179, "num_input_tokens_seen": 26538536, "step": 45720 }, { "epoch": 6.81039618707179, "grad_norm": 0.9126522541046143, "learning_rate": 4.169501090203826e-05, "loss": 0.6475, "num_input_tokens_seen": 26541128, "step": 45725 }, { "epoch": 6.81114089961275, "grad_norm": 1.2853015661239624, "learning_rate": 4.1692592085473525e-05, "loss": 0.5564, "num_input_tokens_seen": 26543976, "step": 45730 }, { "epoch": 6.811885612153708, "grad_norm": 1.3651278018951416, "learning_rate": 4.169017298690442e-05, "loss": 0.6196, "num_input_tokens_seen": 26546856, "step": 45735 }, { "epoch": 6.812630324694668, "grad_norm": 0.7551059126853943, "learning_rate": 4.168775360637181e-05, "loss": 0.4136, "num_input_tokens_seen": 26550024, "step": 45740 }, { "epoch": 6.813375037235627, "grad_norm": 1.4315581321716309, "learning_rate": 4.168533394391656e-05, "loss": 0.7164, "num_input_tokens_seen": 26553096, "step": 45745 }, { "epoch": 6.814119749776586, "grad_norm": 1.871899127960205, "learning_rate": 4.1682913999579545e-05, "loss": 0.7128, "num_input_tokens_seen": 26555912, "step": 45750 }, { "epoch": 6.814864462317545, "grad_norm": 1.139970064163208, "learning_rate": 4.1680493773401657e-05, "loss": 0.5958, "num_input_tokens_seen": 26558792, "step": 45755 }, { "epoch": 6.815609174858505, "grad_norm": 1.2776641845703125, "learning_rate": 4.167807326542379e-05, "loss": 0.7486, "num_input_tokens_seen": 26561704, "step": 45760 }, { "epoch": 6.816353887399464, "grad_norm": 1.3195871114730835, "learning_rate": 4.167565247568681e-05, "loss": 0.7893, "num_input_tokens_seen": 26564456, "step": 45765 }, { "epoch": 6.817098599940423, "grad_norm": 1.8814738988876343, "learning_rate": 4.167323140423164e-05, "loss": 0.5811, "num_input_tokens_seen": 26567272, "step": 45770 }, { "epoch": 6.817843312481382, "grad_norm": 1.9847602844238281, "learning_rate": 4.167081005109917e-05, "loss": 0.6545, "num_input_tokens_seen": 26570120, "step": 45775 }, { "epoch": 6.818588025022342, "grad_norm": 1.1213194131851196, "learning_rate": 4.16683884163303e-05, "loss": 0.7068, "num_input_tokens_seen": 26573288, "step": 45780 }, { "epoch": 6.8193327375633, "grad_norm": 1.0346248149871826, "learning_rate": 4.166596649996596e-05, "loss": 0.7281, "num_input_tokens_seen": 26576488, "step": 45785 }, { "epoch": 6.82007745010426, "grad_norm": 0.9303046464920044, "learning_rate": 4.166354430204705e-05, "loss": 0.644, "num_input_tokens_seen": 26579208, "step": 45790 }, { "epoch": 6.820822162645219, "grad_norm": 0.8059935569763184, "learning_rate": 4.166112182261449e-05, "loss": 0.6475, "num_input_tokens_seen": 26582248, "step": 45795 }, { "epoch": 6.821566875186178, "grad_norm": 1.1025325059890747, "learning_rate": 4.1658699061709215e-05, "loss": 0.6019, "num_input_tokens_seen": 26585096, "step": 45800 }, { "epoch": 6.822311587727137, "grad_norm": 1.9341925382614136, "learning_rate": 4.1656276019372156e-05, "loss": 0.7352, "num_input_tokens_seen": 26588168, "step": 45805 }, { "epoch": 6.823056300268097, "grad_norm": 1.7277953624725342, "learning_rate": 4.165385269564423e-05, "loss": 0.6625, "num_input_tokens_seen": 26590984, "step": 45810 }, { "epoch": 6.823801012809056, "grad_norm": 1.3295814990997314, "learning_rate": 4.16514290905664e-05, "loss": 0.6173, "num_input_tokens_seen": 26593896, "step": 45815 }, { "epoch": 6.824545725350015, "grad_norm": 1.4895663261413574, "learning_rate": 4.164900520417959e-05, "loss": 0.5463, "num_input_tokens_seen": 26596904, "step": 45820 }, { "epoch": 6.825290437890974, "grad_norm": 2.2188751697540283, "learning_rate": 4.164658103652477e-05, "loss": 0.5593, "num_input_tokens_seen": 26599592, "step": 45825 }, { "epoch": 6.826035150431934, "grad_norm": 1.2202612161636353, "learning_rate": 4.164415658764287e-05, "loss": 0.6104, "num_input_tokens_seen": 26602536, "step": 45830 }, { "epoch": 6.826779862972892, "grad_norm": 1.4360544681549072, "learning_rate": 4.164173185757487e-05, "loss": 0.7354, "num_input_tokens_seen": 26605352, "step": 45835 }, { "epoch": 6.827524575513852, "grad_norm": 0.9735091328620911, "learning_rate": 4.163930684636173e-05, "loss": 0.5772, "num_input_tokens_seen": 26607912, "step": 45840 }, { "epoch": 6.828269288054811, "grad_norm": 1.3174502849578857, "learning_rate": 4.16368815540444e-05, "loss": 0.5105, "num_input_tokens_seen": 26610568, "step": 45845 }, { "epoch": 6.82901400059577, "grad_norm": 1.2727442979812622, "learning_rate": 4.1634455980663866e-05, "loss": 0.6214, "num_input_tokens_seen": 26613672, "step": 45850 }, { "epoch": 6.829758713136729, "grad_norm": 1.527793049812317, "learning_rate": 4.163203012626111e-05, "loss": 0.6084, "num_input_tokens_seen": 26616680, "step": 45855 }, { "epoch": 6.830503425677689, "grad_norm": 0.9146248698234558, "learning_rate": 4.16296039908771e-05, "loss": 0.617, "num_input_tokens_seen": 26619816, "step": 45860 }, { "epoch": 6.831248138218648, "grad_norm": 0.9599303007125854, "learning_rate": 4.162717757455284e-05, "loss": 0.6096, "num_input_tokens_seen": 26622760, "step": 45865 }, { "epoch": 6.831992850759606, "grad_norm": 1.0714308023452759, "learning_rate": 4.162475087732931e-05, "loss": 0.527, "num_input_tokens_seen": 26625512, "step": 45870 }, { "epoch": 6.832737563300566, "grad_norm": 0.8575351238250732, "learning_rate": 4.162232389924751e-05, "loss": 0.6907, "num_input_tokens_seen": 26630056, "step": 45875 }, { "epoch": 6.833482275841525, "grad_norm": 1.6919203996658325, "learning_rate": 4.1619896640348445e-05, "loss": 0.6293, "num_input_tokens_seen": 26632808, "step": 45880 }, { "epoch": 6.834226988382484, "grad_norm": 1.0337698459625244, "learning_rate": 4.1617469100673126e-05, "loss": 0.6446, "num_input_tokens_seen": 26635560, "step": 45885 }, { "epoch": 6.834971700923443, "grad_norm": 1.051633596420288, "learning_rate": 4.161504128026255e-05, "loss": 0.6649, "num_input_tokens_seen": 26638472, "step": 45890 }, { "epoch": 6.835716413464403, "grad_norm": 1.747916579246521, "learning_rate": 4.1612613179157725e-05, "loss": 0.6458, "num_input_tokens_seen": 26641448, "step": 45895 }, { "epoch": 6.836461126005362, "grad_norm": 0.6894704699516296, "learning_rate": 4.1610184797399696e-05, "loss": 0.5823, "num_input_tokens_seen": 26644264, "step": 45900 }, { "epoch": 6.837205838546321, "grad_norm": 1.4094525575637817, "learning_rate": 4.160775613502948e-05, "loss": 0.5381, "num_input_tokens_seen": 26647016, "step": 45905 }, { "epoch": 6.83795055108728, "grad_norm": 1.503318190574646, "learning_rate": 4.160532719208809e-05, "loss": 0.8233, "num_input_tokens_seen": 26649832, "step": 45910 }, { "epoch": 6.83869526362824, "grad_norm": 1.6698652505874634, "learning_rate": 4.160289796861659e-05, "loss": 0.5583, "num_input_tokens_seen": 26652776, "step": 45915 }, { "epoch": 6.839439976169198, "grad_norm": 1.124740719795227, "learning_rate": 4.1600468464656e-05, "loss": 0.4355, "num_input_tokens_seen": 26655592, "step": 45920 }, { "epoch": 6.840184688710158, "grad_norm": 2.605639934539795, "learning_rate": 4.1598038680247363e-05, "loss": 0.8255, "num_input_tokens_seen": 26658824, "step": 45925 }, { "epoch": 6.840929401251117, "grad_norm": 0.8603819608688354, "learning_rate": 4.159560861543174e-05, "loss": 0.7656, "num_input_tokens_seen": 26661480, "step": 45930 }, { "epoch": 6.8416741137920765, "grad_norm": 0.7877789735794067, "learning_rate": 4.159317827025016e-05, "loss": 0.5841, "num_input_tokens_seen": 26664328, "step": 45935 }, { "epoch": 6.842418826333035, "grad_norm": 0.9834244251251221, "learning_rate": 4.159074764474371e-05, "loss": 0.7418, "num_input_tokens_seen": 26667592, "step": 45940 }, { "epoch": 6.843163538873995, "grad_norm": 1.1564689874649048, "learning_rate": 4.1588316738953434e-05, "loss": 0.8294, "num_input_tokens_seen": 26670504, "step": 45945 }, { "epoch": 6.843908251414954, "grad_norm": 0.999808132648468, "learning_rate": 4.158588555292041e-05, "loss": 0.5487, "num_input_tokens_seen": 26673448, "step": 45950 }, { "epoch": 6.844652963955913, "grad_norm": 1.437626838684082, "learning_rate": 4.158345408668571e-05, "loss": 0.5395, "num_input_tokens_seen": 26676136, "step": 45955 }, { "epoch": 6.845397676496872, "grad_norm": 1.152953028678894, "learning_rate": 4.15810223402904e-05, "loss": 0.5358, "num_input_tokens_seen": 26679080, "step": 45960 }, { "epoch": 6.846142389037832, "grad_norm": 1.6596553325653076, "learning_rate": 4.157859031377558e-05, "loss": 0.5726, "num_input_tokens_seen": 26681864, "step": 45965 }, { "epoch": 6.84688710157879, "grad_norm": 1.6664503812789917, "learning_rate": 4.157615800718232e-05, "loss": 0.7632, "num_input_tokens_seen": 26684744, "step": 45970 }, { "epoch": 6.84763181411975, "grad_norm": 1.1997096538543701, "learning_rate": 4.1573725420551716e-05, "loss": 0.5035, "num_input_tokens_seen": 26687688, "step": 45975 }, { "epoch": 6.848376526660709, "grad_norm": 1.028708577156067, "learning_rate": 4.157129255392487e-05, "loss": 0.6225, "num_input_tokens_seen": 26690504, "step": 45980 }, { "epoch": 6.8491212392016685, "grad_norm": 0.9139019250869751, "learning_rate": 4.1568859407342876e-05, "loss": 0.6655, "num_input_tokens_seen": 26693224, "step": 45985 }, { "epoch": 6.849865951742627, "grad_norm": 1.5336326360702515, "learning_rate": 4.1566425980846844e-05, "loss": 0.6952, "num_input_tokens_seen": 26695976, "step": 45990 }, { "epoch": 6.850610664283587, "grad_norm": 1.4698731899261475, "learning_rate": 4.156399227447788e-05, "loss": 0.6279, "num_input_tokens_seen": 26698760, "step": 45995 }, { "epoch": 6.851355376824546, "grad_norm": 0.7287558317184448, "learning_rate": 4.15615582882771e-05, "loss": 0.5615, "num_input_tokens_seen": 26701448, "step": 46000 }, { "epoch": 6.852100089365505, "grad_norm": 0.8387532830238342, "learning_rate": 4.155912402228563e-05, "loss": 0.584, "num_input_tokens_seen": 26704520, "step": 46005 }, { "epoch": 6.852844801906464, "grad_norm": 1.3321223258972168, "learning_rate": 4.155668947654458e-05, "loss": 0.818, "num_input_tokens_seen": 26707176, "step": 46010 }, { "epoch": 6.853589514447423, "grad_norm": 1.210299015045166, "learning_rate": 4.15542546510951e-05, "loss": 0.5498, "num_input_tokens_seen": 26710088, "step": 46015 }, { "epoch": 6.8543342269883825, "grad_norm": 2.7017571926116943, "learning_rate": 4.155181954597832e-05, "loss": 0.6155, "num_input_tokens_seen": 26713000, "step": 46020 }, { "epoch": 6.855078939529342, "grad_norm": 1.5919269323349, "learning_rate": 4.154938416123535e-05, "loss": 0.6217, "num_input_tokens_seen": 26716232, "step": 46025 }, { "epoch": 6.855823652070301, "grad_norm": 1.865760087966919, "learning_rate": 4.154694849690737e-05, "loss": 0.6682, "num_input_tokens_seen": 26719112, "step": 46030 }, { "epoch": 6.85656836461126, "grad_norm": 1.6369946002960205, "learning_rate": 4.154451255303551e-05, "loss": 0.6005, "num_input_tokens_seen": 26721864, "step": 46035 }, { "epoch": 6.857313077152219, "grad_norm": 1.2800508737564087, "learning_rate": 4.154207632966092e-05, "loss": 0.723, "num_input_tokens_seen": 26725256, "step": 46040 }, { "epoch": 6.858057789693179, "grad_norm": 1.3611578941345215, "learning_rate": 4.1539639826824765e-05, "loss": 0.6084, "num_input_tokens_seen": 26727944, "step": 46045 }, { "epoch": 6.858802502234138, "grad_norm": 1.9259252548217773, "learning_rate": 4.1537203044568205e-05, "loss": 0.5567, "num_input_tokens_seen": 26730600, "step": 46050 }, { "epoch": 6.859547214775096, "grad_norm": 1.0085331201553345, "learning_rate": 4.153476598293241e-05, "loss": 0.6164, "num_input_tokens_seen": 26733576, "step": 46055 }, { "epoch": 6.860291927316056, "grad_norm": 0.9308328032493591, "learning_rate": 4.153232864195855e-05, "loss": 0.6936, "num_input_tokens_seen": 26736424, "step": 46060 }, { "epoch": 6.861036639857015, "grad_norm": 1.5350171327590942, "learning_rate": 4.1529891021687796e-05, "loss": 0.5454, "num_input_tokens_seen": 26739208, "step": 46065 }, { "epoch": 6.8617813523979745, "grad_norm": 2.3954739570617676, "learning_rate": 4.152745312216134e-05, "loss": 0.6469, "num_input_tokens_seen": 26742056, "step": 46070 }, { "epoch": 6.862526064938933, "grad_norm": 0.6744016408920288, "learning_rate": 4.152501494342035e-05, "loss": 0.7022, "num_input_tokens_seen": 26744840, "step": 46075 }, { "epoch": 6.863270777479893, "grad_norm": 1.129693627357483, "learning_rate": 4.152257648550604e-05, "loss": 0.8072, "num_input_tokens_seen": 26747688, "step": 46080 }, { "epoch": 6.864015490020852, "grad_norm": 1.185366153717041, "learning_rate": 4.1520137748459587e-05, "loss": 0.695, "num_input_tokens_seen": 26750408, "step": 46085 }, { "epoch": 6.864760202561811, "grad_norm": 0.7849366068840027, "learning_rate": 4.1517698732322194e-05, "loss": 0.4539, "num_input_tokens_seen": 26753096, "step": 46090 }, { "epoch": 6.86550491510277, "grad_norm": 1.082317590713501, "learning_rate": 4.1515259437135076e-05, "loss": 0.6803, "num_input_tokens_seen": 26755848, "step": 46095 }, { "epoch": 6.86624962764373, "grad_norm": 0.9318513870239258, "learning_rate": 4.1512819862939425e-05, "loss": 0.6627, "num_input_tokens_seen": 26758632, "step": 46100 }, { "epoch": 6.8669943401846885, "grad_norm": 1.1222352981567383, "learning_rate": 4.1510380009776475e-05, "loss": 0.5339, "num_input_tokens_seen": 26761096, "step": 46105 }, { "epoch": 6.867739052725648, "grad_norm": 1.0781341791152954, "learning_rate": 4.150793987768743e-05, "loss": 0.8317, "num_input_tokens_seen": 26764008, "step": 46110 }, { "epoch": 6.868483765266607, "grad_norm": 2.298163890838623, "learning_rate": 4.1505499466713516e-05, "loss": 0.7635, "num_input_tokens_seen": 26766824, "step": 46115 }, { "epoch": 6.8692284778075665, "grad_norm": 0.9163206815719604, "learning_rate": 4.1503058776895974e-05, "loss": 0.7512, "num_input_tokens_seen": 26769544, "step": 46120 }, { "epoch": 6.869973190348525, "grad_norm": 1.1214573383331299, "learning_rate": 4.150061780827602e-05, "loss": 0.5903, "num_input_tokens_seen": 26772200, "step": 46125 }, { "epoch": 6.870717902889485, "grad_norm": 1.5769331455230713, "learning_rate": 4.14981765608949e-05, "loss": 0.7238, "num_input_tokens_seen": 26775144, "step": 46130 }, { "epoch": 6.871462615430444, "grad_norm": 2.151914119720459, "learning_rate": 4.1495735034793856e-05, "loss": 0.6305, "num_input_tokens_seen": 26778024, "step": 46135 }, { "epoch": 6.872207327971403, "grad_norm": 1.5385353565216064, "learning_rate": 4.149329323001413e-05, "loss": 0.6729, "num_input_tokens_seen": 26781000, "step": 46140 }, { "epoch": 6.872952040512362, "grad_norm": 2.007781505584717, "learning_rate": 4.149085114659699e-05, "loss": 0.5038, "num_input_tokens_seen": 26783944, "step": 46145 }, { "epoch": 6.873696753053322, "grad_norm": 1.5335981845855713, "learning_rate": 4.1488408784583664e-05, "loss": 0.6409, "num_input_tokens_seen": 26786728, "step": 46150 }, { "epoch": 6.8744414655942805, "grad_norm": 0.9487615823745728, "learning_rate": 4.148596614401544e-05, "loss": 0.6339, "num_input_tokens_seen": 26789896, "step": 46155 }, { "epoch": 6.87518617813524, "grad_norm": 0.6410226225852966, "learning_rate": 4.148352322493357e-05, "loss": 0.6536, "num_input_tokens_seen": 26792648, "step": 46160 }, { "epoch": 6.875930890676199, "grad_norm": 1.243889570236206, "learning_rate": 4.148108002737933e-05, "loss": 0.6748, "num_input_tokens_seen": 26795272, "step": 46165 }, { "epoch": 6.8766756032171585, "grad_norm": 0.8550469875335693, "learning_rate": 4.147863655139399e-05, "loss": 0.7288, "num_input_tokens_seen": 26798376, "step": 46170 }, { "epoch": 6.877420315758117, "grad_norm": 0.6071836948394775, "learning_rate": 4.1476192797018836e-05, "loss": 0.3616, "num_input_tokens_seen": 26801416, "step": 46175 }, { "epoch": 6.878165028299077, "grad_norm": 1.0937761068344116, "learning_rate": 4.147374876429515e-05, "loss": 0.8416, "num_input_tokens_seen": 26804104, "step": 46180 }, { "epoch": 6.878909740840036, "grad_norm": 0.9576815962791443, "learning_rate": 4.1471304453264225e-05, "loss": 0.7685, "num_input_tokens_seen": 26806984, "step": 46185 }, { "epoch": 6.879654453380995, "grad_norm": 0.9057193398475647, "learning_rate": 4.1468859863967345e-05, "loss": 0.6715, "num_input_tokens_seen": 26809768, "step": 46190 }, { "epoch": 6.880399165921954, "grad_norm": 1.6247386932373047, "learning_rate": 4.1466414996445824e-05, "loss": 0.5573, "num_input_tokens_seen": 26812744, "step": 46195 }, { "epoch": 6.881143878462913, "grad_norm": 1.194976806640625, "learning_rate": 4.146396985074095e-05, "loss": 0.5938, "num_input_tokens_seen": 26815496, "step": 46200 }, { "epoch": 6.8818885910038725, "grad_norm": 0.9447504878044128, "learning_rate": 4.146152442689405e-05, "loss": 0.5334, "num_input_tokens_seen": 26818536, "step": 46205 }, { "epoch": 6.882633303544832, "grad_norm": 1.5877197980880737, "learning_rate": 4.1459078724946406e-05, "loss": 0.9216, "num_input_tokens_seen": 26821352, "step": 46210 }, { "epoch": 6.883378016085791, "grad_norm": 1.0770959854125977, "learning_rate": 4.1456632744939375e-05, "loss": 0.6953, "num_input_tokens_seen": 26824072, "step": 46215 }, { "epoch": 6.88412272862675, "grad_norm": 1.173936128616333, "learning_rate": 4.145418648691425e-05, "loss": 0.7165, "num_input_tokens_seen": 26826984, "step": 46220 }, { "epoch": 6.884867441167709, "grad_norm": 1.2418063879013062, "learning_rate": 4.1451739950912365e-05, "loss": 0.6415, "num_input_tokens_seen": 26829864, "step": 46225 }, { "epoch": 6.885612153708668, "grad_norm": 2.3460233211517334, "learning_rate": 4.144929313697506e-05, "loss": 0.724, "num_input_tokens_seen": 26832616, "step": 46230 }, { "epoch": 6.886356866249628, "grad_norm": 2.316965103149414, "learning_rate": 4.144684604514366e-05, "loss": 0.7707, "num_input_tokens_seen": 26835528, "step": 46235 }, { "epoch": 6.8871015787905865, "grad_norm": 1.0960326194763184, "learning_rate": 4.144439867545952e-05, "loss": 0.5878, "num_input_tokens_seen": 26838632, "step": 46240 }, { "epoch": 6.887846291331546, "grad_norm": 1.4792866706848145, "learning_rate": 4.144195102796398e-05, "loss": 0.789, "num_input_tokens_seen": 26841608, "step": 46245 }, { "epoch": 6.888591003872505, "grad_norm": 2.128582239151001, "learning_rate": 4.143950310269837e-05, "loss": 0.6672, "num_input_tokens_seen": 26844616, "step": 46250 }, { "epoch": 6.8893357164134645, "grad_norm": 0.9442983865737915, "learning_rate": 4.143705489970408e-05, "loss": 0.595, "num_input_tokens_seen": 26847560, "step": 46255 }, { "epoch": 6.890080428954423, "grad_norm": 1.2402006387710571, "learning_rate": 4.143460641902245e-05, "loss": 0.5416, "num_input_tokens_seen": 26850248, "step": 46260 }, { "epoch": 6.890825141495383, "grad_norm": 1.0900826454162598, "learning_rate": 4.143215766069484e-05, "loss": 0.6389, "num_input_tokens_seen": 26853288, "step": 46265 }, { "epoch": 6.891569854036342, "grad_norm": 0.7983907461166382, "learning_rate": 4.142970862476264e-05, "loss": 0.6878, "num_input_tokens_seen": 26856040, "step": 46270 }, { "epoch": 6.892314566577301, "grad_norm": 1.4363199472427368, "learning_rate": 4.14272593112672e-05, "loss": 0.605, "num_input_tokens_seen": 26858888, "step": 46275 }, { "epoch": 6.89305927911826, "grad_norm": 1.3522692918777466, "learning_rate": 4.142480972024991e-05, "loss": 0.6787, "num_input_tokens_seen": 26861896, "step": 46280 }, { "epoch": 6.89380399165922, "grad_norm": 1.689401388168335, "learning_rate": 4.142235985175217e-05, "loss": 0.7806, "num_input_tokens_seen": 26864520, "step": 46285 }, { "epoch": 6.8945487042001785, "grad_norm": 1.0540937185287476, "learning_rate": 4.141990970581534e-05, "loss": 0.5815, "num_input_tokens_seen": 26867496, "step": 46290 }, { "epoch": 6.895293416741138, "grad_norm": 1.1469651460647583, "learning_rate": 4.141745928248082e-05, "loss": 0.6887, "num_input_tokens_seen": 26870248, "step": 46295 }, { "epoch": 6.896038129282097, "grad_norm": 0.8972534537315369, "learning_rate": 4.141500858179002e-05, "loss": 0.5584, "num_input_tokens_seen": 26873032, "step": 46300 }, { "epoch": 6.896782841823057, "grad_norm": 0.969032883644104, "learning_rate": 4.141255760378432e-05, "loss": 0.7005, "num_input_tokens_seen": 26876200, "step": 46305 }, { "epoch": 6.897527554364015, "grad_norm": 1.7987239360809326, "learning_rate": 4.141010634850515e-05, "loss": 0.6867, "num_input_tokens_seen": 26878952, "step": 46310 }, { "epoch": 6.898272266904975, "grad_norm": 0.9363367557525635, "learning_rate": 4.140765481599391e-05, "loss": 0.7024, "num_input_tokens_seen": 26882024, "step": 46315 }, { "epoch": 6.899016979445934, "grad_norm": 1.0690579414367676, "learning_rate": 4.1405203006292014e-05, "loss": 0.5605, "num_input_tokens_seen": 26884936, "step": 46320 }, { "epoch": 6.899761691986893, "grad_norm": 1.5646698474884033, "learning_rate": 4.1402750919440894e-05, "loss": 0.6688, "num_input_tokens_seen": 26887944, "step": 46325 }, { "epoch": 6.900506404527852, "grad_norm": 0.9556550979614258, "learning_rate": 4.140029855548196e-05, "loss": 0.6211, "num_input_tokens_seen": 26891304, "step": 46330 }, { "epoch": 6.901251117068812, "grad_norm": 1.5744704008102417, "learning_rate": 4.1397845914456656e-05, "loss": 0.6976, "num_input_tokens_seen": 26894152, "step": 46335 }, { "epoch": 6.9019958296097705, "grad_norm": 0.6954466700553894, "learning_rate": 4.139539299640641e-05, "loss": 0.5156, "num_input_tokens_seen": 26896936, "step": 46340 }, { "epoch": 6.90274054215073, "grad_norm": 1.593247890472412, "learning_rate": 4.139293980137267e-05, "loss": 0.5483, "num_input_tokens_seen": 26899752, "step": 46345 }, { "epoch": 6.903485254691689, "grad_norm": 0.7954477667808533, "learning_rate": 4.139048632939686e-05, "loss": 0.5085, "num_input_tokens_seen": 26902696, "step": 46350 }, { "epoch": 6.904229967232649, "grad_norm": 1.8284746408462524, "learning_rate": 4.138803258052045e-05, "loss": 0.6164, "num_input_tokens_seen": 26906856, "step": 46355 }, { "epoch": 6.904974679773607, "grad_norm": 0.8824518322944641, "learning_rate": 4.138557855478489e-05, "loss": 0.7105, "num_input_tokens_seen": 26909608, "step": 46360 }, { "epoch": 6.905719392314566, "grad_norm": 0.9503819942474365, "learning_rate": 4.1383124252231625e-05, "loss": 0.653, "num_input_tokens_seen": 26912488, "step": 46365 }, { "epoch": 6.906464104855526, "grad_norm": 1.1328749656677246, "learning_rate": 4.138066967290213e-05, "loss": 0.6273, "num_input_tokens_seen": 26915464, "step": 46370 }, { "epoch": 6.907208817396485, "grad_norm": 1.4300154447555542, "learning_rate": 4.137821481683787e-05, "loss": 0.6888, "num_input_tokens_seen": 26918824, "step": 46375 }, { "epoch": 6.907953529937444, "grad_norm": 0.9777395129203796, "learning_rate": 4.1375759684080314e-05, "loss": 0.6863, "num_input_tokens_seen": 26921896, "step": 46380 }, { "epoch": 6.908698242478403, "grad_norm": 0.9083206653594971, "learning_rate": 4.137330427467094e-05, "loss": 0.4469, "num_input_tokens_seen": 26924520, "step": 46385 }, { "epoch": 6.909442955019363, "grad_norm": 0.9411382079124451, "learning_rate": 4.137084858865124e-05, "loss": 0.557, "num_input_tokens_seen": 26927016, "step": 46390 }, { "epoch": 6.910187667560322, "grad_norm": 1.4787847995758057, "learning_rate": 4.1368392626062685e-05, "loss": 0.5305, "num_input_tokens_seen": 26929736, "step": 46395 }, { "epoch": 6.910932380101281, "grad_norm": 1.6591049432754517, "learning_rate": 4.1365936386946776e-05, "loss": 0.5463, "num_input_tokens_seen": 26932712, "step": 46400 }, { "epoch": 6.91167709264224, "grad_norm": 1.0980408191680908, "learning_rate": 4.1363479871345e-05, "loss": 0.6504, "num_input_tokens_seen": 26935720, "step": 46405 }, { "epoch": 6.912421805183199, "grad_norm": 0.8819018602371216, "learning_rate": 4.1361023079298874e-05, "loss": 0.567, "num_input_tokens_seen": 26938728, "step": 46410 }, { "epoch": 6.913166517724158, "grad_norm": 1.2242554426193237, "learning_rate": 4.135856601084988e-05, "loss": 0.8306, "num_input_tokens_seen": 26941608, "step": 46415 }, { "epoch": 6.913911230265118, "grad_norm": 1.831399917602539, "learning_rate": 4.135610866603955e-05, "loss": 0.7689, "num_input_tokens_seen": 26944488, "step": 46420 }, { "epoch": 6.9146559428060765, "grad_norm": 1.7653474807739258, "learning_rate": 4.135365104490938e-05, "loss": 0.5461, "num_input_tokens_seen": 26947272, "step": 46425 }, { "epoch": 6.915400655347036, "grad_norm": 1.1314258575439453, "learning_rate": 4.1351193147500887e-05, "loss": 0.7372, "num_input_tokens_seen": 26950088, "step": 46430 }, { "epoch": 6.916145367887995, "grad_norm": 0.8657882213592529, "learning_rate": 4.134873497385562e-05, "loss": 0.5686, "num_input_tokens_seen": 26952904, "step": 46435 }, { "epoch": 6.916890080428955, "grad_norm": 1.026432991027832, "learning_rate": 4.1346276524015085e-05, "loss": 0.7879, "num_input_tokens_seen": 26956040, "step": 46440 }, { "epoch": 6.917634792969913, "grad_norm": 1.260034441947937, "learning_rate": 4.1343817798020824e-05, "loss": 0.7385, "num_input_tokens_seen": 26958760, "step": 46445 }, { "epoch": 6.918379505510873, "grad_norm": 1.3666951656341553, "learning_rate": 4.1341358795914375e-05, "loss": 0.553, "num_input_tokens_seen": 26962056, "step": 46450 }, { "epoch": 6.919124218051832, "grad_norm": 0.8948829770088196, "learning_rate": 4.133889951773727e-05, "loss": 0.5764, "num_input_tokens_seen": 26964936, "step": 46455 }, { "epoch": 6.919868930592791, "grad_norm": 0.8720143437385559, "learning_rate": 4.133643996353107e-05, "loss": 0.6575, "num_input_tokens_seen": 26967848, "step": 46460 }, { "epoch": 6.92061364313375, "grad_norm": 1.6866765022277832, "learning_rate": 4.1333980133337324e-05, "loss": 0.6427, "num_input_tokens_seen": 26970504, "step": 46465 }, { "epoch": 6.92135835567471, "grad_norm": 1.2922347784042358, "learning_rate": 4.133152002719758e-05, "loss": 0.7257, "num_input_tokens_seen": 26973384, "step": 46470 }, { "epoch": 6.922103068215669, "grad_norm": 1.5078552961349487, "learning_rate": 4.13290596451534e-05, "loss": 0.5409, "num_input_tokens_seen": 26976392, "step": 46475 }, { "epoch": 6.922847780756628, "grad_norm": 0.7739469408988953, "learning_rate": 4.1326598987246356e-05, "loss": 0.5685, "num_input_tokens_seen": 26978984, "step": 46480 }, { "epoch": 6.923592493297587, "grad_norm": 1.2554149627685547, "learning_rate": 4.132413805351802e-05, "loss": 0.4973, "num_input_tokens_seen": 26981768, "step": 46485 }, { "epoch": 6.924337205838547, "grad_norm": 1.4120895862579346, "learning_rate": 4.1321676844009957e-05, "loss": 0.7339, "num_input_tokens_seen": 26984744, "step": 46490 }, { "epoch": 6.925081918379505, "grad_norm": 1.0903502702713013, "learning_rate": 4.1319215358763756e-05, "loss": 0.7585, "num_input_tokens_seen": 26987944, "step": 46495 }, { "epoch": 6.925826630920465, "grad_norm": 0.9740375280380249, "learning_rate": 4.1316753597821e-05, "loss": 0.4813, "num_input_tokens_seen": 26990696, "step": 46500 }, { "epoch": 6.926571343461424, "grad_norm": 0.9941781163215637, "learning_rate": 4.1314291561223276e-05, "loss": 0.5101, "num_input_tokens_seen": 26993640, "step": 46505 }, { "epoch": 6.927316056002383, "grad_norm": 0.8041941523551941, "learning_rate": 4.131182924901217e-05, "loss": 0.6324, "num_input_tokens_seen": 26996392, "step": 46510 }, { "epoch": 6.928060768543342, "grad_norm": 1.5876981019973755, "learning_rate": 4.1309366661229286e-05, "loss": 0.6283, "num_input_tokens_seen": 26999272, "step": 46515 }, { "epoch": 6.928805481084302, "grad_norm": 2.312939405441284, "learning_rate": 4.130690379791623e-05, "loss": 0.7637, "num_input_tokens_seen": 27001960, "step": 46520 }, { "epoch": 6.929550193625261, "grad_norm": 1.1780591011047363, "learning_rate": 4.1304440659114615e-05, "loss": 0.7275, "num_input_tokens_seen": 27004872, "step": 46525 }, { "epoch": 6.930294906166219, "grad_norm": 0.8509484529495239, "learning_rate": 4.130197724486604e-05, "loss": 0.7835, "num_input_tokens_seen": 27007880, "step": 46530 }, { "epoch": 6.931039618707179, "grad_norm": 0.654567301273346, "learning_rate": 4.1299513555212135e-05, "loss": 0.5704, "num_input_tokens_seen": 27010664, "step": 46535 }, { "epoch": 6.931784331248139, "grad_norm": 2.1843743324279785, "learning_rate": 4.129704959019451e-05, "loss": 0.6465, "num_input_tokens_seen": 27013672, "step": 46540 }, { "epoch": 6.932529043789097, "grad_norm": 0.815792441368103, "learning_rate": 4.129458534985479e-05, "loss": 0.5486, "num_input_tokens_seen": 27016648, "step": 46545 }, { "epoch": 6.933273756330056, "grad_norm": 1.3251457214355469, "learning_rate": 4.1292120834234624e-05, "loss": 0.6089, "num_input_tokens_seen": 27019432, "step": 46550 }, { "epoch": 6.934018468871016, "grad_norm": 2.1514134407043457, "learning_rate": 4.128965604337563e-05, "loss": 0.575, "num_input_tokens_seen": 27022472, "step": 46555 }, { "epoch": 6.9347631814119755, "grad_norm": 1.1495282649993896, "learning_rate": 4.128719097731945e-05, "loss": 0.5276, "num_input_tokens_seen": 27025384, "step": 46560 }, { "epoch": 6.935507893952934, "grad_norm": 0.9617647528648376, "learning_rate": 4.1284725636107726e-05, "loss": 0.5594, "num_input_tokens_seen": 27028328, "step": 46565 }, { "epoch": 6.936252606493893, "grad_norm": 1.0859769582748413, "learning_rate": 4.128226001978213e-05, "loss": 0.7048, "num_input_tokens_seen": 27030984, "step": 46570 }, { "epoch": 6.936997319034853, "grad_norm": 1.181754469871521, "learning_rate": 4.127979412838428e-05, "loss": 0.4979, "num_input_tokens_seen": 27033960, "step": 46575 }, { "epoch": 6.937742031575811, "grad_norm": 1.4666484594345093, "learning_rate": 4.127732796195587e-05, "loss": 0.6152, "num_input_tokens_seen": 27036712, "step": 46580 }, { "epoch": 6.938486744116771, "grad_norm": 1.0975316762924194, "learning_rate": 4.127486152053854e-05, "loss": 0.6142, "num_input_tokens_seen": 27039368, "step": 46585 }, { "epoch": 6.93923145665773, "grad_norm": 1.4012731313705444, "learning_rate": 4.1272394804173966e-05, "loss": 0.692, "num_input_tokens_seen": 27042280, "step": 46590 }, { "epoch": 6.939976169198689, "grad_norm": 0.7441151142120361, "learning_rate": 4.126992781290382e-05, "loss": 0.5668, "num_input_tokens_seen": 27045320, "step": 46595 }, { "epoch": 6.940720881739648, "grad_norm": 2.302241802215576, "learning_rate": 4.1267460546769784e-05, "loss": 0.7016, "num_input_tokens_seen": 27047912, "step": 46600 }, { "epoch": 6.941465594280608, "grad_norm": 1.1374679803848267, "learning_rate": 4.126499300581353e-05, "loss": 0.6763, "num_input_tokens_seen": 27050888, "step": 46605 }, { "epoch": 6.942210306821567, "grad_norm": 1.1023856401443481, "learning_rate": 4.1262525190076763e-05, "loss": 0.8105, "num_input_tokens_seen": 27053992, "step": 46610 }, { "epoch": 6.942955019362526, "grad_norm": 1.3342994451522827, "learning_rate": 4.1260057099601145e-05, "loss": 0.5681, "num_input_tokens_seen": 27057096, "step": 46615 }, { "epoch": 6.943699731903485, "grad_norm": 0.9264317154884338, "learning_rate": 4.12575887344284e-05, "loss": 0.4397, "num_input_tokens_seen": 27060072, "step": 46620 }, { "epoch": 6.944444444444445, "grad_norm": 1.1462655067443848, "learning_rate": 4.125512009460021e-05, "loss": 0.7513, "num_input_tokens_seen": 27062856, "step": 46625 }, { "epoch": 6.945189156985403, "grad_norm": 0.8109739422798157, "learning_rate": 4.125265118015829e-05, "loss": 0.6104, "num_input_tokens_seen": 27065544, "step": 46630 }, { "epoch": 6.945933869526363, "grad_norm": 1.2058652639389038, "learning_rate": 4.125018199114434e-05, "loss": 0.5366, "num_input_tokens_seen": 27068360, "step": 46635 }, { "epoch": 6.946678582067322, "grad_norm": 1.9461857080459595, "learning_rate": 4.124771252760009e-05, "loss": 0.684, "num_input_tokens_seen": 27071080, "step": 46640 }, { "epoch": 6.9474232946082815, "grad_norm": 0.9931489825248718, "learning_rate": 4.124524278956725e-05, "loss": 0.5401, "num_input_tokens_seen": 27073928, "step": 46645 }, { "epoch": 6.94816800714924, "grad_norm": 1.0294002294540405, "learning_rate": 4.1242772777087536e-05, "loss": 0.6993, "num_input_tokens_seen": 27077000, "step": 46650 }, { "epoch": 6.9489127196902, "grad_norm": 0.7932460904121399, "learning_rate": 4.124030249020269e-05, "loss": 0.5546, "num_input_tokens_seen": 27079752, "step": 46655 }, { "epoch": 6.949657432231159, "grad_norm": 1.8422094583511353, "learning_rate": 4.123783192895444e-05, "loss": 0.9443, "num_input_tokens_seen": 27082536, "step": 46660 }, { "epoch": 6.950402144772118, "grad_norm": 1.050968885421753, "learning_rate": 4.1235361093384523e-05, "loss": 0.5425, "num_input_tokens_seen": 27085160, "step": 46665 }, { "epoch": 6.951146857313077, "grad_norm": 1.172855257987976, "learning_rate": 4.123288998353468e-05, "loss": 0.611, "num_input_tokens_seen": 27088392, "step": 46670 }, { "epoch": 6.951891569854037, "grad_norm": 0.8901498317718506, "learning_rate": 4.123041859944666e-05, "loss": 0.6345, "num_input_tokens_seen": 27091240, "step": 46675 }, { "epoch": 6.952636282394995, "grad_norm": 1.9532296657562256, "learning_rate": 4.122794694116221e-05, "loss": 0.8498, "num_input_tokens_seen": 27094344, "step": 46680 }, { "epoch": 6.953380994935955, "grad_norm": 0.8019576072692871, "learning_rate": 4.122547500872309e-05, "loss": 0.5109, "num_input_tokens_seen": 27097192, "step": 46685 }, { "epoch": 6.954125707476914, "grad_norm": 1.4950653314590454, "learning_rate": 4.122300280217107e-05, "loss": 0.5713, "num_input_tokens_seen": 27099976, "step": 46690 }, { "epoch": 6.9548704200178735, "grad_norm": 1.4399713277816772, "learning_rate": 4.1220530321547894e-05, "loss": 0.7827, "num_input_tokens_seen": 27102792, "step": 46695 }, { "epoch": 6.955615132558832, "grad_norm": 0.972321629524231, "learning_rate": 4.121805756689535e-05, "loss": 0.6178, "num_input_tokens_seen": 27105640, "step": 46700 }, { "epoch": 6.956359845099792, "grad_norm": 1.6837280988693237, "learning_rate": 4.1215584538255206e-05, "loss": 0.6755, "num_input_tokens_seen": 27108584, "step": 46705 }, { "epoch": 6.957104557640751, "grad_norm": 2.22184157371521, "learning_rate": 4.121311123566924e-05, "loss": 0.6476, "num_input_tokens_seen": 27111560, "step": 46710 }, { "epoch": 6.957849270181709, "grad_norm": 0.9448822736740112, "learning_rate": 4.121063765917924e-05, "loss": 0.6476, "num_input_tokens_seen": 27114472, "step": 46715 }, { "epoch": 6.958593982722669, "grad_norm": 1.741830587387085, "learning_rate": 4.120816380882699e-05, "loss": 0.6089, "num_input_tokens_seen": 27117000, "step": 46720 }, { "epoch": 6.959338695263629, "grad_norm": 0.7577212452888489, "learning_rate": 4.120568968465429e-05, "loss": 0.5546, "num_input_tokens_seen": 27119784, "step": 46725 }, { "epoch": 6.9600834078045875, "grad_norm": 0.9625220894813538, "learning_rate": 4.120321528670293e-05, "loss": 0.6891, "num_input_tokens_seen": 27122760, "step": 46730 }, { "epoch": 6.960828120345546, "grad_norm": 2.1597695350646973, "learning_rate": 4.120074061501472e-05, "loss": 0.645, "num_input_tokens_seen": 27125608, "step": 46735 }, { "epoch": 6.961572832886506, "grad_norm": 1.7664148807525635, "learning_rate": 4.1198265669631464e-05, "loss": 0.6891, "num_input_tokens_seen": 27128328, "step": 46740 }, { "epoch": 6.962317545427465, "grad_norm": 0.8708727359771729, "learning_rate": 4.119579045059496e-05, "loss": 0.6737, "num_input_tokens_seen": 27131496, "step": 46745 }, { "epoch": 6.963062257968424, "grad_norm": 2.579735517501831, "learning_rate": 4.119331495794705e-05, "loss": 0.7111, "num_input_tokens_seen": 27134504, "step": 46750 }, { "epoch": 6.963806970509383, "grad_norm": 0.904903769493103, "learning_rate": 4.119083919172954e-05, "loss": 0.6007, "num_input_tokens_seen": 27137480, "step": 46755 }, { "epoch": 6.964551683050343, "grad_norm": 1.0171228647232056, "learning_rate": 4.118836315198425e-05, "loss": 0.6292, "num_input_tokens_seen": 27140680, "step": 46760 }, { "epoch": 6.965296395591301, "grad_norm": 1.4937750101089478, "learning_rate": 4.118588683875303e-05, "loss": 0.7304, "num_input_tokens_seen": 27143560, "step": 46765 }, { "epoch": 6.966041108132261, "grad_norm": 0.8334134817123413, "learning_rate": 4.11834102520777e-05, "loss": 0.4824, "num_input_tokens_seen": 27146344, "step": 46770 }, { "epoch": 6.96678582067322, "grad_norm": 1.5772686004638672, "learning_rate": 4.118093339200009e-05, "loss": 0.7085, "num_input_tokens_seen": 27149448, "step": 46775 }, { "epoch": 6.9675305332141795, "grad_norm": 1.3643087148666382, "learning_rate": 4.1178456258562064e-05, "loss": 0.6354, "num_input_tokens_seen": 27152200, "step": 46780 }, { "epoch": 6.968275245755138, "grad_norm": 1.2107369899749756, "learning_rate": 4.117597885180546e-05, "loss": 0.7719, "num_input_tokens_seen": 27155048, "step": 46785 }, { "epoch": 6.969019958296098, "grad_norm": 1.1317640542984009, "learning_rate": 4.117350117177214e-05, "loss": 0.6652, "num_input_tokens_seen": 27157800, "step": 46790 }, { "epoch": 6.969764670837057, "grad_norm": 1.210328459739685, "learning_rate": 4.1171023218503945e-05, "loss": 0.5752, "num_input_tokens_seen": 27160840, "step": 46795 }, { "epoch": 6.970509383378016, "grad_norm": 1.9975812435150146, "learning_rate": 4.1168544992042756e-05, "loss": 0.709, "num_input_tokens_seen": 27163560, "step": 46800 }, { "epoch": 6.971254095918975, "grad_norm": 0.9104189276695251, "learning_rate": 4.116606649243043e-05, "loss": 0.582, "num_input_tokens_seen": 27166312, "step": 46805 }, { "epoch": 6.971998808459935, "grad_norm": 0.8274286985397339, "learning_rate": 4.116358771970885e-05, "loss": 0.6545, "num_input_tokens_seen": 27169448, "step": 46810 }, { "epoch": 6.9727435210008935, "grad_norm": 0.8991963267326355, "learning_rate": 4.1161108673919874e-05, "loss": 0.4621, "num_input_tokens_seen": 27172168, "step": 46815 }, { "epoch": 6.973488233541853, "grad_norm": 1.4702202081680298, "learning_rate": 4.115862935510539e-05, "loss": 0.7212, "num_input_tokens_seen": 27174984, "step": 46820 }, { "epoch": 6.974232946082812, "grad_norm": 1.2387298345565796, "learning_rate": 4.1156149763307296e-05, "loss": 0.5066, "num_input_tokens_seen": 27177896, "step": 46825 }, { "epoch": 6.9749776586237715, "grad_norm": 0.811428427696228, "learning_rate": 4.115366989856746e-05, "loss": 0.7401, "num_input_tokens_seen": 27180808, "step": 46830 }, { "epoch": 6.97572237116473, "grad_norm": 0.862747848033905, "learning_rate": 4.11511897609278e-05, "loss": 0.4592, "num_input_tokens_seen": 27183496, "step": 46835 }, { "epoch": 6.97646708370569, "grad_norm": 1.5051746368408203, "learning_rate": 4.1148709350430194e-05, "loss": 0.6207, "num_input_tokens_seen": 27186440, "step": 46840 }, { "epoch": 6.977211796246649, "grad_norm": 1.1071618795394897, "learning_rate": 4.114622866711657e-05, "loss": 0.6176, "num_input_tokens_seen": 27189352, "step": 46845 }, { "epoch": 6.977956508787608, "grad_norm": 1.5469430685043335, "learning_rate": 4.114374771102881e-05, "loss": 0.5675, "num_input_tokens_seen": 27192072, "step": 46850 }, { "epoch": 6.978701221328567, "grad_norm": 1.1950221061706543, "learning_rate": 4.114126648220884e-05, "loss": 0.6808, "num_input_tokens_seen": 27194984, "step": 46855 }, { "epoch": 6.979445933869527, "grad_norm": 1.265816330909729, "learning_rate": 4.1138784980698585e-05, "loss": 0.7885, "num_input_tokens_seen": 27197832, "step": 46860 }, { "epoch": 6.9801906464104855, "grad_norm": 1.7210992574691772, "learning_rate": 4.113630320653996e-05, "loss": 0.528, "num_input_tokens_seen": 27200712, "step": 46865 }, { "epoch": 6.980935358951445, "grad_norm": 0.7838387489318848, "learning_rate": 4.113382115977489e-05, "loss": 0.5943, "num_input_tokens_seen": 27203432, "step": 46870 }, { "epoch": 6.981680071492404, "grad_norm": 0.81192547082901, "learning_rate": 4.113133884044531e-05, "loss": 0.4722, "num_input_tokens_seen": 27206152, "step": 46875 }, { "epoch": 6.982424784033363, "grad_norm": 0.8280476331710815, "learning_rate": 4.112885624859316e-05, "loss": 0.5372, "num_input_tokens_seen": 27209032, "step": 46880 }, { "epoch": 6.983169496574322, "grad_norm": 1.0993728637695312, "learning_rate": 4.1126373384260365e-05, "loss": 0.8094, "num_input_tokens_seen": 27211752, "step": 46885 }, { "epoch": 6.983914209115282, "grad_norm": 1.3410496711730957, "learning_rate": 4.112389024748889e-05, "loss": 0.5876, "num_input_tokens_seen": 27214600, "step": 46890 }, { "epoch": 6.984658921656241, "grad_norm": 1.2341479063034058, "learning_rate": 4.112140683832068e-05, "loss": 0.613, "num_input_tokens_seen": 27217864, "step": 46895 }, { "epoch": 6.9854036341971995, "grad_norm": 1.1225907802581787, "learning_rate": 4.1118923156797684e-05, "loss": 0.7174, "num_input_tokens_seen": 27220712, "step": 46900 }, { "epoch": 6.986148346738159, "grad_norm": 1.3748079538345337, "learning_rate": 4.111643920296185e-05, "loss": 0.558, "num_input_tokens_seen": 27223976, "step": 46905 }, { "epoch": 6.986893059279119, "grad_norm": 1.7478406429290771, "learning_rate": 4.1113954976855174e-05, "loss": 0.6635, "num_input_tokens_seen": 27226728, "step": 46910 }, { "epoch": 6.9876377718200775, "grad_norm": 0.9173913598060608, "learning_rate": 4.11114704785196e-05, "loss": 0.6538, "num_input_tokens_seen": 27229544, "step": 46915 }, { "epoch": 6.988382484361036, "grad_norm": 1.9550833702087402, "learning_rate": 4.1108985707997105e-05, "loss": 0.7149, "num_input_tokens_seen": 27232392, "step": 46920 }, { "epoch": 6.989127196901996, "grad_norm": 1.6386961936950684, "learning_rate": 4.110650066532967e-05, "loss": 0.5644, "num_input_tokens_seen": 27235272, "step": 46925 }, { "epoch": 6.989871909442955, "grad_norm": 1.166464924812317, "learning_rate": 4.1104015350559275e-05, "loss": 0.6687, "num_input_tokens_seen": 27238312, "step": 46930 }, { "epoch": 6.990616621983914, "grad_norm": 1.2276891469955444, "learning_rate": 4.110152976372791e-05, "loss": 0.7406, "num_input_tokens_seen": 27241096, "step": 46935 }, { "epoch": 6.991361334524873, "grad_norm": 1.007987141609192, "learning_rate": 4.1099043904877564e-05, "loss": 0.6742, "num_input_tokens_seen": 27244040, "step": 46940 }, { "epoch": 6.992106047065833, "grad_norm": 0.5453574061393738, "learning_rate": 4.1096557774050235e-05, "loss": 0.5152, "num_input_tokens_seen": 27246760, "step": 46945 }, { "epoch": 6.9928507596067915, "grad_norm": 1.2259268760681152, "learning_rate": 4.1094071371287925e-05, "loss": 0.7236, "num_input_tokens_seen": 27249864, "step": 46950 }, { "epoch": 6.993595472147751, "grad_norm": 0.6498303413391113, "learning_rate": 4.109158469663263e-05, "loss": 0.5253, "num_input_tokens_seen": 27253000, "step": 46955 }, { "epoch": 6.99434018468871, "grad_norm": 1.6288721561431885, "learning_rate": 4.108909775012637e-05, "loss": 0.7455, "num_input_tokens_seen": 27255848, "step": 46960 }, { "epoch": 6.9950848972296695, "grad_norm": 1.4238609075546265, "learning_rate": 4.1086610531811155e-05, "loss": 0.7037, "num_input_tokens_seen": 27258728, "step": 46965 }, { "epoch": 6.995829609770628, "grad_norm": 2.7358174324035645, "learning_rate": 4.108412304172901e-05, "loss": 0.7691, "num_input_tokens_seen": 27261832, "step": 46970 }, { "epoch": 6.996574322311588, "grad_norm": 1.400460124015808, "learning_rate": 4.1081635279921945e-05, "loss": 0.6413, "num_input_tokens_seen": 27264776, "step": 46975 }, { "epoch": 6.997319034852547, "grad_norm": 0.9487239122390747, "learning_rate": 4.107914724643199e-05, "loss": 0.5615, "num_input_tokens_seen": 27268008, "step": 46980 }, { "epoch": 6.998063747393506, "grad_norm": 0.9040194153785706, "learning_rate": 4.107665894130121e-05, "loss": 0.5233, "num_input_tokens_seen": 27271176, "step": 46985 }, { "epoch": 6.998808459934465, "grad_norm": 1.1857184171676636, "learning_rate": 4.107417036457159e-05, "loss": 0.7956, "num_input_tokens_seen": 27274408, "step": 46990 }, { "epoch": 6.999553172475425, "grad_norm": 0.943742573261261, "learning_rate": 4.107168151628521e-05, "loss": 0.6886, "num_input_tokens_seen": 27277416, "step": 46995 }, { "epoch": 7.0, "eval_loss": 0.6550788879394531, "eval_runtime": 47.1926, "eval_samples_per_second": 63.23, "eval_steps_per_second": 15.808, "num_input_tokens_seen": 27278720, "step": 46998 }, { "epoch": 7.0002978850163835, "grad_norm": 2.415703773498535, "learning_rate": 4.10691923964841e-05, "loss": 1.0382, "num_input_tokens_seen": 27279936, "step": 47000 }, { "epoch": 7.001042597557343, "grad_norm": 1.513719916343689, "learning_rate": 4.106670300521033e-05, "loss": 0.6956, "num_input_tokens_seen": 27282816, "step": 47005 }, { "epoch": 7.001787310098302, "grad_norm": 1.8657807111740112, "learning_rate": 4.106421334250593e-05, "loss": 0.5432, "num_input_tokens_seen": 27285568, "step": 47010 }, { "epoch": 7.0025320226392616, "grad_norm": 1.2104004621505737, "learning_rate": 4.106172340841298e-05, "loss": 0.6092, "num_input_tokens_seen": 27288736, "step": 47015 }, { "epoch": 7.00327673518022, "grad_norm": 1.1513762474060059, "learning_rate": 4.105923320297353e-05, "loss": 0.6923, "num_input_tokens_seen": 27291392, "step": 47020 }, { "epoch": 7.00402144772118, "grad_norm": 0.44056758284568787, "learning_rate": 4.1056742726229655e-05, "loss": 0.4887, "num_input_tokens_seen": 27294080, "step": 47025 }, { "epoch": 7.004766160262139, "grad_norm": 1.4450815916061401, "learning_rate": 4.105425197822344e-05, "loss": 0.7033, "num_input_tokens_seen": 27297024, "step": 47030 }, { "epoch": 7.005510872803098, "grad_norm": 1.9837729930877686, "learning_rate": 4.105176095899696e-05, "loss": 0.7718, "num_input_tokens_seen": 27300288, "step": 47035 }, { "epoch": 7.006255585344057, "grad_norm": 1.7600455284118652, "learning_rate": 4.104926966859227e-05, "loss": 0.6839, "num_input_tokens_seen": 27303296, "step": 47040 }, { "epoch": 7.007000297885017, "grad_norm": 1.501641035079956, "learning_rate": 4.1046778107051495e-05, "loss": 0.586, "num_input_tokens_seen": 27306336, "step": 47045 }, { "epoch": 7.0077450104259755, "grad_norm": 1.359817385673523, "learning_rate": 4.104428627441672e-05, "loss": 0.7298, "num_input_tokens_seen": 27309280, "step": 47050 }, { "epoch": 7.008489722966935, "grad_norm": 1.1043957471847534, "learning_rate": 4.104179417073002e-05, "loss": 0.7235, "num_input_tokens_seen": 27312096, "step": 47055 }, { "epoch": 7.009234435507894, "grad_norm": 1.1460206508636475, "learning_rate": 4.103930179603352e-05, "loss": 0.6317, "num_input_tokens_seen": 27314976, "step": 47060 }, { "epoch": 7.009979148048854, "grad_norm": 1.5284979343414307, "learning_rate": 4.103680915036932e-05, "loss": 0.4828, "num_input_tokens_seen": 27317824, "step": 47065 }, { "epoch": 7.010723860589812, "grad_norm": 1.2664839029312134, "learning_rate": 4.1034316233779526e-05, "loss": 0.5464, "num_input_tokens_seen": 27320608, "step": 47070 }, { "epoch": 7.011468573130771, "grad_norm": 2.674905300140381, "learning_rate": 4.103182304630625e-05, "loss": 0.7588, "num_input_tokens_seen": 27324032, "step": 47075 }, { "epoch": 7.012213285671731, "grad_norm": 1.0950002670288086, "learning_rate": 4.102932958799163e-05, "loss": 0.8585, "num_input_tokens_seen": 27327008, "step": 47080 }, { "epoch": 7.0129579982126895, "grad_norm": 1.9274487495422363, "learning_rate": 4.102683585887777e-05, "loss": 0.6848, "num_input_tokens_seen": 27329664, "step": 47085 }, { "epoch": 7.013702710753649, "grad_norm": 1.7930920124053955, "learning_rate": 4.102434185900681e-05, "loss": 0.6881, "num_input_tokens_seen": 27332960, "step": 47090 }, { "epoch": 7.014447423294608, "grad_norm": 1.599745750427246, "learning_rate": 4.1021847588420876e-05, "loss": 0.4519, "num_input_tokens_seen": 27336032, "step": 47095 }, { "epoch": 7.0151921358355676, "grad_norm": 0.7982409596443176, "learning_rate": 4.101935304716211e-05, "loss": 0.6863, "num_input_tokens_seen": 27338944, "step": 47100 }, { "epoch": 7.015936848376526, "grad_norm": 1.2432773113250732, "learning_rate": 4.101685823527266e-05, "loss": 0.6039, "num_input_tokens_seen": 27341632, "step": 47105 }, { "epoch": 7.016681560917486, "grad_norm": 1.567291259765625, "learning_rate": 4.1014363152794664e-05, "loss": 0.6991, "num_input_tokens_seen": 27344352, "step": 47110 }, { "epoch": 7.017426273458445, "grad_norm": 0.9772177934646606, "learning_rate": 4.101186779977029e-05, "loss": 0.7957, "num_input_tokens_seen": 27347520, "step": 47115 }, { "epoch": 7.018170985999404, "grad_norm": 2.432281494140625, "learning_rate": 4.1009372176241675e-05, "loss": 0.5336, "num_input_tokens_seen": 27350208, "step": 47120 }, { "epoch": 7.018915698540363, "grad_norm": 1.3146313428878784, "learning_rate": 4.100687628225099e-05, "loss": 0.7134, "num_input_tokens_seen": 27352960, "step": 47125 }, { "epoch": 7.019660411081323, "grad_norm": 1.2445571422576904, "learning_rate": 4.10043801178404e-05, "loss": 0.6416, "num_input_tokens_seen": 27355584, "step": 47130 }, { "epoch": 7.0204051236222815, "grad_norm": 0.6276538968086243, "learning_rate": 4.100188368305207e-05, "loss": 0.6145, "num_input_tokens_seen": 27358624, "step": 47135 }, { "epoch": 7.021149836163241, "grad_norm": 1.173672080039978, "learning_rate": 4.099938697792818e-05, "loss": 0.5338, "num_input_tokens_seen": 27361440, "step": 47140 }, { "epoch": 7.0218945487042, "grad_norm": 1.2616560459136963, "learning_rate": 4.099689000251091e-05, "loss": 0.7091, "num_input_tokens_seen": 27364320, "step": 47145 }, { "epoch": 7.02263926124516, "grad_norm": 0.798516571521759, "learning_rate": 4.0994392756842444e-05, "loss": 0.51, "num_input_tokens_seen": 27367136, "step": 47150 }, { "epoch": 7.023383973786118, "grad_norm": 1.521643877029419, "learning_rate": 4.099189524096496e-05, "loss": 0.7405, "num_input_tokens_seen": 27370432, "step": 47155 }, { "epoch": 7.024128686327078, "grad_norm": 0.7762227058410645, "learning_rate": 4.098939745492066e-05, "loss": 0.5507, "num_input_tokens_seen": 27373664, "step": 47160 }, { "epoch": 7.024873398868037, "grad_norm": 1.0748140811920166, "learning_rate": 4.0986899398751754e-05, "loss": 0.5329, "num_input_tokens_seen": 27376448, "step": 47165 }, { "epoch": 7.025618111408996, "grad_norm": 0.999416708946228, "learning_rate": 4.098440107250042e-05, "loss": 0.6333, "num_input_tokens_seen": 27379392, "step": 47170 }, { "epoch": 7.026362823949955, "grad_norm": 1.0272107124328613, "learning_rate": 4.098190247620888e-05, "loss": 0.6769, "num_input_tokens_seen": 27382240, "step": 47175 }, { "epoch": 7.027107536490915, "grad_norm": 0.6939693093299866, "learning_rate": 4.097940360991934e-05, "loss": 0.6399, "num_input_tokens_seen": 27385472, "step": 47180 }, { "epoch": 7.0278522490318736, "grad_norm": 1.2491079568862915, "learning_rate": 4.097690447367402e-05, "loss": 0.7459, "num_input_tokens_seen": 27388256, "step": 47185 }, { "epoch": 7.028596961572833, "grad_norm": 2.8696610927581787, "learning_rate": 4.097440506751513e-05, "loss": 0.924, "num_input_tokens_seen": 27391072, "step": 47190 }, { "epoch": 7.029341674113792, "grad_norm": 0.8882629871368408, "learning_rate": 4.097190539148491e-05, "loss": 0.5818, "num_input_tokens_seen": 27394048, "step": 47195 }, { "epoch": 7.030086386654752, "grad_norm": 0.8828050494194031, "learning_rate": 4.096940544562557e-05, "loss": 0.6675, "num_input_tokens_seen": 27397024, "step": 47200 }, { "epoch": 7.03083109919571, "grad_norm": 0.9883195161819458, "learning_rate": 4.096690522997936e-05, "loss": 0.5302, "num_input_tokens_seen": 27400096, "step": 47205 }, { "epoch": 7.03157581173667, "grad_norm": 0.9610217809677124, "learning_rate": 4.096440474458852e-05, "loss": 0.5325, "num_input_tokens_seen": 27402784, "step": 47210 }, { "epoch": 7.032320524277629, "grad_norm": 0.8828074932098389, "learning_rate": 4.096190398949529e-05, "loss": 0.7078, "num_input_tokens_seen": 27405280, "step": 47215 }, { "epoch": 7.033065236818588, "grad_norm": 0.9417130947113037, "learning_rate": 4.09594029647419e-05, "loss": 0.6431, "num_input_tokens_seen": 27408704, "step": 47220 }, { "epoch": 7.033809949359547, "grad_norm": 1.3970274925231934, "learning_rate": 4.095690167037063e-05, "loss": 0.4884, "num_input_tokens_seen": 27411392, "step": 47225 }, { "epoch": 7.034554661900507, "grad_norm": 1.269295573234558, "learning_rate": 4.095440010642372e-05, "loss": 0.6088, "num_input_tokens_seen": 27414208, "step": 47230 }, { "epoch": 7.035299374441466, "grad_norm": 2.8951172828674316, "learning_rate": 4.0951898272943436e-05, "loss": 0.7574, "num_input_tokens_seen": 27416992, "step": 47235 }, { "epoch": 7.036044086982425, "grad_norm": 0.6388319730758667, "learning_rate": 4.094939616997204e-05, "loss": 0.6173, "num_input_tokens_seen": 27419840, "step": 47240 }, { "epoch": 7.036788799523384, "grad_norm": 1.6580170392990112, "learning_rate": 4.094689379755181e-05, "loss": 0.5549, "num_input_tokens_seen": 27422752, "step": 47245 }, { "epoch": 7.037533512064343, "grad_norm": 0.5899856686592102, "learning_rate": 4.094439115572502e-05, "loss": 0.5512, "num_input_tokens_seen": 27425504, "step": 47250 }, { "epoch": 7.038278224605302, "grad_norm": 1.5308170318603516, "learning_rate": 4.094188824453394e-05, "loss": 0.5634, "num_input_tokens_seen": 27428384, "step": 47255 }, { "epoch": 7.039022937146261, "grad_norm": 1.1924772262573242, "learning_rate": 4.0939385064020866e-05, "loss": 0.6366, "num_input_tokens_seen": 27431360, "step": 47260 }, { "epoch": 7.039767649687221, "grad_norm": 1.120323896408081, "learning_rate": 4.093688161422808e-05, "loss": 0.4934, "num_input_tokens_seen": 27434208, "step": 47265 }, { "epoch": 7.0405123622281796, "grad_norm": 1.2905223369598389, "learning_rate": 4.093437789519787e-05, "loss": 0.6835, "num_input_tokens_seen": 27436864, "step": 47270 }, { "epoch": 7.041257074769139, "grad_norm": 2.394352912902832, "learning_rate": 4.093187390697255e-05, "loss": 0.8803, "num_input_tokens_seen": 27439712, "step": 47275 }, { "epoch": 7.042001787310098, "grad_norm": 1.214242696762085, "learning_rate": 4.0929369649594416e-05, "loss": 0.5251, "num_input_tokens_seen": 27442464, "step": 47280 }, { "epoch": 7.042746499851058, "grad_norm": 2.1085057258605957, "learning_rate": 4.092686512310576e-05, "loss": 0.6393, "num_input_tokens_seen": 27445184, "step": 47285 }, { "epoch": 7.043491212392016, "grad_norm": 1.1712520122528076, "learning_rate": 4.09243603275489e-05, "loss": 0.5183, "num_input_tokens_seen": 27447840, "step": 47290 }, { "epoch": 7.044235924932976, "grad_norm": 0.8035867810249329, "learning_rate": 4.092185526296618e-05, "loss": 0.5157, "num_input_tokens_seen": 27451008, "step": 47295 }, { "epoch": 7.044980637473935, "grad_norm": 0.7739044427871704, "learning_rate": 4.091934992939989e-05, "loss": 0.5227, "num_input_tokens_seen": 27454400, "step": 47300 }, { "epoch": 7.045725350014894, "grad_norm": 0.8611094355583191, "learning_rate": 4.0916844326892344e-05, "loss": 0.7978, "num_input_tokens_seen": 27457280, "step": 47305 }, { "epoch": 7.046470062555853, "grad_norm": 0.9865660667419434, "learning_rate": 4.091433845548591e-05, "loss": 0.7172, "num_input_tokens_seen": 27460256, "step": 47310 }, { "epoch": 7.047214775096813, "grad_norm": 1.3585221767425537, "learning_rate": 4.0911832315222896e-05, "loss": 0.6509, "num_input_tokens_seen": 27463072, "step": 47315 }, { "epoch": 7.047959487637772, "grad_norm": 1.2795239686965942, "learning_rate": 4.090932590614565e-05, "loss": 0.6194, "num_input_tokens_seen": 27465728, "step": 47320 }, { "epoch": 7.048704200178731, "grad_norm": 1.8787797689437866, "learning_rate": 4.09068192282965e-05, "loss": 0.7619, "num_input_tokens_seen": 27469024, "step": 47325 }, { "epoch": 7.04944891271969, "grad_norm": 1.6875245571136475, "learning_rate": 4.090431228171782e-05, "loss": 0.7077, "num_input_tokens_seen": 27471904, "step": 47330 }, { "epoch": 7.05019362526065, "grad_norm": 2.1995790004730225, "learning_rate": 4.0901805066451946e-05, "loss": 0.7098, "num_input_tokens_seen": 27474880, "step": 47335 }, { "epoch": 7.050938337801608, "grad_norm": 2.062324047088623, "learning_rate": 4.089929758254123e-05, "loss": 0.5958, "num_input_tokens_seen": 27477888, "step": 47340 }, { "epoch": 7.051683050342568, "grad_norm": 1.383070468902588, "learning_rate": 4.089678983002805e-05, "loss": 0.8513, "num_input_tokens_seen": 27480672, "step": 47345 }, { "epoch": 7.052427762883527, "grad_norm": 1.2906945943832397, "learning_rate": 4.089428180895476e-05, "loss": 0.6539, "num_input_tokens_seen": 27483424, "step": 47350 }, { "epoch": 7.053172475424486, "grad_norm": 1.6854844093322754, "learning_rate": 4.089177351936373e-05, "loss": 0.4792, "num_input_tokens_seen": 27486176, "step": 47355 }, { "epoch": 7.053917187965445, "grad_norm": 0.9893122315406799, "learning_rate": 4.0889264961297336e-05, "loss": 0.7469, "num_input_tokens_seen": 27489120, "step": 47360 }, { "epoch": 7.054661900506405, "grad_norm": 0.7678108811378479, "learning_rate": 4.0886756134797964e-05, "loss": 0.6058, "num_input_tokens_seen": 27491968, "step": 47365 }, { "epoch": 7.055406613047364, "grad_norm": 1.4138996601104736, "learning_rate": 4.0884247039907984e-05, "loss": 0.6317, "num_input_tokens_seen": 27494816, "step": 47370 }, { "epoch": 7.056151325588323, "grad_norm": 1.4822007417678833, "learning_rate": 4.0881737676669813e-05, "loss": 0.7197, "num_input_tokens_seen": 27497792, "step": 47375 }, { "epoch": 7.056896038129282, "grad_norm": 1.6794376373291016, "learning_rate": 4.087922804512582e-05, "loss": 0.8015, "num_input_tokens_seen": 27500576, "step": 47380 }, { "epoch": 7.057640750670242, "grad_norm": 0.9681773781776428, "learning_rate": 4.087671814531839e-05, "loss": 0.5842, "num_input_tokens_seen": 27503456, "step": 47385 }, { "epoch": 7.0583854632112, "grad_norm": 1.26748526096344, "learning_rate": 4.087420797728996e-05, "loss": 0.7855, "num_input_tokens_seen": 27506272, "step": 47390 }, { "epoch": 7.05913017575216, "grad_norm": 1.0824358463287354, "learning_rate": 4.087169754108292e-05, "loss": 0.4514, "num_input_tokens_seen": 27509184, "step": 47395 }, { "epoch": 7.059874888293119, "grad_norm": 1.7804011106491089, "learning_rate": 4.0869186836739674e-05, "loss": 0.5457, "num_input_tokens_seen": 27511808, "step": 47400 }, { "epoch": 7.0606196008340785, "grad_norm": 0.9086872339248657, "learning_rate": 4.086667586430265e-05, "loss": 0.6103, "num_input_tokens_seen": 27514848, "step": 47405 }, { "epoch": 7.061364313375037, "grad_norm": 0.9695812463760376, "learning_rate": 4.086416462381426e-05, "loss": 0.5917, "num_input_tokens_seen": 27517632, "step": 47410 }, { "epoch": 7.062109025915996, "grad_norm": 1.1636251211166382, "learning_rate": 4.086165311531694e-05, "loss": 0.5911, "num_input_tokens_seen": 27520640, "step": 47415 }, { "epoch": 7.062853738456956, "grad_norm": 1.3142657279968262, "learning_rate": 4.085914133885311e-05, "loss": 0.6499, "num_input_tokens_seen": 27523648, "step": 47420 }, { "epoch": 7.063598450997914, "grad_norm": 1.2194080352783203, "learning_rate": 4.08566292944652e-05, "loss": 0.6307, "num_input_tokens_seen": 27526400, "step": 47425 }, { "epoch": 7.064343163538874, "grad_norm": 1.1927666664123535, "learning_rate": 4.085411698219566e-05, "loss": 0.5926, "num_input_tokens_seen": 27529376, "step": 47430 }, { "epoch": 7.065087876079833, "grad_norm": 0.8235465288162231, "learning_rate": 4.085160440208692e-05, "loss": 0.5614, "num_input_tokens_seen": 27531968, "step": 47435 }, { "epoch": 7.065832588620792, "grad_norm": 1.1436721086502075, "learning_rate": 4.084909155418143e-05, "loss": 0.625, "num_input_tokens_seen": 27534784, "step": 47440 }, { "epoch": 7.066577301161751, "grad_norm": 0.6653091311454773, "learning_rate": 4.084657843852166e-05, "loss": 0.6049, "num_input_tokens_seen": 27537920, "step": 47445 }, { "epoch": 7.067322013702711, "grad_norm": 1.368608832359314, "learning_rate": 4.0844065055150046e-05, "loss": 0.6923, "num_input_tokens_seen": 27540800, "step": 47450 }, { "epoch": 7.06806672624367, "grad_norm": 1.0555323362350464, "learning_rate": 4.0841551404109056e-05, "loss": 0.5507, "num_input_tokens_seen": 27543360, "step": 47455 }, { "epoch": 7.068811438784629, "grad_norm": 1.1093474626541138, "learning_rate": 4.083903748544116e-05, "loss": 0.7389, "num_input_tokens_seen": 27546240, "step": 47460 }, { "epoch": 7.069556151325588, "grad_norm": 0.8997598886489868, "learning_rate": 4.0836523299188826e-05, "loss": 0.4563, "num_input_tokens_seen": 27549248, "step": 47465 }, { "epoch": 7.070300863866548, "grad_norm": 1.2053362131118774, "learning_rate": 4.083400884539452e-05, "loss": 0.5583, "num_input_tokens_seen": 27552512, "step": 47470 }, { "epoch": 7.071045576407506, "grad_norm": 0.9188858270645142, "learning_rate": 4.083149412410072e-05, "loss": 0.5753, "num_input_tokens_seen": 27555360, "step": 47475 }, { "epoch": 7.071790288948466, "grad_norm": 1.2226173877716064, "learning_rate": 4.082897913534993e-05, "loss": 0.5895, "num_input_tokens_seen": 27558432, "step": 47480 }, { "epoch": 7.072535001489425, "grad_norm": 1.4373854398727417, "learning_rate": 4.0826463879184615e-05, "loss": 0.5728, "num_input_tokens_seen": 27561184, "step": 47485 }, { "epoch": 7.0732797140303845, "grad_norm": 2.2520925998687744, "learning_rate": 4.082394835564729e-05, "loss": 0.6331, "num_input_tokens_seen": 27564288, "step": 47490 }, { "epoch": 7.074024426571343, "grad_norm": 1.5697332620620728, "learning_rate": 4.082143256478044e-05, "loss": 0.5845, "num_input_tokens_seen": 27567008, "step": 47495 }, { "epoch": 7.074769139112303, "grad_norm": 1.235958218574524, "learning_rate": 4.081891650662656e-05, "loss": 0.5837, "num_input_tokens_seen": 27569920, "step": 47500 }, { "epoch": 7.075513851653262, "grad_norm": 3.636502265930176, "learning_rate": 4.0816400181228165e-05, "loss": 0.7574, "num_input_tokens_seen": 27572864, "step": 47505 }, { "epoch": 7.076258564194221, "grad_norm": 1.4738456010818481, "learning_rate": 4.081388358862776e-05, "loss": 0.6283, "num_input_tokens_seen": 27575776, "step": 47510 }, { "epoch": 7.07700327673518, "grad_norm": 1.3881416320800781, "learning_rate": 4.0811366728867874e-05, "loss": 0.704, "num_input_tokens_seen": 27578560, "step": 47515 }, { "epoch": 7.07774798927614, "grad_norm": 3.052945852279663, "learning_rate": 4.080884960199101e-05, "loss": 0.6596, "num_input_tokens_seen": 27581600, "step": 47520 }, { "epoch": 7.078492701817098, "grad_norm": 0.9150389432907104, "learning_rate": 4.08063322080397e-05, "loss": 0.5883, "num_input_tokens_seen": 27584640, "step": 47525 }, { "epoch": 7.079237414358058, "grad_norm": 1.0938851833343506, "learning_rate": 4.080381454705647e-05, "loss": 0.7188, "num_input_tokens_seen": 27587616, "step": 47530 }, { "epoch": 7.079982126899017, "grad_norm": 0.8383574485778809, "learning_rate": 4.080129661908386e-05, "loss": 0.6155, "num_input_tokens_seen": 27590368, "step": 47535 }, { "epoch": 7.0807268394399765, "grad_norm": 1.4835184812545776, "learning_rate": 4.07987784241644e-05, "loss": 0.4052, "num_input_tokens_seen": 27593152, "step": 47540 }, { "epoch": 7.081471551980935, "grad_norm": 0.9432562589645386, "learning_rate": 4.0796259962340636e-05, "loss": 0.5264, "num_input_tokens_seen": 27596064, "step": 47545 }, { "epoch": 7.082216264521895, "grad_norm": 0.9956982731819153, "learning_rate": 4.079374123365512e-05, "loss": 0.5216, "num_input_tokens_seen": 27598912, "step": 47550 }, { "epoch": 7.082960977062854, "grad_norm": 1.1425230503082275, "learning_rate": 4.079122223815039e-05, "loss": 0.6806, "num_input_tokens_seen": 27602016, "step": 47555 }, { "epoch": 7.083705689603813, "grad_norm": 2.5780789852142334, "learning_rate": 4.0788702975869013e-05, "loss": 0.7761, "num_input_tokens_seen": 27604608, "step": 47560 }, { "epoch": 7.084450402144772, "grad_norm": 1.6249111890792847, "learning_rate": 4.0786183446853545e-05, "loss": 0.6446, "num_input_tokens_seen": 27607520, "step": 47565 }, { "epoch": 7.085195114685732, "grad_norm": 1.2556406259536743, "learning_rate": 4.0783663651146555e-05, "loss": 0.6153, "num_input_tokens_seen": 27610784, "step": 47570 }, { "epoch": 7.0859398272266905, "grad_norm": 1.0361912250518799, "learning_rate": 4.078114358879061e-05, "loss": 0.6654, "num_input_tokens_seen": 27613632, "step": 47575 }, { "epoch": 7.08668453976765, "grad_norm": 1.4729117155075073, "learning_rate": 4.077862325982828e-05, "loss": 0.7158, "num_input_tokens_seen": 27616448, "step": 47580 }, { "epoch": 7.087429252308609, "grad_norm": 1.0388984680175781, "learning_rate": 4.077610266430215e-05, "loss": 0.6969, "num_input_tokens_seen": 27619392, "step": 47585 }, { "epoch": 7.088173964849568, "grad_norm": 1.3280820846557617, "learning_rate": 4.0773581802254795e-05, "loss": 0.5206, "num_input_tokens_seen": 27622304, "step": 47590 }, { "epoch": 7.088918677390527, "grad_norm": 1.950892448425293, "learning_rate": 4.077106067372881e-05, "loss": 0.7636, "num_input_tokens_seen": 27625120, "step": 47595 }, { "epoch": 7.089663389931486, "grad_norm": 0.9419682025909424, "learning_rate": 4.0768539278766784e-05, "loss": 0.508, "num_input_tokens_seen": 27628576, "step": 47600 }, { "epoch": 7.090408102472446, "grad_norm": 1.2543314695358276, "learning_rate": 4.076601761741131e-05, "loss": 0.6764, "num_input_tokens_seen": 27631296, "step": 47605 }, { "epoch": 7.091152815013404, "grad_norm": 1.2430795431137085, "learning_rate": 4.0763495689705004e-05, "loss": 0.6819, "num_input_tokens_seen": 27634048, "step": 47610 }, { "epoch": 7.091897527554364, "grad_norm": 0.8765020370483398, "learning_rate": 4.076097349569044e-05, "loss": 0.6405, "num_input_tokens_seen": 27637216, "step": 47615 }, { "epoch": 7.092642240095323, "grad_norm": 1.1609092950820923, "learning_rate": 4.075845103541026e-05, "loss": 0.6371, "num_input_tokens_seen": 27640160, "step": 47620 }, { "epoch": 7.0933869526362825, "grad_norm": 1.1366441249847412, "learning_rate": 4.0755928308907065e-05, "loss": 0.5932, "num_input_tokens_seen": 27642944, "step": 47625 }, { "epoch": 7.094131665177241, "grad_norm": 1.2394553422927856, "learning_rate": 4.0753405316223476e-05, "loss": 0.6117, "num_input_tokens_seen": 27645856, "step": 47630 }, { "epoch": 7.094876377718201, "grad_norm": 1.012453556060791, "learning_rate": 4.0750882057402116e-05, "loss": 0.6832, "num_input_tokens_seen": 27648800, "step": 47635 }, { "epoch": 7.09562109025916, "grad_norm": 1.2476806640625, "learning_rate": 4.074835853248561e-05, "loss": 0.6499, "num_input_tokens_seen": 27651680, "step": 47640 }, { "epoch": 7.096365802800119, "grad_norm": 0.8503163456916809, "learning_rate": 4.074583474151659e-05, "loss": 0.5885, "num_input_tokens_seen": 27654624, "step": 47645 }, { "epoch": 7.097110515341078, "grad_norm": 1.3800244331359863, "learning_rate": 4.074331068453769e-05, "loss": 0.5683, "num_input_tokens_seen": 27657504, "step": 47650 }, { "epoch": 7.097855227882038, "grad_norm": 1.630357027053833, "learning_rate": 4.0740786361591565e-05, "loss": 0.5595, "num_input_tokens_seen": 27660224, "step": 47655 }, { "epoch": 7.0985999404229965, "grad_norm": 1.2871012687683105, "learning_rate": 4.073826177272085e-05, "loss": 0.7066, "num_input_tokens_seen": 27663232, "step": 47660 }, { "epoch": 7.099344652963956, "grad_norm": 2.346799850463867, "learning_rate": 4.0735736917968205e-05, "loss": 0.6667, "num_input_tokens_seen": 27665984, "step": 47665 }, { "epoch": 7.100089365504915, "grad_norm": 1.5715731382369995, "learning_rate": 4.073321179737627e-05, "loss": 0.6122, "num_input_tokens_seen": 27669152, "step": 47670 }, { "epoch": 7.1008340780458745, "grad_norm": 1.967562198638916, "learning_rate": 4.073068641098772e-05, "loss": 0.6909, "num_input_tokens_seen": 27672032, "step": 47675 }, { "epoch": 7.101578790586833, "grad_norm": 1.3847845792770386, "learning_rate": 4.07281607588452e-05, "loss": 0.6696, "num_input_tokens_seen": 27674720, "step": 47680 }, { "epoch": 7.102323503127793, "grad_norm": 1.4085174798965454, "learning_rate": 4.07256348409914e-05, "loss": 0.7053, "num_input_tokens_seen": 27677984, "step": 47685 }, { "epoch": 7.103068215668752, "grad_norm": 1.042776346206665, "learning_rate": 4.072310865746898e-05, "loss": 0.4475, "num_input_tokens_seen": 27680768, "step": 47690 }, { "epoch": 7.103812928209711, "grad_norm": 1.144431471824646, "learning_rate": 4.072058220832061e-05, "loss": 0.6244, "num_input_tokens_seen": 27683264, "step": 47695 }, { "epoch": 7.10455764075067, "grad_norm": 2.0184741020202637, "learning_rate": 4.071805549358899e-05, "loss": 0.7063, "num_input_tokens_seen": 27686304, "step": 47700 }, { "epoch": 7.10530235329163, "grad_norm": 1.298771619796753, "learning_rate": 4.0715528513316796e-05, "loss": 0.915, "num_input_tokens_seen": 27689280, "step": 47705 }, { "epoch": 7.1060470658325885, "grad_norm": 1.3698513507843018, "learning_rate": 4.0713001267546724e-05, "loss": 0.613, "num_input_tokens_seen": 27692160, "step": 47710 }, { "epoch": 7.106791778373548, "grad_norm": 1.1954842805862427, "learning_rate": 4.0710473756321453e-05, "loss": 0.7759, "num_input_tokens_seen": 27694816, "step": 47715 }, { "epoch": 7.107536490914507, "grad_norm": 0.9455753564834595, "learning_rate": 4.07079459796837e-05, "loss": 0.8146, "num_input_tokens_seen": 27697760, "step": 47720 }, { "epoch": 7.1082812034554665, "grad_norm": 1.0836023092269897, "learning_rate": 4.070541793767618e-05, "loss": 0.6537, "num_input_tokens_seen": 27701024, "step": 47725 }, { "epoch": 7.109025915996425, "grad_norm": 1.181604266166687, "learning_rate": 4.0702889630341566e-05, "loss": 0.5886, "num_input_tokens_seen": 27703712, "step": 47730 }, { "epoch": 7.109770628537385, "grad_norm": 0.9784172177314758, "learning_rate": 4.07003610577226e-05, "loss": 0.5905, "num_input_tokens_seen": 27706592, "step": 47735 }, { "epoch": 7.110515341078344, "grad_norm": 1.1821155548095703, "learning_rate": 4.0697832219862e-05, "loss": 0.5982, "num_input_tokens_seen": 27709408, "step": 47740 }, { "epoch": 7.111260053619303, "grad_norm": 1.8276209831237793, "learning_rate": 4.069530311680247e-05, "loss": 0.5453, "num_input_tokens_seen": 27712160, "step": 47745 }, { "epoch": 7.112004766160262, "grad_norm": 1.4376953840255737, "learning_rate": 4.0692773748586743e-05, "loss": 0.7691, "num_input_tokens_seen": 27715232, "step": 47750 }, { "epoch": 7.112749478701222, "grad_norm": 1.2035382986068726, "learning_rate": 4.069024411525756e-05, "loss": 0.6517, "num_input_tokens_seen": 27718208, "step": 47755 }, { "epoch": 7.1134941912421805, "grad_norm": 1.6275895833969116, "learning_rate": 4.0687714216857645e-05, "loss": 0.7258, "num_input_tokens_seen": 27721056, "step": 47760 }, { "epoch": 7.114238903783139, "grad_norm": 2.396411180496216, "learning_rate": 4.068518405342974e-05, "loss": 0.6668, "num_input_tokens_seen": 27724000, "step": 47765 }, { "epoch": 7.114983616324099, "grad_norm": 2.833906412124634, "learning_rate": 4.068265362501659e-05, "loss": 0.7857, "num_input_tokens_seen": 27727008, "step": 47770 }, { "epoch": 7.115728328865058, "grad_norm": 0.8680848479270935, "learning_rate": 4.0680122931660955e-05, "loss": 0.5767, "num_input_tokens_seen": 27730176, "step": 47775 }, { "epoch": 7.116473041406017, "grad_norm": 0.5907221436500549, "learning_rate": 4.067759197340558e-05, "loss": 0.5943, "num_input_tokens_seen": 27732928, "step": 47780 }, { "epoch": 7.117217753946976, "grad_norm": 1.8807445764541626, "learning_rate": 4.0675060750293216e-05, "loss": 0.7727, "num_input_tokens_seen": 27736160, "step": 47785 }, { "epoch": 7.117962466487936, "grad_norm": 1.1344741582870483, "learning_rate": 4.067252926236663e-05, "loss": 0.7322, "num_input_tokens_seen": 27739168, "step": 47790 }, { "epoch": 7.1187071790288945, "grad_norm": 1.2552099227905273, "learning_rate": 4.06699975096686e-05, "loss": 0.7957, "num_input_tokens_seen": 27742304, "step": 47795 }, { "epoch": 7.119451891569854, "grad_norm": 1.6501774787902832, "learning_rate": 4.066746549224189e-05, "loss": 0.7516, "num_input_tokens_seen": 27745120, "step": 47800 }, { "epoch": 7.120196604110813, "grad_norm": 1.1835527420043945, "learning_rate": 4.0664933210129265e-05, "loss": 0.6598, "num_input_tokens_seen": 27747968, "step": 47805 }, { "epoch": 7.1209413166517725, "grad_norm": 1.1708370447158813, "learning_rate": 4.066240066337351e-05, "loss": 0.9327, "num_input_tokens_seen": 27750816, "step": 47810 }, { "epoch": 7.121686029192731, "grad_norm": 1.0332207679748535, "learning_rate": 4.065986785201743e-05, "loss": 0.5652, "num_input_tokens_seen": 27753728, "step": 47815 }, { "epoch": 7.122430741733691, "grad_norm": 1.3475117683410645, "learning_rate": 4.065733477610379e-05, "loss": 0.6425, "num_input_tokens_seen": 27756544, "step": 47820 }, { "epoch": 7.12317545427465, "grad_norm": 0.9438719153404236, "learning_rate": 4.065480143567539e-05, "loss": 0.627, "num_input_tokens_seen": 27759424, "step": 47825 }, { "epoch": 7.123920166815609, "grad_norm": 1.2841812372207642, "learning_rate": 4.065226783077504e-05, "loss": 0.5959, "num_input_tokens_seen": 27762112, "step": 47830 }, { "epoch": 7.124664879356568, "grad_norm": 1.135240912437439, "learning_rate": 4.0649733961445525e-05, "loss": 0.5294, "num_input_tokens_seen": 27764896, "step": 47835 }, { "epoch": 7.125409591897528, "grad_norm": 1.334723949432373, "learning_rate": 4.064719982772965e-05, "loss": 0.4952, "num_input_tokens_seen": 27767808, "step": 47840 }, { "epoch": 7.1261543044384865, "grad_norm": 1.7800555229187012, "learning_rate": 4.064466542967026e-05, "loss": 0.5424, "num_input_tokens_seen": 27770656, "step": 47845 }, { "epoch": 7.126899016979446, "grad_norm": 0.8084745407104492, "learning_rate": 4.0642130767310136e-05, "loss": 0.6335, "num_input_tokens_seen": 27773472, "step": 47850 }, { "epoch": 7.127643729520405, "grad_norm": 1.7940247058868408, "learning_rate": 4.0639595840692116e-05, "loss": 0.6459, "num_input_tokens_seen": 27776448, "step": 47855 }, { "epoch": 7.128388442061365, "grad_norm": 0.7339383959770203, "learning_rate": 4.063706064985901e-05, "loss": 0.6196, "num_input_tokens_seen": 27779040, "step": 47860 }, { "epoch": 7.129133154602323, "grad_norm": 1.2301955223083496, "learning_rate": 4.063452519485367e-05, "loss": 0.6357, "num_input_tokens_seen": 27781664, "step": 47865 }, { "epoch": 7.129877867143283, "grad_norm": 1.1076453924179077, "learning_rate": 4.06319894757189e-05, "loss": 0.7689, "num_input_tokens_seen": 27784448, "step": 47870 }, { "epoch": 7.130622579684242, "grad_norm": 0.9921231269836426, "learning_rate": 4.062945349249757e-05, "loss": 0.5685, "num_input_tokens_seen": 27787296, "step": 47875 }, { "epoch": 7.131367292225201, "grad_norm": 1.1901878118515015, "learning_rate": 4.06269172452325e-05, "loss": 0.6562, "num_input_tokens_seen": 27790208, "step": 47880 }, { "epoch": 7.13211200476616, "grad_norm": 1.192940354347229, "learning_rate": 4.0624380733966546e-05, "loss": 0.6528, "num_input_tokens_seen": 27793216, "step": 47885 }, { "epoch": 7.13285671730712, "grad_norm": 1.526544451713562, "learning_rate": 4.062184395874257e-05, "loss": 0.7043, "num_input_tokens_seen": 27796256, "step": 47890 }, { "epoch": 7.1336014298480785, "grad_norm": 0.7647329568862915, "learning_rate": 4.0619306919603405e-05, "loss": 0.6196, "num_input_tokens_seen": 27799104, "step": 47895 }, { "epoch": 7.134346142389038, "grad_norm": 2.218949556350708, "learning_rate": 4.061676961659193e-05, "loss": 0.7335, "num_input_tokens_seen": 27801824, "step": 47900 }, { "epoch": 7.135090854929997, "grad_norm": 1.1208646297454834, "learning_rate": 4.061423204975101e-05, "loss": 0.5314, "num_input_tokens_seen": 27804608, "step": 47905 }, { "epoch": 7.135835567470957, "grad_norm": 1.0965914726257324, "learning_rate": 4.06116942191235e-05, "loss": 0.6103, "num_input_tokens_seen": 27807648, "step": 47910 }, { "epoch": 7.136580280011915, "grad_norm": 1.1692893505096436, "learning_rate": 4.060915612475229e-05, "loss": 0.5801, "num_input_tokens_seen": 27810240, "step": 47915 }, { "epoch": 7.137324992552875, "grad_norm": 1.7459608316421509, "learning_rate": 4.060661776668024e-05, "loss": 0.638, "num_input_tokens_seen": 27813216, "step": 47920 }, { "epoch": 7.138069705093834, "grad_norm": 0.9337958693504333, "learning_rate": 4.060407914495026e-05, "loss": 0.6064, "num_input_tokens_seen": 27816352, "step": 47925 }, { "epoch": 7.1388144176347925, "grad_norm": 0.8187044858932495, "learning_rate": 4.060154025960521e-05, "loss": 0.4667, "num_input_tokens_seen": 27819360, "step": 47930 }, { "epoch": 7.139559130175752, "grad_norm": 1.024213194847107, "learning_rate": 4.0599001110688e-05, "loss": 0.53, "num_input_tokens_seen": 27822432, "step": 47935 }, { "epoch": 7.140303842716711, "grad_norm": 1.7454168796539307, "learning_rate": 4.0596461698241524e-05, "loss": 0.6886, "num_input_tokens_seen": 27825376, "step": 47940 }, { "epoch": 7.141048555257671, "grad_norm": 0.8507464528083801, "learning_rate": 4.059392202230867e-05, "loss": 0.4245, "num_input_tokens_seen": 27828384, "step": 47945 }, { "epoch": 7.141793267798629, "grad_norm": 1.6407970190048218, "learning_rate": 4.059138208293236e-05, "loss": 0.4716, "num_input_tokens_seen": 27831328, "step": 47950 }, { "epoch": 7.142537980339589, "grad_norm": 1.6973938941955566, "learning_rate": 4.058884188015549e-05, "loss": 0.5018, "num_input_tokens_seen": 27834016, "step": 47955 }, { "epoch": 7.143282692880548, "grad_norm": 1.600732684135437, "learning_rate": 4.058630141402099e-05, "loss": 0.603, "num_input_tokens_seen": 27836672, "step": 47960 }, { "epoch": 7.144027405421507, "grad_norm": 2.9194908142089844, "learning_rate": 4.058376068457176e-05, "loss": 0.5924, "num_input_tokens_seen": 27839296, "step": 47965 }, { "epoch": 7.144772117962466, "grad_norm": 1.387347936630249, "learning_rate": 4.058121969185073e-05, "loss": 0.6021, "num_input_tokens_seen": 27842112, "step": 47970 }, { "epoch": 7.145516830503426, "grad_norm": 1.0129858255386353, "learning_rate": 4.057867843590083e-05, "loss": 0.5788, "num_input_tokens_seen": 27845024, "step": 47975 }, { "epoch": 7.1462615430443845, "grad_norm": 2.2349767684936523, "learning_rate": 4.0576136916765e-05, "loss": 0.6685, "num_input_tokens_seen": 27847776, "step": 47980 }, { "epoch": 7.147006255585344, "grad_norm": 1.3094407320022583, "learning_rate": 4.0573595134486166e-05, "loss": 0.6885, "num_input_tokens_seen": 27851008, "step": 47985 }, { "epoch": 7.147750968126303, "grad_norm": 1.3155369758605957, "learning_rate": 4.0571053089107256e-05, "loss": 0.6471, "num_input_tokens_seen": 27854240, "step": 47990 }, { "epoch": 7.148495680667263, "grad_norm": 2.2043991088867188, "learning_rate": 4.056851078067124e-05, "loss": 0.7823, "num_input_tokens_seen": 27857312, "step": 47995 }, { "epoch": 7.149240393208221, "grad_norm": 0.8782455921173096, "learning_rate": 4.056596820922106e-05, "loss": 0.6723, "num_input_tokens_seen": 27860032, "step": 48000 }, { "epoch": 7.149985105749181, "grad_norm": 1.108214020729065, "learning_rate": 4.0563425374799665e-05, "loss": 0.7412, "num_input_tokens_seen": 27863008, "step": 48005 }, { "epoch": 7.15072981829014, "grad_norm": 1.1515706777572632, "learning_rate": 4.0560882277450017e-05, "loss": 0.5692, "num_input_tokens_seen": 27865920, "step": 48010 }, { "epoch": 7.151474530831099, "grad_norm": 1.289228916168213, "learning_rate": 4.055833891721508e-05, "loss": 0.7512, "num_input_tokens_seen": 27869120, "step": 48015 }, { "epoch": 7.152219243372058, "grad_norm": 2.0396547317504883, "learning_rate": 4.0555795294137824e-05, "loss": 0.5627, "num_input_tokens_seen": 27872128, "step": 48020 }, { "epoch": 7.152963955913018, "grad_norm": 1.656968116760254, "learning_rate": 4.05532514082612e-05, "loss": 0.6293, "num_input_tokens_seen": 27874912, "step": 48025 }, { "epoch": 7.153708668453977, "grad_norm": 1.191774845123291, "learning_rate": 4.055070725962822e-05, "loss": 0.5308, "num_input_tokens_seen": 27877344, "step": 48030 }, { "epoch": 7.154453380994936, "grad_norm": 0.7895607352256775, "learning_rate": 4.0548162848281835e-05, "loss": 0.4804, "num_input_tokens_seen": 27880128, "step": 48035 }, { "epoch": 7.155198093535895, "grad_norm": 1.4936171770095825, "learning_rate": 4.0545618174265045e-05, "loss": 0.6093, "num_input_tokens_seen": 27882816, "step": 48040 }, { "epoch": 7.155942806076855, "grad_norm": 1.2389434576034546, "learning_rate": 4.054307323762083e-05, "loss": 0.4797, "num_input_tokens_seen": 27885504, "step": 48045 }, { "epoch": 7.156687518617813, "grad_norm": 2.10121750831604, "learning_rate": 4.05405280383922e-05, "loss": 0.7694, "num_input_tokens_seen": 27888480, "step": 48050 }, { "epoch": 7.157432231158773, "grad_norm": 1.784117341041565, "learning_rate": 4.053798257662213e-05, "loss": 0.5615, "num_input_tokens_seen": 27891328, "step": 48055 }, { "epoch": 7.158176943699732, "grad_norm": 1.1281436681747437, "learning_rate": 4.053543685235365e-05, "loss": 0.5938, "num_input_tokens_seen": 27894112, "step": 48060 }, { "epoch": 7.158921656240691, "grad_norm": 1.080414056777954, "learning_rate": 4.0532890865629744e-05, "loss": 0.6683, "num_input_tokens_seen": 27897216, "step": 48065 }, { "epoch": 7.15966636878165, "grad_norm": 1.7476222515106201, "learning_rate": 4.053034461649344e-05, "loss": 0.5937, "num_input_tokens_seen": 27900064, "step": 48070 }, { "epoch": 7.16041108132261, "grad_norm": 1.5393701791763306, "learning_rate": 4.0527798104987745e-05, "loss": 0.6098, "num_input_tokens_seen": 27903072, "step": 48075 }, { "epoch": 7.161155793863569, "grad_norm": 1.9689685106277466, "learning_rate": 4.052525133115569e-05, "loss": 0.7918, "num_input_tokens_seen": 27905984, "step": 48080 }, { "epoch": 7.161900506404528, "grad_norm": 1.1822479963302612, "learning_rate": 4.052270429504028e-05, "loss": 0.7868, "num_input_tokens_seen": 27908736, "step": 48085 }, { "epoch": 7.162645218945487, "grad_norm": 1.1369740962982178, "learning_rate": 4.0520156996684565e-05, "loss": 0.6694, "num_input_tokens_seen": 27911552, "step": 48090 }, { "epoch": 7.163389931486447, "grad_norm": 1.1195416450500488, "learning_rate": 4.051760943613158e-05, "loss": 0.645, "num_input_tokens_seen": 27914400, "step": 48095 }, { "epoch": 7.164134644027405, "grad_norm": 1.4009459018707275, "learning_rate": 4.0515061613424345e-05, "loss": 0.5102, "num_input_tokens_seen": 27917408, "step": 48100 }, { "epoch": 7.164879356568365, "grad_norm": 1.0378035306930542, "learning_rate": 4.051251352860591e-05, "loss": 0.6739, "num_input_tokens_seen": 27920896, "step": 48105 }, { "epoch": 7.165624069109324, "grad_norm": 1.367551565170288, "learning_rate": 4.0509965181719326e-05, "loss": 0.5457, "num_input_tokens_seen": 27923712, "step": 48110 }, { "epoch": 7.166368781650283, "grad_norm": 1.4758856296539307, "learning_rate": 4.050741657280765e-05, "loss": 0.589, "num_input_tokens_seen": 27926304, "step": 48115 }, { "epoch": 7.167113494191242, "grad_norm": 1.0747908353805542, "learning_rate": 4.050486770191393e-05, "loss": 0.6624, "num_input_tokens_seen": 27929216, "step": 48120 }, { "epoch": 7.167858206732201, "grad_norm": 1.1796479225158691, "learning_rate": 4.050231856908122e-05, "loss": 0.6755, "num_input_tokens_seen": 27931872, "step": 48125 }, { "epoch": 7.168602919273161, "grad_norm": 2.1289098262786865, "learning_rate": 4.04997691743526e-05, "loss": 0.7415, "num_input_tokens_seen": 27934784, "step": 48130 }, { "epoch": 7.169347631814119, "grad_norm": 0.7803676128387451, "learning_rate": 4.0497219517771137e-05, "loss": 0.6412, "num_input_tokens_seen": 27937536, "step": 48135 }, { "epoch": 7.170092344355079, "grad_norm": 1.0896657705307007, "learning_rate": 4.04946695993799e-05, "loss": 0.4949, "num_input_tokens_seen": 27940352, "step": 48140 }, { "epoch": 7.170837056896038, "grad_norm": 0.7555824518203735, "learning_rate": 4.0492119419221966e-05, "loss": 0.5556, "num_input_tokens_seen": 27943488, "step": 48145 }, { "epoch": 7.171581769436997, "grad_norm": 1.0900410413742065, "learning_rate": 4.048956897734042e-05, "loss": 0.5623, "num_input_tokens_seen": 27946560, "step": 48150 }, { "epoch": 7.172326481977956, "grad_norm": 1.2070367336273193, "learning_rate": 4.048701827377835e-05, "loss": 0.5386, "num_input_tokens_seen": 27949472, "step": 48155 }, { "epoch": 7.173071194518916, "grad_norm": 1.448713779449463, "learning_rate": 4.0484467308578844e-05, "loss": 0.6788, "num_input_tokens_seen": 27952288, "step": 48160 }, { "epoch": 7.173815907059875, "grad_norm": 1.4108691215515137, "learning_rate": 4.0481916081785e-05, "loss": 0.6611, "num_input_tokens_seen": 27955200, "step": 48165 }, { "epoch": 7.174560619600834, "grad_norm": 1.3631055355072021, "learning_rate": 4.047936459343992e-05, "loss": 0.6565, "num_input_tokens_seen": 27957952, "step": 48170 }, { "epoch": 7.175305332141793, "grad_norm": 0.6898892521858215, "learning_rate": 4.047681284358671e-05, "loss": 0.5202, "num_input_tokens_seen": 27960768, "step": 48175 }, { "epoch": 7.176050044682753, "grad_norm": 1.6268229484558105, "learning_rate": 4.0474260832268476e-05, "loss": 0.5845, "num_input_tokens_seen": 27963520, "step": 48180 }, { "epoch": 7.176794757223711, "grad_norm": 1.3076958656311035, "learning_rate": 4.047170855952833e-05, "loss": 0.734, "num_input_tokens_seen": 27966336, "step": 48185 }, { "epoch": 7.177539469764671, "grad_norm": 0.8472943305969238, "learning_rate": 4.04691560254094e-05, "loss": 0.6099, "num_input_tokens_seen": 27969408, "step": 48190 }, { "epoch": 7.17828418230563, "grad_norm": 1.8257299661636353, "learning_rate": 4.046660322995479e-05, "loss": 0.8041, "num_input_tokens_seen": 27972608, "step": 48195 }, { "epoch": 7.1790288948465895, "grad_norm": 1.1210521459579468, "learning_rate": 4.046405017320765e-05, "loss": 0.599, "num_input_tokens_seen": 27975328, "step": 48200 }, { "epoch": 7.179773607387548, "grad_norm": 1.345885157585144, "learning_rate": 4.046149685521109e-05, "loss": 0.6155, "num_input_tokens_seen": 27978240, "step": 48205 }, { "epoch": 7.180518319928508, "grad_norm": 1.1137651205062866, "learning_rate": 4.045894327600826e-05, "loss": 0.6004, "num_input_tokens_seen": 27981088, "step": 48210 }, { "epoch": 7.181263032469467, "grad_norm": 1.0386343002319336, "learning_rate": 4.04563894356423e-05, "loss": 0.6496, "num_input_tokens_seen": 27983872, "step": 48215 }, { "epoch": 7.182007745010426, "grad_norm": 1.1530468463897705, "learning_rate": 4.045383533415634e-05, "loss": 0.8192, "num_input_tokens_seen": 27986752, "step": 48220 }, { "epoch": 7.182752457551385, "grad_norm": 1.370058536529541, "learning_rate": 4.045128097159354e-05, "loss": 0.7023, "num_input_tokens_seen": 27989504, "step": 48225 }, { "epoch": 7.183497170092345, "grad_norm": 1.1209293603897095, "learning_rate": 4.044872634799706e-05, "loss": 0.5328, "num_input_tokens_seen": 27992256, "step": 48230 }, { "epoch": 7.184241882633303, "grad_norm": 0.9367300271987915, "learning_rate": 4.044617146341003e-05, "loss": 0.6584, "num_input_tokens_seen": 27995136, "step": 48235 }, { "epoch": 7.184986595174263, "grad_norm": 1.0863815546035767, "learning_rate": 4.044361631787565e-05, "loss": 0.7078, "num_input_tokens_seen": 27998048, "step": 48240 }, { "epoch": 7.185731307715222, "grad_norm": 0.8951435685157776, "learning_rate": 4.044106091143707e-05, "loss": 0.5662, "num_input_tokens_seen": 28000896, "step": 48245 }, { "epoch": 7.1864760202561815, "grad_norm": 1.1248102188110352, "learning_rate": 4.043850524413745e-05, "loss": 0.5776, "num_input_tokens_seen": 28004000, "step": 48250 }, { "epoch": 7.18722073279714, "grad_norm": 1.7449578046798706, "learning_rate": 4.0435949316019974e-05, "loss": 0.6395, "num_input_tokens_seen": 28006752, "step": 48255 }, { "epoch": 7.1879654453381, "grad_norm": 1.060421109199524, "learning_rate": 4.0433393127127827e-05, "loss": 0.6203, "num_input_tokens_seen": 28009760, "step": 48260 }, { "epoch": 7.188710157879059, "grad_norm": 1.4444491863250732, "learning_rate": 4.043083667750419e-05, "loss": 0.5209, "num_input_tokens_seen": 28012512, "step": 48265 }, { "epoch": 7.189454870420018, "grad_norm": 0.8803439736366272, "learning_rate": 4.042827996719225e-05, "loss": 0.609, "num_input_tokens_seen": 28015328, "step": 48270 }, { "epoch": 7.190199582960977, "grad_norm": 1.861312747001648, "learning_rate": 4.04257229962352e-05, "loss": 0.8972, "num_input_tokens_seen": 28018048, "step": 48275 }, { "epoch": 7.190944295501936, "grad_norm": 0.9022408723831177, "learning_rate": 4.042316576467624e-05, "loss": 0.5463, "num_input_tokens_seen": 28020832, "step": 48280 }, { "epoch": 7.1916890080428955, "grad_norm": 1.1670387983322144, "learning_rate": 4.0420608272558566e-05, "loss": 0.6288, "num_input_tokens_seen": 28023840, "step": 48285 }, { "epoch": 7.192433720583854, "grad_norm": 1.2224080562591553, "learning_rate": 4.0418050519925386e-05, "loss": 0.733, "num_input_tokens_seen": 28026624, "step": 48290 }, { "epoch": 7.193178433124814, "grad_norm": 1.1661622524261475, "learning_rate": 4.041549250681992e-05, "loss": 0.6076, "num_input_tokens_seen": 28029376, "step": 48295 }, { "epoch": 7.193923145665773, "grad_norm": 1.1250332593917847, "learning_rate": 4.041293423328537e-05, "loss": 0.6341, "num_input_tokens_seen": 28032704, "step": 48300 }, { "epoch": 7.194667858206732, "grad_norm": 1.3084003925323486, "learning_rate": 4.0410375699364964e-05, "loss": 0.5324, "num_input_tokens_seen": 28035392, "step": 48305 }, { "epoch": 7.195412570747691, "grad_norm": 1.5188157558441162, "learning_rate": 4.040781690510193e-05, "loss": 0.5675, "num_input_tokens_seen": 28038080, "step": 48310 }, { "epoch": 7.196157283288651, "grad_norm": 1.0889174938201904, "learning_rate": 4.0405257850539474e-05, "loss": 0.6084, "num_input_tokens_seen": 28041056, "step": 48315 }, { "epoch": 7.196901995829609, "grad_norm": 1.0753543376922607, "learning_rate": 4.040269853572085e-05, "loss": 0.7294, "num_input_tokens_seen": 28044000, "step": 48320 }, { "epoch": 7.197646708370569, "grad_norm": 1.2871594429016113, "learning_rate": 4.0400138960689295e-05, "loss": 0.6351, "num_input_tokens_seen": 28046720, "step": 48325 }, { "epoch": 7.198391420911528, "grad_norm": 0.7275180816650391, "learning_rate": 4.0397579125488036e-05, "loss": 0.5183, "num_input_tokens_seen": 28049600, "step": 48330 }, { "epoch": 7.1991361334524875, "grad_norm": 1.1586416959762573, "learning_rate": 4.039501903016033e-05, "loss": 0.6644, "num_input_tokens_seen": 28052544, "step": 48335 }, { "epoch": 7.199880845993446, "grad_norm": 1.0177096128463745, "learning_rate": 4.039245867474942e-05, "loss": 0.8272, "num_input_tokens_seen": 28055680, "step": 48340 }, { "epoch": 7.200625558534406, "grad_norm": 1.2333712577819824, "learning_rate": 4.038989805929857e-05, "loss": 0.5674, "num_input_tokens_seen": 28058592, "step": 48345 }, { "epoch": 7.201370271075365, "grad_norm": 1.4070643186569214, "learning_rate": 4.0387337183851035e-05, "loss": 0.5231, "num_input_tokens_seen": 28061504, "step": 48350 }, { "epoch": 7.202114983616324, "grad_norm": 1.4895225763320923, "learning_rate": 4.038477604845008e-05, "loss": 0.7733, "num_input_tokens_seen": 28064320, "step": 48355 }, { "epoch": 7.202859696157283, "grad_norm": 1.0478743314743042, "learning_rate": 4.038221465313896e-05, "loss": 0.6935, "num_input_tokens_seen": 28067232, "step": 48360 }, { "epoch": 7.203604408698243, "grad_norm": 1.029220461845398, "learning_rate": 4.037965299796096e-05, "loss": 0.6966, "num_input_tokens_seen": 28070400, "step": 48365 }, { "epoch": 7.2043491212392015, "grad_norm": 1.3065158128738403, "learning_rate": 4.037709108295936e-05, "loss": 0.6597, "num_input_tokens_seen": 28073152, "step": 48370 }, { "epoch": 7.205093833780161, "grad_norm": 1.233228325843811, "learning_rate": 4.037452890817743e-05, "loss": 0.6917, "num_input_tokens_seen": 28075808, "step": 48375 }, { "epoch": 7.20583854632112, "grad_norm": 0.9197698831558228, "learning_rate": 4.0371966473658465e-05, "loss": 0.5212, "num_input_tokens_seen": 28079008, "step": 48380 }, { "epoch": 7.2065832588620795, "grad_norm": 1.2765671014785767, "learning_rate": 4.0369403779445744e-05, "loss": 0.5996, "num_input_tokens_seen": 28081664, "step": 48385 }, { "epoch": 7.207327971403038, "grad_norm": 1.8507106304168701, "learning_rate": 4.0366840825582574e-05, "loss": 0.6697, "num_input_tokens_seen": 28084256, "step": 48390 }, { "epoch": 7.208072683943998, "grad_norm": 1.4051920175552368, "learning_rate": 4.036427761211224e-05, "loss": 0.6656, "num_input_tokens_seen": 28087136, "step": 48395 }, { "epoch": 7.208817396484957, "grad_norm": 1.0369771718978882, "learning_rate": 4.0361714139078055e-05, "loss": 0.7888, "num_input_tokens_seen": 28090112, "step": 48400 }, { "epoch": 7.209562109025916, "grad_norm": 1.6745860576629639, "learning_rate": 4.0359150406523314e-05, "loss": 0.6268, "num_input_tokens_seen": 28092736, "step": 48405 }, { "epoch": 7.210306821566875, "grad_norm": 1.0265141725540161, "learning_rate": 4.0356586414491345e-05, "loss": 0.7287, "num_input_tokens_seen": 28095616, "step": 48410 }, { "epoch": 7.211051534107835, "grad_norm": 1.8240243196487427, "learning_rate": 4.035402216302546e-05, "loss": 0.6989, "num_input_tokens_seen": 28098400, "step": 48415 }, { "epoch": 7.2117962466487935, "grad_norm": 0.8195840120315552, "learning_rate": 4.035145765216897e-05, "loss": 0.6018, "num_input_tokens_seen": 28101280, "step": 48420 }, { "epoch": 7.212540959189753, "grad_norm": 0.8748961091041565, "learning_rate": 4.03488928819652e-05, "loss": 0.5181, "num_input_tokens_seen": 28104416, "step": 48425 }, { "epoch": 7.213285671730712, "grad_norm": 1.2515519857406616, "learning_rate": 4.03463278524575e-05, "loss": 0.5664, "num_input_tokens_seen": 28107040, "step": 48430 }, { "epoch": 7.2140303842716715, "grad_norm": 2.9044642448425293, "learning_rate": 4.034376256368917e-05, "loss": 0.6646, "num_input_tokens_seen": 28109760, "step": 48435 }, { "epoch": 7.21477509681263, "grad_norm": 1.8582009077072144, "learning_rate": 4.034119701570358e-05, "loss": 0.7383, "num_input_tokens_seen": 28112608, "step": 48440 }, { "epoch": 7.21551980935359, "grad_norm": 0.9959964752197266, "learning_rate": 4.033863120854405e-05, "loss": 0.6547, "num_input_tokens_seen": 28115616, "step": 48445 }, { "epoch": 7.216264521894549, "grad_norm": 1.095535397529602, "learning_rate": 4.0336065142253945e-05, "loss": 0.4722, "num_input_tokens_seen": 28118464, "step": 48450 }, { "epoch": 7.217009234435508, "grad_norm": 0.9522311091423035, "learning_rate": 4.03334988168766e-05, "loss": 0.5592, "num_input_tokens_seen": 28121312, "step": 48455 }, { "epoch": 7.217753946976467, "grad_norm": 1.0895020961761475, "learning_rate": 4.0330932232455376e-05, "loss": 0.5188, "num_input_tokens_seen": 28124256, "step": 48460 }, { "epoch": 7.218498659517426, "grad_norm": 1.3848174810409546, "learning_rate": 4.0328365389033636e-05, "loss": 0.4645, "num_input_tokens_seen": 28127040, "step": 48465 }, { "epoch": 7.2192433720583855, "grad_norm": 1.522886872291565, "learning_rate": 4.0325798286654734e-05, "loss": 0.7636, "num_input_tokens_seen": 28129920, "step": 48470 }, { "epoch": 7.219988084599344, "grad_norm": 1.6543430089950562, "learning_rate": 4.032323092536206e-05, "loss": 0.7201, "num_input_tokens_seen": 28132672, "step": 48475 }, { "epoch": 7.220732797140304, "grad_norm": 1.3830938339233398, "learning_rate": 4.032066330519896e-05, "loss": 0.5721, "num_input_tokens_seen": 28135392, "step": 48480 }, { "epoch": 7.221477509681263, "grad_norm": 1.191519856452942, "learning_rate": 4.0318095426208835e-05, "loss": 0.6548, "num_input_tokens_seen": 28138144, "step": 48485 }, { "epoch": 7.222222222222222, "grad_norm": 0.9106963872909546, "learning_rate": 4.031552728843505e-05, "loss": 0.7102, "num_input_tokens_seen": 28140992, "step": 48490 }, { "epoch": 7.222966934763181, "grad_norm": 1.1219916343688965, "learning_rate": 4.0312958891921e-05, "loss": 0.6199, "num_input_tokens_seen": 28143936, "step": 48495 }, { "epoch": 7.223711647304141, "grad_norm": 1.0010179281234741, "learning_rate": 4.031039023671007e-05, "loss": 0.5639, "num_input_tokens_seen": 28146912, "step": 48500 }, { "epoch": 7.2244563598450995, "grad_norm": 2.0582125186920166, "learning_rate": 4.0307821322845664e-05, "loss": 0.5625, "num_input_tokens_seen": 28149792, "step": 48505 }, { "epoch": 7.225201072386059, "grad_norm": 0.9669979810714722, "learning_rate": 4.0305252150371175e-05, "loss": 0.6752, "num_input_tokens_seen": 28152864, "step": 48510 }, { "epoch": 7.225945784927018, "grad_norm": 3.7606284618377686, "learning_rate": 4.030268271933e-05, "loss": 0.7286, "num_input_tokens_seen": 28155744, "step": 48515 }, { "epoch": 7.2266904974679775, "grad_norm": 1.0417338609695435, "learning_rate": 4.030011302976555e-05, "loss": 0.6283, "num_input_tokens_seen": 28158816, "step": 48520 }, { "epoch": 7.227435210008936, "grad_norm": 1.0458606481552124, "learning_rate": 4.0297543081721254e-05, "loss": 0.636, "num_input_tokens_seen": 28161696, "step": 48525 }, { "epoch": 7.228179922549896, "grad_norm": 1.5956487655639648, "learning_rate": 4.02949728752405e-05, "loss": 0.5983, "num_input_tokens_seen": 28164448, "step": 48530 }, { "epoch": 7.228924635090855, "grad_norm": 0.7649865746498108, "learning_rate": 4.0292402410366734e-05, "loss": 0.5867, "num_input_tokens_seen": 28167264, "step": 48535 }, { "epoch": 7.229669347631814, "grad_norm": 2.085442543029785, "learning_rate": 4.0289831687143376e-05, "loss": 0.6607, "num_input_tokens_seen": 28170176, "step": 48540 }, { "epoch": 7.230414060172773, "grad_norm": 0.8837016820907593, "learning_rate": 4.028726070561385e-05, "loss": 0.6293, "num_input_tokens_seen": 28173056, "step": 48545 }, { "epoch": 7.231158772713733, "grad_norm": 0.9515650868415833, "learning_rate": 4.028468946582158e-05, "loss": 0.6885, "num_input_tokens_seen": 28176224, "step": 48550 }, { "epoch": 7.2319034852546915, "grad_norm": 0.8701418042182922, "learning_rate": 4.028211796781003e-05, "loss": 0.5802, "num_input_tokens_seen": 28179104, "step": 48555 }, { "epoch": 7.232648197795651, "grad_norm": 2.048285961151123, "learning_rate": 4.027954621162262e-05, "loss": 0.5848, "num_input_tokens_seen": 28182016, "step": 48560 }, { "epoch": 7.23339291033661, "grad_norm": 1.4604053497314453, "learning_rate": 4.027697419730281e-05, "loss": 0.6479, "num_input_tokens_seen": 28184832, "step": 48565 }, { "epoch": 7.23413762287757, "grad_norm": 1.1628021001815796, "learning_rate": 4.027440192489404e-05, "loss": 0.4568, "num_input_tokens_seen": 28187616, "step": 48570 }, { "epoch": 7.234882335418528, "grad_norm": 1.0436099767684937, "learning_rate": 4.0271829394439786e-05, "loss": 0.6272, "num_input_tokens_seen": 28190464, "step": 48575 }, { "epoch": 7.235627047959488, "grad_norm": 1.8540254831314087, "learning_rate": 4.026925660598349e-05, "loss": 0.6103, "num_input_tokens_seen": 28193248, "step": 48580 }, { "epoch": 7.236371760500447, "grad_norm": 1.1007918119430542, "learning_rate": 4.0266683559568625e-05, "loss": 0.6361, "num_input_tokens_seen": 28196128, "step": 48585 }, { "epoch": 7.237116473041406, "grad_norm": 1.0455106496810913, "learning_rate": 4.0264110255238654e-05, "loss": 0.726, "num_input_tokens_seen": 28199040, "step": 48590 }, { "epoch": 7.237861185582365, "grad_norm": 1.0924538373947144, "learning_rate": 4.026153669303706e-05, "loss": 0.6024, "num_input_tokens_seen": 28202368, "step": 48595 }, { "epoch": 7.238605898123325, "grad_norm": 1.5475304126739502, "learning_rate": 4.0258962873007305e-05, "loss": 0.668, "num_input_tokens_seen": 28205312, "step": 48600 }, { "epoch": 7.2393506106642835, "grad_norm": 1.0185003280639648, "learning_rate": 4.025638879519289e-05, "loss": 0.6125, "num_input_tokens_seen": 28208320, "step": 48605 }, { "epoch": 7.240095323205243, "grad_norm": 1.9337390661239624, "learning_rate": 4.025381445963728e-05, "loss": 0.7894, "num_input_tokens_seen": 28211168, "step": 48610 }, { "epoch": 7.240840035746202, "grad_norm": 0.8726973533630371, "learning_rate": 4.025123986638399e-05, "loss": 0.6233, "num_input_tokens_seen": 28214080, "step": 48615 }, { "epoch": 7.241584748287162, "grad_norm": 0.9418549537658691, "learning_rate": 4.02486650154765e-05, "loss": 0.5326, "num_input_tokens_seen": 28216832, "step": 48620 }, { "epoch": 7.24232946082812, "grad_norm": 1.3058502674102783, "learning_rate": 4.0246089906958317e-05, "loss": 0.6051, "num_input_tokens_seen": 28219648, "step": 48625 }, { "epoch": 7.243074173369079, "grad_norm": 1.2492676973342896, "learning_rate": 4.024351454087293e-05, "loss": 0.7375, "num_input_tokens_seen": 28222496, "step": 48630 }, { "epoch": 7.243818885910039, "grad_norm": 1.6476478576660156, "learning_rate": 4.0240938917263864e-05, "loss": 0.5708, "num_input_tokens_seen": 28225984, "step": 48635 }, { "epoch": 7.2445635984509975, "grad_norm": 1.6234270334243774, "learning_rate": 4.0238363036174625e-05, "loss": 0.4811, "num_input_tokens_seen": 28228992, "step": 48640 }, { "epoch": 7.245308310991957, "grad_norm": 1.041080117225647, "learning_rate": 4.023578689764873e-05, "loss": 0.4892, "num_input_tokens_seen": 28232288, "step": 48645 }, { "epoch": 7.246053023532916, "grad_norm": 0.7759391069412231, "learning_rate": 4.02332105017297e-05, "loss": 0.4508, "num_input_tokens_seen": 28235168, "step": 48650 }, { "epoch": 7.246797736073876, "grad_norm": 0.9822921752929688, "learning_rate": 4.0230633848461056e-05, "loss": 0.7776, "num_input_tokens_seen": 28238368, "step": 48655 }, { "epoch": 7.247542448614834, "grad_norm": 1.1173065900802612, "learning_rate": 4.022805693788634e-05, "loss": 0.4947, "num_input_tokens_seen": 28241376, "step": 48660 }, { "epoch": 7.248287161155794, "grad_norm": 1.3815654516220093, "learning_rate": 4.0225479770049076e-05, "loss": 0.5695, "num_input_tokens_seen": 28244384, "step": 48665 }, { "epoch": 7.249031873696753, "grad_norm": 1.660394310951233, "learning_rate": 4.022290234499281e-05, "loss": 0.5907, "num_input_tokens_seen": 28247168, "step": 48670 }, { "epoch": 7.249776586237712, "grad_norm": 1.269890308380127, "learning_rate": 4.0220324662761076e-05, "loss": 0.6166, "num_input_tokens_seen": 28250048, "step": 48675 }, { "epoch": 7.250521298778671, "grad_norm": 0.9637452363967896, "learning_rate": 4.021774672339743e-05, "loss": 0.6329, "num_input_tokens_seen": 28253120, "step": 48680 }, { "epoch": 7.251266011319631, "grad_norm": 1.32668936252594, "learning_rate": 4.021516852694541e-05, "loss": 0.5336, "num_input_tokens_seen": 28256064, "step": 48685 }, { "epoch": 7.2520107238605895, "grad_norm": 1.2442916631698608, "learning_rate": 4.021259007344859e-05, "loss": 0.589, "num_input_tokens_seen": 28258912, "step": 48690 }, { "epoch": 7.252755436401549, "grad_norm": 1.412200927734375, "learning_rate": 4.021001136295052e-05, "loss": 0.6703, "num_input_tokens_seen": 28261792, "step": 48695 }, { "epoch": 7.253500148942508, "grad_norm": 0.98642498254776, "learning_rate": 4.020743239549477e-05, "loss": 0.7484, "num_input_tokens_seen": 28264768, "step": 48700 }, { "epoch": 7.254244861483468, "grad_norm": 1.988595962524414, "learning_rate": 4.0204853171124904e-05, "loss": 0.8165, "num_input_tokens_seen": 28268160, "step": 48705 }, { "epoch": 7.254989574024426, "grad_norm": 2.1871252059936523, "learning_rate": 4.0202273689884496e-05, "loss": 0.6603, "num_input_tokens_seen": 28270784, "step": 48710 }, { "epoch": 7.255734286565386, "grad_norm": 2.351332426071167, "learning_rate": 4.019969395181713e-05, "loss": 0.7865, "num_input_tokens_seen": 28273536, "step": 48715 }, { "epoch": 7.256478999106345, "grad_norm": 1.6979283094406128, "learning_rate": 4.0197113956966376e-05, "loss": 0.5252, "num_input_tokens_seen": 28276512, "step": 48720 }, { "epoch": 7.257223711647304, "grad_norm": 1.5540039539337158, "learning_rate": 4.019453370537583e-05, "loss": 0.6052, "num_input_tokens_seen": 28279552, "step": 48725 }, { "epoch": 7.257968424188263, "grad_norm": 1.534636378288269, "learning_rate": 4.019195319708908e-05, "loss": 0.6282, "num_input_tokens_seen": 28282240, "step": 48730 }, { "epoch": 7.258713136729223, "grad_norm": 1.6317181587219238, "learning_rate": 4.018937243214972e-05, "loss": 0.606, "num_input_tokens_seen": 28284896, "step": 48735 }, { "epoch": 7.259457849270182, "grad_norm": 2.6099905967712402, "learning_rate": 4.018679141060136e-05, "loss": 0.6534, "num_input_tokens_seen": 28287808, "step": 48740 }, { "epoch": 7.260202561811141, "grad_norm": 1.0330013036727905, "learning_rate": 4.0184210132487576e-05, "loss": 0.5728, "num_input_tokens_seen": 28290528, "step": 48745 }, { "epoch": 7.2609472743521, "grad_norm": 0.9723738431930542, "learning_rate": 4.018162859785201e-05, "loss": 0.5244, "num_input_tokens_seen": 28293664, "step": 48750 }, { "epoch": 7.26169198689306, "grad_norm": 0.5811389684677124, "learning_rate": 4.017904680673825e-05, "loss": 0.4777, "num_input_tokens_seen": 28296640, "step": 48755 }, { "epoch": 7.262436699434018, "grad_norm": 1.2022879123687744, "learning_rate": 4.0176464759189924e-05, "loss": 0.5609, "num_input_tokens_seen": 28299584, "step": 48760 }, { "epoch": 7.263181411974978, "grad_norm": 0.7700329422950745, "learning_rate": 4.017388245525065e-05, "loss": 0.5824, "num_input_tokens_seen": 28302400, "step": 48765 }, { "epoch": 7.263926124515937, "grad_norm": 1.2615031003952026, "learning_rate": 4.017129989496405e-05, "loss": 0.7041, "num_input_tokens_seen": 28305504, "step": 48770 }, { "epoch": 7.264670837056896, "grad_norm": 0.9364359378814697, "learning_rate": 4.0168717078373763e-05, "loss": 0.5047, "num_input_tokens_seen": 28308480, "step": 48775 }, { "epoch": 7.265415549597855, "grad_norm": 1.7345507144927979, "learning_rate": 4.016613400552342e-05, "loss": 0.7182, "num_input_tokens_seen": 28311584, "step": 48780 }, { "epoch": 7.266160262138815, "grad_norm": 2.0080647468566895, "learning_rate": 4.016355067645666e-05, "loss": 0.5178, "num_input_tokens_seen": 28315168, "step": 48785 }, { "epoch": 7.266904974679774, "grad_norm": 0.9443149566650391, "learning_rate": 4.0160967091217114e-05, "loss": 0.5961, "num_input_tokens_seen": 28318048, "step": 48790 }, { "epoch": 7.267649687220732, "grad_norm": 0.8678172826766968, "learning_rate": 4.015838324984844e-05, "loss": 0.5392, "num_input_tokens_seen": 28320800, "step": 48795 }, { "epoch": 7.268394399761692, "grad_norm": 1.4295357465744019, "learning_rate": 4.015579915239429e-05, "loss": 0.5659, "num_input_tokens_seen": 28323584, "step": 48800 }, { "epoch": 7.269139112302652, "grad_norm": 1.0606240034103394, "learning_rate": 4.015321479889832e-05, "loss": 0.8891, "num_input_tokens_seen": 28326368, "step": 48805 }, { "epoch": 7.26988382484361, "grad_norm": 1.9529328346252441, "learning_rate": 4.015063018940418e-05, "loss": 0.614, "num_input_tokens_seen": 28328960, "step": 48810 }, { "epoch": 7.270628537384569, "grad_norm": 1.5502334833145142, "learning_rate": 4.014804532395554e-05, "loss": 0.751, "num_input_tokens_seen": 28331936, "step": 48815 }, { "epoch": 7.271373249925529, "grad_norm": 1.3472853899002075, "learning_rate": 4.014546020259607e-05, "loss": 0.6656, "num_input_tokens_seen": 28334976, "step": 48820 }, { "epoch": 7.272117962466488, "grad_norm": 1.7404838800430298, "learning_rate": 4.014287482536945e-05, "loss": 0.6434, "num_input_tokens_seen": 28337888, "step": 48825 }, { "epoch": 7.272862675007447, "grad_norm": 1.1488434076309204, "learning_rate": 4.0140289192319355e-05, "loss": 0.6657, "num_input_tokens_seen": 28340896, "step": 48830 }, { "epoch": 7.273607387548406, "grad_norm": 1.3339710235595703, "learning_rate": 4.013770330348945e-05, "loss": 0.6107, "num_input_tokens_seen": 28343616, "step": 48835 }, { "epoch": 7.274352100089366, "grad_norm": 1.1403741836547852, "learning_rate": 4.013511715892344e-05, "loss": 0.7842, "num_input_tokens_seen": 28346368, "step": 48840 }, { "epoch": 7.275096812630324, "grad_norm": 0.9061501026153564, "learning_rate": 4.0132530758665006e-05, "loss": 0.5416, "num_input_tokens_seen": 28349344, "step": 48845 }, { "epoch": 7.275841525171284, "grad_norm": 0.8137478232383728, "learning_rate": 4.0129944102757847e-05, "loss": 0.6639, "num_input_tokens_seen": 28352768, "step": 48850 }, { "epoch": 7.276586237712243, "grad_norm": 1.1492269039154053, "learning_rate": 4.0127357191245654e-05, "loss": 0.5369, "num_input_tokens_seen": 28355360, "step": 48855 }, { "epoch": 7.277330950253202, "grad_norm": 0.990628719329834, "learning_rate": 4.0124770024172135e-05, "loss": 0.7007, "num_input_tokens_seen": 28358400, "step": 48860 }, { "epoch": 7.278075662794161, "grad_norm": 1.0655592679977417, "learning_rate": 4.0122182601581005e-05, "loss": 0.5481, "num_input_tokens_seen": 28361344, "step": 48865 }, { "epoch": 7.278820375335121, "grad_norm": 1.0442783832550049, "learning_rate": 4.011959492351597e-05, "loss": 0.6208, "num_input_tokens_seen": 28364512, "step": 48870 }, { "epoch": 7.27956508787608, "grad_norm": 1.2236624956130981, "learning_rate": 4.011700699002075e-05, "loss": 0.6307, "num_input_tokens_seen": 28367264, "step": 48875 }, { "epoch": 7.280309800417039, "grad_norm": 0.9568905830383301, "learning_rate": 4.011441880113905e-05, "loss": 0.8658, "num_input_tokens_seen": 28370240, "step": 48880 }, { "epoch": 7.281054512957998, "grad_norm": 1.0297883749008179, "learning_rate": 4.0111830356914605e-05, "loss": 0.5762, "num_input_tokens_seen": 28373216, "step": 48885 }, { "epoch": 7.281799225498958, "grad_norm": 2.1625068187713623, "learning_rate": 4.010924165739115e-05, "loss": 0.7778, "num_input_tokens_seen": 28375872, "step": 48890 }, { "epoch": 7.282543938039916, "grad_norm": 0.9265454411506653, "learning_rate": 4.0106652702612416e-05, "loss": 0.6969, "num_input_tokens_seen": 28378912, "step": 48895 }, { "epoch": 7.283288650580876, "grad_norm": 1.085627555847168, "learning_rate": 4.010406349262214e-05, "loss": 0.6244, "num_input_tokens_seen": 28381728, "step": 48900 }, { "epoch": 7.284033363121835, "grad_norm": 1.2895475625991821, "learning_rate": 4.010147402746405e-05, "loss": 0.739, "num_input_tokens_seen": 28384832, "step": 48905 }, { "epoch": 7.2847780756627944, "grad_norm": 3.876129388809204, "learning_rate": 4.009888430718192e-05, "loss": 0.6072, "num_input_tokens_seen": 28387520, "step": 48910 }, { "epoch": 7.285522788203753, "grad_norm": 0.838741660118103, "learning_rate": 4.009629433181947e-05, "loss": 0.4685, "num_input_tokens_seen": 28390816, "step": 48915 }, { "epoch": 7.286267500744713, "grad_norm": 1.2270846366882324, "learning_rate": 4.009370410142049e-05, "loss": 0.4876, "num_input_tokens_seen": 28393984, "step": 48920 }, { "epoch": 7.287012213285672, "grad_norm": 0.7327113747596741, "learning_rate": 4.00911136160287e-05, "loss": 0.713, "num_input_tokens_seen": 28396736, "step": 48925 }, { "epoch": 7.287756925826631, "grad_norm": 2.8589346408843994, "learning_rate": 4.00885228756879e-05, "loss": 0.6995, "num_input_tokens_seen": 28399680, "step": 48930 }, { "epoch": 7.28850163836759, "grad_norm": 1.0650919675827026, "learning_rate": 4.008593188044183e-05, "loss": 0.7551, "num_input_tokens_seen": 28402336, "step": 48935 }, { "epoch": 7.28924635090855, "grad_norm": 1.635216474533081, "learning_rate": 4.008334063033428e-05, "loss": 0.4541, "num_input_tokens_seen": 28404928, "step": 48940 }, { "epoch": 7.289991063449508, "grad_norm": 1.11054265499115, "learning_rate": 4.008074912540901e-05, "loss": 0.6549, "num_input_tokens_seen": 28407936, "step": 48945 }, { "epoch": 7.290735775990468, "grad_norm": 1.277724266052246, "learning_rate": 4.0078157365709823e-05, "loss": 0.6551, "num_input_tokens_seen": 28410816, "step": 48950 }, { "epoch": 7.291480488531427, "grad_norm": 0.9166579246520996, "learning_rate": 4.0075565351280485e-05, "loss": 0.545, "num_input_tokens_seen": 28413824, "step": 48955 }, { "epoch": 7.292225201072386, "grad_norm": 1.0504177808761597, "learning_rate": 4.00729730821648e-05, "loss": 0.6838, "num_input_tokens_seen": 28416928, "step": 48960 }, { "epoch": 7.292969913613345, "grad_norm": 1.093178391456604, "learning_rate": 4.007038055840654e-05, "loss": 0.6469, "num_input_tokens_seen": 28420000, "step": 48965 }, { "epoch": 7.293714626154305, "grad_norm": 0.9702379703521729, "learning_rate": 4.0067787780049535e-05, "loss": 0.6477, "num_input_tokens_seen": 28422976, "step": 48970 }, { "epoch": 7.294459338695264, "grad_norm": 0.6003273725509644, "learning_rate": 4.0065194747137555e-05, "loss": 0.5943, "num_input_tokens_seen": 28426208, "step": 48975 }, { "epoch": 7.295204051236222, "grad_norm": 1.5289920568466187, "learning_rate": 4.006260145971443e-05, "loss": 0.6295, "num_input_tokens_seen": 28428992, "step": 48980 }, { "epoch": 7.295948763777182, "grad_norm": 1.817901611328125, "learning_rate": 4.006000791782396e-05, "loss": 0.4837, "num_input_tokens_seen": 28431840, "step": 48985 }, { "epoch": 7.296693476318141, "grad_norm": 1.276180624961853, "learning_rate": 4.0057414121509965e-05, "loss": 0.5754, "num_input_tokens_seen": 28435008, "step": 48990 }, { "epoch": 7.2974381888591004, "grad_norm": 1.4451295137405396, "learning_rate": 4.005482007081626e-05, "loss": 0.506, "num_input_tokens_seen": 28438176, "step": 48995 }, { "epoch": 7.298182901400059, "grad_norm": 0.8882007002830505, "learning_rate": 4.005222576578667e-05, "loss": 0.7119, "num_input_tokens_seen": 28441472, "step": 49000 }, { "epoch": 7.298927613941019, "grad_norm": 1.4985634088516235, "learning_rate": 4.004963120646502e-05, "loss": 0.5936, "num_input_tokens_seen": 28444288, "step": 49005 }, { "epoch": 7.299672326481978, "grad_norm": 1.186753749847412, "learning_rate": 4.004703639289515e-05, "loss": 0.5985, "num_input_tokens_seen": 28446944, "step": 49010 }, { "epoch": 7.300417039022937, "grad_norm": 1.074398398399353, "learning_rate": 4.004444132512089e-05, "loss": 0.6954, "num_input_tokens_seen": 28449888, "step": 49015 }, { "epoch": 7.301161751563896, "grad_norm": 0.8298999667167664, "learning_rate": 4.004184600318609e-05, "loss": 0.6492, "num_input_tokens_seen": 28452768, "step": 49020 }, { "epoch": 7.301906464104856, "grad_norm": 1.0073164701461792, "learning_rate": 4.003925042713459e-05, "loss": 0.6266, "num_input_tokens_seen": 28455872, "step": 49025 }, { "epoch": 7.302651176645814, "grad_norm": 2.086637496948242, "learning_rate": 4.003665459701024e-05, "loss": 0.7826, "num_input_tokens_seen": 28458880, "step": 49030 }, { "epoch": 7.303395889186774, "grad_norm": 0.9487127065658569, "learning_rate": 4.003405851285689e-05, "loss": 0.4813, "num_input_tokens_seen": 28461696, "step": 49035 }, { "epoch": 7.304140601727733, "grad_norm": 1.5227386951446533, "learning_rate": 4.00314621747184e-05, "loss": 0.6036, "num_input_tokens_seen": 28464768, "step": 49040 }, { "epoch": 7.3048853142686925, "grad_norm": 1.0612435340881348, "learning_rate": 4.002886558263863e-05, "loss": 0.6331, "num_input_tokens_seen": 28467552, "step": 49045 }, { "epoch": 7.305630026809651, "grad_norm": 1.4276832342147827, "learning_rate": 4.0026268736661457e-05, "loss": 0.6659, "num_input_tokens_seen": 28470592, "step": 49050 }, { "epoch": 7.306374739350611, "grad_norm": 1.1139472723007202, "learning_rate": 4.002367163683075e-05, "loss": 0.4606, "num_input_tokens_seen": 28473376, "step": 49055 }, { "epoch": 7.30711945189157, "grad_norm": 0.7524638772010803, "learning_rate": 4.002107428319037e-05, "loss": 0.5264, "num_input_tokens_seen": 28476160, "step": 49060 }, { "epoch": 7.307864164432529, "grad_norm": 1.0070316791534424, "learning_rate": 4.0018476675784214e-05, "loss": 0.6226, "num_input_tokens_seen": 28478816, "step": 49065 }, { "epoch": 7.308608876973488, "grad_norm": 1.533529281616211, "learning_rate": 4.001587881465616e-05, "loss": 0.7544, "num_input_tokens_seen": 28481920, "step": 49070 }, { "epoch": 7.309353589514448, "grad_norm": 0.905087947845459, "learning_rate": 4.001328069985009e-05, "loss": 0.6361, "num_input_tokens_seen": 28484704, "step": 49075 }, { "epoch": 7.3100983020554064, "grad_norm": 0.7732389569282532, "learning_rate": 4.00106823314099e-05, "loss": 0.5843, "num_input_tokens_seen": 28487616, "step": 49080 }, { "epoch": 7.310843014596366, "grad_norm": 0.9596510529518127, "learning_rate": 4.0008083709379496e-05, "loss": 0.5794, "num_input_tokens_seen": 28490656, "step": 49085 }, { "epoch": 7.311587727137325, "grad_norm": 1.0452309846878052, "learning_rate": 4.0005484833802765e-05, "loss": 0.7804, "num_input_tokens_seen": 28493888, "step": 49090 }, { "epoch": 7.3123324396782845, "grad_norm": 0.8707378506660461, "learning_rate": 4.0002885704723614e-05, "loss": 0.5947, "num_input_tokens_seen": 28496864, "step": 49095 }, { "epoch": 7.313077152219243, "grad_norm": 0.6338812708854675, "learning_rate": 4.000028632218596e-05, "loss": 0.5815, "num_input_tokens_seen": 28499648, "step": 49100 }, { "epoch": 7.313821864760203, "grad_norm": 1.0679370164871216, "learning_rate": 3.9997686686233724e-05, "loss": 0.6706, "num_input_tokens_seen": 28502592, "step": 49105 }, { "epoch": 7.314566577301162, "grad_norm": 1.6324787139892578, "learning_rate": 3.999508679691081e-05, "loss": 0.6829, "num_input_tokens_seen": 28505344, "step": 49110 }, { "epoch": 7.315311289842121, "grad_norm": 2.2577145099639893, "learning_rate": 3.999248665426114e-05, "loss": 0.7799, "num_input_tokens_seen": 28508032, "step": 49115 }, { "epoch": 7.31605600238308, "grad_norm": 0.8493408560752869, "learning_rate": 3.998988625832865e-05, "loss": 0.7559, "num_input_tokens_seen": 28511136, "step": 49120 }, { "epoch": 7.31680071492404, "grad_norm": 0.9597095251083374, "learning_rate": 3.998728560915726e-05, "loss": 0.7147, "num_input_tokens_seen": 28513888, "step": 49125 }, { "epoch": 7.3175454274649985, "grad_norm": 1.0962265729904175, "learning_rate": 3.9984684706790915e-05, "loss": 0.5513, "num_input_tokens_seen": 28517024, "step": 49130 }, { "epoch": 7.318290140005958, "grad_norm": 0.5342617034912109, "learning_rate": 3.998208355127355e-05, "loss": 0.6334, "num_input_tokens_seen": 28519680, "step": 49135 }, { "epoch": 7.319034852546917, "grad_norm": 1.080033540725708, "learning_rate": 3.997948214264911e-05, "loss": 0.5628, "num_input_tokens_seen": 28522752, "step": 49140 }, { "epoch": 7.319779565087876, "grad_norm": 1.466130018234253, "learning_rate": 3.9976880480961556e-05, "loss": 0.6, "num_input_tokens_seen": 28525824, "step": 49145 }, { "epoch": 7.320524277628835, "grad_norm": 0.8064078092575073, "learning_rate": 3.997427856625482e-05, "loss": 0.6312, "num_input_tokens_seen": 28528864, "step": 49150 }, { "epoch": 7.321268990169794, "grad_norm": 1.5985054969787598, "learning_rate": 3.997167639857287e-05, "loss": 0.6666, "num_input_tokens_seen": 28531680, "step": 49155 }, { "epoch": 7.322013702710754, "grad_norm": 0.8288558721542358, "learning_rate": 3.996907397795966e-05, "loss": 0.5222, "num_input_tokens_seen": 28534432, "step": 49160 }, { "epoch": 7.3227584152517124, "grad_norm": 1.102872610092163, "learning_rate": 3.9966471304459154e-05, "loss": 0.5376, "num_input_tokens_seen": 28537440, "step": 49165 }, { "epoch": 7.323503127792672, "grad_norm": 1.335716724395752, "learning_rate": 3.996386837811533e-05, "loss": 0.7559, "num_input_tokens_seen": 28540608, "step": 49170 }, { "epoch": 7.324247840333631, "grad_norm": 1.3343613147735596, "learning_rate": 3.996126519897216e-05, "loss": 0.532, "num_input_tokens_seen": 28543520, "step": 49175 }, { "epoch": 7.3249925528745905, "grad_norm": 1.1773091554641724, "learning_rate": 3.995866176707363e-05, "loss": 0.6531, "num_input_tokens_seen": 28546400, "step": 49180 }, { "epoch": 7.325737265415549, "grad_norm": 1.3600677251815796, "learning_rate": 3.99560580824637e-05, "loss": 0.6456, "num_input_tokens_seen": 28549408, "step": 49185 }, { "epoch": 7.326481977956509, "grad_norm": 1.6797157526016235, "learning_rate": 3.995345414518638e-05, "loss": 0.7911, "num_input_tokens_seen": 28552416, "step": 49190 }, { "epoch": 7.327226690497468, "grad_norm": 1.8333591222763062, "learning_rate": 3.995084995528563e-05, "loss": 0.5566, "num_input_tokens_seen": 28555392, "step": 49195 }, { "epoch": 7.327971403038427, "grad_norm": 1.4084770679473877, "learning_rate": 3.9948245512805484e-05, "loss": 0.6482, "num_input_tokens_seen": 28558336, "step": 49200 }, { "epoch": 7.328716115579386, "grad_norm": 1.4074896574020386, "learning_rate": 3.994564081778992e-05, "loss": 0.7674, "num_input_tokens_seen": 28561056, "step": 49205 }, { "epoch": 7.329460828120346, "grad_norm": 0.8678993582725525, "learning_rate": 3.994303587028294e-05, "loss": 0.6579, "num_input_tokens_seen": 28563840, "step": 49210 }, { "epoch": 7.3302055406613045, "grad_norm": 1.4300023317337036, "learning_rate": 3.9940430670328556e-05, "loss": 0.5501, "num_input_tokens_seen": 28566624, "step": 49215 }, { "epoch": 7.330950253202264, "grad_norm": 0.9671693444252014, "learning_rate": 3.993782521797078e-05, "loss": 0.5481, "num_input_tokens_seen": 28569344, "step": 49220 }, { "epoch": 7.331694965743223, "grad_norm": 1.116416096687317, "learning_rate": 3.993521951325363e-05, "loss": 0.6543, "num_input_tokens_seen": 28572096, "step": 49225 }, { "epoch": 7.3324396782841825, "grad_norm": 1.5685759782791138, "learning_rate": 3.993261355622113e-05, "loss": 0.4889, "num_input_tokens_seen": 28574944, "step": 49230 }, { "epoch": 7.333184390825141, "grad_norm": 0.9352118372917175, "learning_rate": 3.99300073469173e-05, "loss": 0.5344, "num_input_tokens_seen": 28577664, "step": 49235 }, { "epoch": 7.333929103366101, "grad_norm": 0.8061864376068115, "learning_rate": 3.9927400885386165e-05, "loss": 0.6348, "num_input_tokens_seen": 28580448, "step": 49240 }, { "epoch": 7.33467381590706, "grad_norm": 1.0599392652511597, "learning_rate": 3.992479417167177e-05, "loss": 0.7403, "num_input_tokens_seen": 28583328, "step": 49245 }, { "epoch": 7.335418528448019, "grad_norm": 1.9005155563354492, "learning_rate": 3.992218720581814e-05, "loss": 0.5831, "num_input_tokens_seen": 28586080, "step": 49250 }, { "epoch": 7.336163240988978, "grad_norm": 2.1452925205230713, "learning_rate": 3.9919579987869324e-05, "loss": 0.5847, "num_input_tokens_seen": 28589280, "step": 49255 }, { "epoch": 7.336907953529938, "grad_norm": 1.3142582178115845, "learning_rate": 3.991697251786938e-05, "loss": 0.7055, "num_input_tokens_seen": 28592224, "step": 49260 }, { "epoch": 7.3376526660708965, "grad_norm": 0.9842166304588318, "learning_rate": 3.991436479586233e-05, "loss": 0.5329, "num_input_tokens_seen": 28595168, "step": 49265 }, { "epoch": 7.338397378611856, "grad_norm": 1.3246132135391235, "learning_rate": 3.9911756821892256e-05, "loss": 0.6559, "num_input_tokens_seen": 28598112, "step": 49270 }, { "epoch": 7.339142091152815, "grad_norm": 1.840425729751587, "learning_rate": 3.99091485960032e-05, "loss": 0.7843, "num_input_tokens_seen": 28601152, "step": 49275 }, { "epoch": 7.3398868036937746, "grad_norm": 1.0210374593734741, "learning_rate": 3.990654011823923e-05, "loss": 0.4754, "num_input_tokens_seen": 28604192, "step": 49280 }, { "epoch": 7.340631516234733, "grad_norm": 2.370391607284546, "learning_rate": 3.990393138864442e-05, "loss": 0.6739, "num_input_tokens_seen": 28606976, "step": 49285 }, { "epoch": 7.341376228775693, "grad_norm": 1.5255075693130493, "learning_rate": 3.990132240726284e-05, "loss": 0.735, "num_input_tokens_seen": 28609632, "step": 49290 }, { "epoch": 7.342120941316652, "grad_norm": 1.0253324508666992, "learning_rate": 3.989871317413855e-05, "loss": 0.7059, "num_input_tokens_seen": 28612256, "step": 49295 }, { "epoch": 7.342865653857611, "grad_norm": 1.0873878002166748, "learning_rate": 3.989610368931566e-05, "loss": 0.7005, "num_input_tokens_seen": 28615072, "step": 49300 }, { "epoch": 7.34361036639857, "grad_norm": 1.5022670030593872, "learning_rate": 3.9893493952838226e-05, "loss": 0.6448, "num_input_tokens_seen": 28617856, "step": 49305 }, { "epoch": 7.344355078939529, "grad_norm": 1.3669489622116089, "learning_rate": 3.9890883964750355e-05, "loss": 0.6949, "num_input_tokens_seen": 28620736, "step": 49310 }, { "epoch": 7.3450997914804885, "grad_norm": 0.6541488766670227, "learning_rate": 3.9888273725096126e-05, "loss": 0.7093, "num_input_tokens_seen": 28623520, "step": 49315 }, { "epoch": 7.345844504021448, "grad_norm": 0.8186886310577393, "learning_rate": 3.988566323391965e-05, "loss": 0.7063, "num_input_tokens_seen": 28628000, "step": 49320 }, { "epoch": 7.346589216562407, "grad_norm": 0.8455252051353455, "learning_rate": 3.988305249126502e-05, "loss": 0.5747, "num_input_tokens_seen": 28631072, "step": 49325 }, { "epoch": 7.347333929103366, "grad_norm": 2.790388345718384, "learning_rate": 3.988044149717635e-05, "loss": 0.7151, "num_input_tokens_seen": 28633984, "step": 49330 }, { "epoch": 7.348078641644325, "grad_norm": 1.010124921798706, "learning_rate": 3.987783025169773e-05, "loss": 0.5772, "num_input_tokens_seen": 28636768, "step": 49335 }, { "epoch": 7.348823354185284, "grad_norm": 1.1809676885604858, "learning_rate": 3.987521875487331e-05, "loss": 0.6388, "num_input_tokens_seen": 28639488, "step": 49340 }, { "epoch": 7.349568066726244, "grad_norm": 3.339726686477661, "learning_rate": 3.9872607006747174e-05, "loss": 0.6467, "num_input_tokens_seen": 28642720, "step": 49345 }, { "epoch": 7.3503127792672025, "grad_norm": 0.949600338935852, "learning_rate": 3.986999500736346e-05, "loss": 0.7891, "num_input_tokens_seen": 28645504, "step": 49350 }, { "epoch": 7.351057491808162, "grad_norm": 1.0010243654251099, "learning_rate": 3.98673827567663e-05, "loss": 0.5521, "num_input_tokens_seen": 28648544, "step": 49355 }, { "epoch": 7.351802204349121, "grad_norm": 1.797785997390747, "learning_rate": 3.9864770254999814e-05, "loss": 0.5869, "num_input_tokens_seen": 28651744, "step": 49360 }, { "epoch": 7.3525469168900806, "grad_norm": 1.3996620178222656, "learning_rate": 3.986215750210814e-05, "loss": 0.6026, "num_input_tokens_seen": 28654528, "step": 49365 }, { "epoch": 7.353291629431039, "grad_norm": 1.1251856088638306, "learning_rate": 3.985954449813543e-05, "loss": 0.7152, "num_input_tokens_seen": 28657312, "step": 49370 }, { "epoch": 7.354036341971999, "grad_norm": 1.2694425582885742, "learning_rate": 3.9856931243125804e-05, "loss": 0.7074, "num_input_tokens_seen": 28660544, "step": 49375 }, { "epoch": 7.354781054512958, "grad_norm": 1.0373539924621582, "learning_rate": 3.985431773712344e-05, "loss": 0.6011, "num_input_tokens_seen": 28663488, "step": 49380 }, { "epoch": 7.355525767053917, "grad_norm": 0.8659031987190247, "learning_rate": 3.9851703980172464e-05, "loss": 0.552, "num_input_tokens_seen": 28666304, "step": 49385 }, { "epoch": 7.356270479594876, "grad_norm": 1.6679033041000366, "learning_rate": 3.984908997231704e-05, "loss": 0.6095, "num_input_tokens_seen": 28669408, "step": 49390 }, { "epoch": 7.357015192135836, "grad_norm": 1.2347140312194824, "learning_rate": 3.984647571360135e-05, "loss": 0.5264, "num_input_tokens_seen": 28672160, "step": 49395 }, { "epoch": 7.3577599046767945, "grad_norm": 1.0559691190719604, "learning_rate": 3.9843861204069536e-05, "loss": 0.6076, "num_input_tokens_seen": 28675008, "step": 49400 }, { "epoch": 7.358504617217754, "grad_norm": 1.2722280025482178, "learning_rate": 3.9841246443765765e-05, "loss": 0.5742, "num_input_tokens_seen": 28677728, "step": 49405 }, { "epoch": 7.359249329758713, "grad_norm": 1.4415074586868286, "learning_rate": 3.983863143273422e-05, "loss": 0.5665, "num_input_tokens_seen": 28680352, "step": 49410 }, { "epoch": 7.359994042299673, "grad_norm": 1.1958134174346924, "learning_rate": 3.983601617101909e-05, "loss": 0.5369, "num_input_tokens_seen": 28683264, "step": 49415 }, { "epoch": 7.360738754840631, "grad_norm": 0.8873350620269775, "learning_rate": 3.983340065866453e-05, "loss": 0.61, "num_input_tokens_seen": 28685952, "step": 49420 }, { "epoch": 7.361483467381591, "grad_norm": 1.1016017198562622, "learning_rate": 3.9830784895714744e-05, "loss": 0.5241, "num_input_tokens_seen": 28688864, "step": 49425 }, { "epoch": 7.36222817992255, "grad_norm": 1.814510464668274, "learning_rate": 3.982816888221394e-05, "loss": 0.6289, "num_input_tokens_seen": 28691776, "step": 49430 }, { "epoch": 7.362972892463509, "grad_norm": 1.0268304347991943, "learning_rate": 3.9825552618206274e-05, "loss": 0.7372, "num_input_tokens_seen": 28694464, "step": 49435 }, { "epoch": 7.363717605004468, "grad_norm": 1.3117854595184326, "learning_rate": 3.982293610373597e-05, "loss": 0.6783, "num_input_tokens_seen": 28697440, "step": 49440 }, { "epoch": 7.364462317545428, "grad_norm": 1.3993955850601196, "learning_rate": 3.9820319338847224e-05, "loss": 0.6239, "num_input_tokens_seen": 28700352, "step": 49445 }, { "epoch": 7.3652070300863866, "grad_norm": 1.0867363214492798, "learning_rate": 3.981770232358425e-05, "loss": 0.5934, "num_input_tokens_seen": 28703552, "step": 49450 }, { "epoch": 7.365951742627346, "grad_norm": 1.3185609579086304, "learning_rate": 3.9815085057991254e-05, "loss": 0.7469, "num_input_tokens_seen": 28706496, "step": 49455 }, { "epoch": 7.366696455168305, "grad_norm": 1.6313129663467407, "learning_rate": 3.981246754211244e-05, "loss": 0.6519, "num_input_tokens_seen": 28709312, "step": 49460 }, { "epoch": 7.367441167709265, "grad_norm": 1.1209897994995117, "learning_rate": 3.980984977599206e-05, "loss": 0.4917, "num_input_tokens_seen": 28712224, "step": 49465 }, { "epoch": 7.368185880250223, "grad_norm": 1.1968913078308105, "learning_rate": 3.980723175967431e-05, "loss": 0.5911, "num_input_tokens_seen": 28714880, "step": 49470 }, { "epoch": 7.368930592791183, "grad_norm": 0.7293692231178284, "learning_rate": 3.980461349320344e-05, "loss": 0.5781, "num_input_tokens_seen": 28717952, "step": 49475 }, { "epoch": 7.369675305332142, "grad_norm": 2.3666930198669434, "learning_rate": 3.9801994976623655e-05, "loss": 0.6551, "num_input_tokens_seen": 28720576, "step": 49480 }, { "epoch": 7.370420017873101, "grad_norm": 1.460602045059204, "learning_rate": 3.979937620997922e-05, "loss": 0.5432, "num_input_tokens_seen": 28723488, "step": 49485 }, { "epoch": 7.37116473041406, "grad_norm": 1.6252336502075195, "learning_rate": 3.979675719331437e-05, "loss": 0.6424, "num_input_tokens_seen": 28726176, "step": 49490 }, { "epoch": 7.371909442955019, "grad_norm": 1.8055869340896606, "learning_rate": 3.9794137926673337e-05, "loss": 0.7017, "num_input_tokens_seen": 28728992, "step": 49495 }, { "epoch": 7.372654155495979, "grad_norm": 1.0520161390304565, "learning_rate": 3.979151841010038e-05, "loss": 0.6584, "num_input_tokens_seen": 28732032, "step": 49500 }, { "epoch": 7.373398868036937, "grad_norm": 1.706475019454956, "learning_rate": 3.978889864363975e-05, "loss": 0.6643, "num_input_tokens_seen": 28734944, "step": 49505 }, { "epoch": 7.374143580577897, "grad_norm": 1.6428661346435547, "learning_rate": 3.978627862733572e-05, "loss": 0.5659, "num_input_tokens_seen": 28737920, "step": 49510 }, { "epoch": 7.374888293118856, "grad_norm": 1.5812132358551025, "learning_rate": 3.978365836123254e-05, "loss": 0.789, "num_input_tokens_seen": 28740896, "step": 49515 }, { "epoch": 7.375633005659815, "grad_norm": 1.7479339838027954, "learning_rate": 3.978103784537447e-05, "loss": 0.8205, "num_input_tokens_seen": 28743616, "step": 49520 }, { "epoch": 7.376377718200774, "grad_norm": 2.4817652702331543, "learning_rate": 3.977841707980578e-05, "loss": 0.7034, "num_input_tokens_seen": 28746560, "step": 49525 }, { "epoch": 7.377122430741734, "grad_norm": 1.1558064222335815, "learning_rate": 3.977579606457077e-05, "loss": 0.6341, "num_input_tokens_seen": 28749504, "step": 49530 }, { "epoch": 7.3778671432826926, "grad_norm": 1.998745322227478, "learning_rate": 3.97731747997137e-05, "loss": 0.6923, "num_input_tokens_seen": 28752128, "step": 49535 }, { "epoch": 7.378611855823652, "grad_norm": 1.3533872365951538, "learning_rate": 3.9770553285278846e-05, "loss": 0.6303, "num_input_tokens_seen": 28755328, "step": 49540 }, { "epoch": 7.379356568364611, "grad_norm": 0.8182947039604187, "learning_rate": 3.9767931521310514e-05, "loss": 0.5766, "num_input_tokens_seen": 28758240, "step": 49545 }, { "epoch": 7.380101280905571, "grad_norm": 0.8559567332267761, "learning_rate": 3.976530950785299e-05, "loss": 0.6351, "num_input_tokens_seen": 28761088, "step": 49550 }, { "epoch": 7.380845993446529, "grad_norm": 0.9235385060310364, "learning_rate": 3.976268724495057e-05, "loss": 0.612, "num_input_tokens_seen": 28763904, "step": 49555 }, { "epoch": 7.381590705987489, "grad_norm": 1.2041776180267334, "learning_rate": 3.9760064732647545e-05, "loss": 0.5853, "num_input_tokens_seen": 28766656, "step": 49560 }, { "epoch": 7.382335418528448, "grad_norm": 1.2509857416152954, "learning_rate": 3.975744197098823e-05, "loss": 0.638, "num_input_tokens_seen": 28769280, "step": 49565 }, { "epoch": 7.383080131069407, "grad_norm": 0.9202259182929993, "learning_rate": 3.9754818960016934e-05, "loss": 0.4216, "num_input_tokens_seen": 28771872, "step": 49570 }, { "epoch": 7.383824843610366, "grad_norm": 1.6552393436431885, "learning_rate": 3.975219569977797e-05, "loss": 0.6937, "num_input_tokens_seen": 28774464, "step": 49575 }, { "epoch": 7.384569556151326, "grad_norm": 1.7843494415283203, "learning_rate": 3.974957219031565e-05, "loss": 0.7462, "num_input_tokens_seen": 28777280, "step": 49580 }, { "epoch": 7.385314268692285, "grad_norm": 0.8137588500976562, "learning_rate": 3.9746948431674304e-05, "loss": 0.7153, "num_input_tokens_seen": 28780352, "step": 49585 }, { "epoch": 7.386058981233244, "grad_norm": 2.3750038146972656, "learning_rate": 3.974432442389824e-05, "loss": 0.638, "num_input_tokens_seen": 28783328, "step": 49590 }, { "epoch": 7.386803693774203, "grad_norm": 0.7291736006736755, "learning_rate": 3.974170016703181e-05, "loss": 0.6037, "num_input_tokens_seen": 28786208, "step": 49595 }, { "epoch": 7.387548406315163, "grad_norm": 1.5675995349884033, "learning_rate": 3.973907566111934e-05, "loss": 0.7943, "num_input_tokens_seen": 28789088, "step": 49600 }, { "epoch": 7.388293118856121, "grad_norm": 2.4295244216918945, "learning_rate": 3.9736450906205156e-05, "loss": 0.5536, "num_input_tokens_seen": 28792064, "step": 49605 }, { "epoch": 7.389037831397081, "grad_norm": 0.9597671031951904, "learning_rate": 3.973382590233362e-05, "loss": 0.6568, "num_input_tokens_seen": 28794912, "step": 49610 }, { "epoch": 7.38978254393804, "grad_norm": 1.4025356769561768, "learning_rate": 3.973120064954907e-05, "loss": 0.6094, "num_input_tokens_seen": 28798016, "step": 49615 }, { "epoch": 7.390527256478999, "grad_norm": 1.1338403224945068, "learning_rate": 3.972857514789586e-05, "loss": 0.6508, "num_input_tokens_seen": 28800960, "step": 49620 }, { "epoch": 7.391271969019958, "grad_norm": 0.9393031001091003, "learning_rate": 3.972594939741834e-05, "loss": 0.5944, "num_input_tokens_seen": 28803872, "step": 49625 }, { "epoch": 7.392016681560918, "grad_norm": 1.6286635398864746, "learning_rate": 3.9723323398160863e-05, "loss": 0.7997, "num_input_tokens_seen": 28806976, "step": 49630 }, { "epoch": 7.392761394101877, "grad_norm": 1.4230228662490845, "learning_rate": 3.972069715016782e-05, "loss": 0.6364, "num_input_tokens_seen": 28809856, "step": 49635 }, { "epoch": 7.393506106642836, "grad_norm": 1.0801963806152344, "learning_rate": 3.971807065348354e-05, "loss": 0.4663, "num_input_tokens_seen": 28812672, "step": 49640 }, { "epoch": 7.394250819183795, "grad_norm": 0.8065083622932434, "learning_rate": 3.9715443908152426e-05, "loss": 0.5466, "num_input_tokens_seen": 28815584, "step": 49645 }, { "epoch": 7.394995531724755, "grad_norm": 0.9531108140945435, "learning_rate": 3.971281691421884e-05, "loss": 0.7834, "num_input_tokens_seen": 28818656, "step": 49650 }, { "epoch": 7.395740244265713, "grad_norm": 1.965517520904541, "learning_rate": 3.971018967172717e-05, "loss": 0.5863, "num_input_tokens_seen": 28821280, "step": 49655 }, { "epoch": 7.396484956806672, "grad_norm": 1.0204753875732422, "learning_rate": 3.970756218072179e-05, "loss": 0.6245, "num_input_tokens_seen": 28824192, "step": 49660 }, { "epoch": 7.397229669347632, "grad_norm": 1.374731421470642, "learning_rate": 3.97049344412471e-05, "loss": 0.5483, "num_input_tokens_seen": 28827104, "step": 49665 }, { "epoch": 7.3979743818885915, "grad_norm": 0.9657672643661499, "learning_rate": 3.970230645334748e-05, "loss": 0.5499, "num_input_tokens_seen": 28829920, "step": 49670 }, { "epoch": 7.39871909442955, "grad_norm": 1.2002955675125122, "learning_rate": 3.9699678217067346e-05, "loss": 0.6543, "num_input_tokens_seen": 28832704, "step": 49675 }, { "epoch": 7.399463806970509, "grad_norm": 0.8854995369911194, "learning_rate": 3.9697049732451084e-05, "loss": 0.6785, "num_input_tokens_seen": 28835616, "step": 49680 }, { "epoch": 7.400208519511469, "grad_norm": 0.9451801180839539, "learning_rate": 3.9694420999543105e-05, "loss": 0.5623, "num_input_tokens_seen": 28838560, "step": 49685 }, { "epoch": 7.400953232052427, "grad_norm": 1.0896974802017212, "learning_rate": 3.969179201838782e-05, "loss": 0.6332, "num_input_tokens_seen": 28841344, "step": 49690 }, { "epoch": 7.401697944593387, "grad_norm": 0.842011570930481, "learning_rate": 3.968916278902963e-05, "loss": 0.5802, "num_input_tokens_seen": 28844000, "step": 49695 }, { "epoch": 7.402442657134346, "grad_norm": 1.3747345209121704, "learning_rate": 3.968653331151297e-05, "loss": 0.7005, "num_input_tokens_seen": 28847008, "step": 49700 }, { "epoch": 7.403187369675305, "grad_norm": 1.165616512298584, "learning_rate": 3.9683903585882264e-05, "loss": 0.5427, "num_input_tokens_seen": 28849760, "step": 49705 }, { "epoch": 7.403932082216264, "grad_norm": 1.9265022277832031, "learning_rate": 3.9681273612181924e-05, "loss": 0.6879, "num_input_tokens_seen": 28852896, "step": 49710 }, { "epoch": 7.404676794757224, "grad_norm": 2.493516683578491, "learning_rate": 3.967864339045639e-05, "loss": 0.7136, "num_input_tokens_seen": 28856096, "step": 49715 }, { "epoch": 7.405421507298183, "grad_norm": 2.7655069828033447, "learning_rate": 3.967601292075009e-05, "loss": 0.7661, "num_input_tokens_seen": 28859136, "step": 49720 }, { "epoch": 7.406166219839142, "grad_norm": 0.6718544363975525, "learning_rate": 3.967338220310748e-05, "loss": 0.4566, "num_input_tokens_seen": 28861984, "step": 49725 }, { "epoch": 7.406910932380101, "grad_norm": 1.5714166164398193, "learning_rate": 3.967075123757298e-05, "loss": 0.7441, "num_input_tokens_seen": 28865088, "step": 49730 }, { "epoch": 7.407655644921061, "grad_norm": 1.474411964416504, "learning_rate": 3.9668120024191046e-05, "loss": 0.6875, "num_input_tokens_seen": 28868064, "step": 49735 }, { "epoch": 7.408400357462019, "grad_norm": 0.9428443908691406, "learning_rate": 3.966548856300614e-05, "loss": 0.8789, "num_input_tokens_seen": 28870976, "step": 49740 }, { "epoch": 7.409145070002979, "grad_norm": 1.1544193029403687, "learning_rate": 3.9662856854062706e-05, "loss": 0.7848, "num_input_tokens_seen": 28873568, "step": 49745 }, { "epoch": 7.409889782543938, "grad_norm": 1.0260392427444458, "learning_rate": 3.9660224897405206e-05, "loss": 0.5182, "num_input_tokens_seen": 28876576, "step": 49750 }, { "epoch": 7.4106344950848975, "grad_norm": 1.0684256553649902, "learning_rate": 3.965759269307812e-05, "loss": 0.6405, "num_input_tokens_seen": 28879360, "step": 49755 }, { "epoch": 7.411379207625856, "grad_norm": 1.578829288482666, "learning_rate": 3.965496024112589e-05, "loss": 0.3954, "num_input_tokens_seen": 28882336, "step": 49760 }, { "epoch": 7.412123920166816, "grad_norm": 0.8780330419540405, "learning_rate": 3.9652327541593e-05, "loss": 0.5645, "num_input_tokens_seen": 28884992, "step": 49765 }, { "epoch": 7.412868632707775, "grad_norm": 1.6314992904663086, "learning_rate": 3.964969459452393e-05, "loss": 0.8616, "num_input_tokens_seen": 28887840, "step": 49770 }, { "epoch": 7.413613345248734, "grad_norm": 1.2760834693908691, "learning_rate": 3.964706139996316e-05, "loss": 0.5749, "num_input_tokens_seen": 28890848, "step": 49775 }, { "epoch": 7.414358057789693, "grad_norm": 1.3714208602905273, "learning_rate": 3.9644427957955174e-05, "loss": 0.6675, "num_input_tokens_seen": 28893728, "step": 49780 }, { "epoch": 7.415102770330653, "grad_norm": 1.0530140399932861, "learning_rate": 3.9641794268544465e-05, "loss": 0.5462, "num_input_tokens_seen": 28896704, "step": 49785 }, { "epoch": 7.415847482871611, "grad_norm": 1.694346308708191, "learning_rate": 3.963916033177552e-05, "loss": 0.7143, "num_input_tokens_seen": 28899712, "step": 49790 }, { "epoch": 7.416592195412571, "grad_norm": 0.7358806729316711, "learning_rate": 3.963652614769284e-05, "loss": 0.5294, "num_input_tokens_seen": 28902592, "step": 49795 }, { "epoch": 7.41733690795353, "grad_norm": 1.5880550146102905, "learning_rate": 3.963389171634093e-05, "loss": 0.6059, "num_input_tokens_seen": 28905440, "step": 49800 }, { "epoch": 7.4180816204944895, "grad_norm": 0.8101917505264282, "learning_rate": 3.963125703776429e-05, "loss": 0.5201, "num_input_tokens_seen": 28908160, "step": 49805 }, { "epoch": 7.418826333035448, "grad_norm": 1.731857419013977, "learning_rate": 3.962862211200744e-05, "loss": 0.8338, "num_input_tokens_seen": 28910848, "step": 49810 }, { "epoch": 7.419571045576408, "grad_norm": 1.2152382135391235, "learning_rate": 3.962598693911488e-05, "loss": 0.7135, "num_input_tokens_seen": 28913664, "step": 49815 }, { "epoch": 7.420315758117367, "grad_norm": 1.1530433893203735, "learning_rate": 3.962335151913113e-05, "loss": 0.834, "num_input_tokens_seen": 28916512, "step": 49820 }, { "epoch": 7.421060470658326, "grad_norm": 1.9620790481567383, "learning_rate": 3.962071585210072e-05, "loss": 0.7444, "num_input_tokens_seen": 28919648, "step": 49825 }, { "epoch": 7.421805183199285, "grad_norm": 1.217483401298523, "learning_rate": 3.961807993806819e-05, "loss": 0.5419, "num_input_tokens_seen": 28922656, "step": 49830 }, { "epoch": 7.422549895740245, "grad_norm": 1.0941927433013916, "learning_rate": 3.9615443777078046e-05, "loss": 0.6648, "num_input_tokens_seen": 28925376, "step": 49835 }, { "epoch": 7.4232946082812035, "grad_norm": 2.0692191123962402, "learning_rate": 3.961280736917483e-05, "loss": 0.6816, "num_input_tokens_seen": 28928384, "step": 49840 }, { "epoch": 7.424039320822162, "grad_norm": 1.4041095972061157, "learning_rate": 3.961017071440309e-05, "loss": 0.5964, "num_input_tokens_seen": 28931328, "step": 49845 }, { "epoch": 7.424784033363122, "grad_norm": 0.9576525688171387, "learning_rate": 3.960753381280737e-05, "loss": 0.5678, "num_input_tokens_seen": 28934272, "step": 49850 }, { "epoch": 7.425528745904081, "grad_norm": 0.8000956773757935, "learning_rate": 3.96048966644322e-05, "loss": 0.5206, "num_input_tokens_seen": 28937056, "step": 49855 }, { "epoch": 7.42627345844504, "grad_norm": 1.4632108211517334, "learning_rate": 3.9602259269322155e-05, "loss": 0.7364, "num_input_tokens_seen": 28940032, "step": 49860 }, { "epoch": 7.427018170985999, "grad_norm": 0.971656858921051, "learning_rate": 3.9599621627521774e-05, "loss": 0.6023, "num_input_tokens_seen": 28942944, "step": 49865 }, { "epoch": 7.427762883526959, "grad_norm": 1.0563522577285767, "learning_rate": 3.959698373907563e-05, "loss": 0.6962, "num_input_tokens_seen": 28946080, "step": 49870 }, { "epoch": 7.428507596067917, "grad_norm": 1.1599969863891602, "learning_rate": 3.959434560402828e-05, "loss": 0.464, "num_input_tokens_seen": 28948896, "step": 49875 }, { "epoch": 7.429252308608877, "grad_norm": 1.0401246547698975, "learning_rate": 3.9591707222424294e-05, "loss": 0.5094, "num_input_tokens_seen": 28951488, "step": 49880 }, { "epoch": 7.429997021149836, "grad_norm": 1.801411509513855, "learning_rate": 3.958906859430825e-05, "loss": 0.5931, "num_input_tokens_seen": 28954432, "step": 49885 }, { "epoch": 7.4307417336907955, "grad_norm": 1.0264244079589844, "learning_rate": 3.958642971972471e-05, "loss": 0.5893, "num_input_tokens_seen": 28957408, "step": 49890 }, { "epoch": 7.431486446231754, "grad_norm": 0.9872893691062927, "learning_rate": 3.958379059871827e-05, "loss": 0.5305, "num_input_tokens_seen": 28960160, "step": 49895 }, { "epoch": 7.432231158772714, "grad_norm": 1.206788420677185, "learning_rate": 3.9581151231333506e-05, "loss": 0.5818, "num_input_tokens_seen": 28963040, "step": 49900 }, { "epoch": 7.432975871313673, "grad_norm": 1.7356843948364258, "learning_rate": 3.957851161761502e-05, "loss": 0.6407, "num_input_tokens_seen": 28965856, "step": 49905 }, { "epoch": 7.433720583854632, "grad_norm": 1.4276832342147827, "learning_rate": 3.9575871757607385e-05, "loss": 0.5309, "num_input_tokens_seen": 28968800, "step": 49910 }, { "epoch": 7.434465296395591, "grad_norm": 1.340822458267212, "learning_rate": 3.9573231651355225e-05, "loss": 0.5796, "num_input_tokens_seen": 28972000, "step": 49915 }, { "epoch": 7.435210008936551, "grad_norm": 0.9910072684288025, "learning_rate": 3.957059129890311e-05, "loss": 0.6307, "num_input_tokens_seen": 28974688, "step": 49920 }, { "epoch": 7.4359547214775095, "grad_norm": 0.9759251475334167, "learning_rate": 3.956795070029568e-05, "loss": 0.6546, "num_input_tokens_seen": 28977504, "step": 49925 }, { "epoch": 7.436699434018469, "grad_norm": 0.8308677673339844, "learning_rate": 3.956530985557753e-05, "loss": 0.4878, "num_input_tokens_seen": 28980672, "step": 49930 }, { "epoch": 7.437444146559428, "grad_norm": 2.0768964290618896, "learning_rate": 3.9562668764793264e-05, "loss": 0.6444, "num_input_tokens_seen": 28984160, "step": 49935 }, { "epoch": 7.4381888591003875, "grad_norm": 1.1009455919265747, "learning_rate": 3.9560027427987515e-05, "loss": 0.643, "num_input_tokens_seen": 28987200, "step": 49940 }, { "epoch": 7.438933571641346, "grad_norm": 1.0893139839172363, "learning_rate": 3.9557385845204895e-05, "loss": 0.5988, "num_input_tokens_seen": 28989984, "step": 49945 }, { "epoch": 7.439678284182306, "grad_norm": 1.1426295042037964, "learning_rate": 3.955474401649004e-05, "loss": 0.5947, "num_input_tokens_seen": 28992928, "step": 49950 }, { "epoch": 7.440422996723265, "grad_norm": 1.1811619997024536, "learning_rate": 3.955210194188758e-05, "loss": 0.7642, "num_input_tokens_seen": 28995808, "step": 49955 }, { "epoch": 7.441167709264224, "grad_norm": 1.1731499433517456, "learning_rate": 3.954945962144214e-05, "loss": 0.6152, "num_input_tokens_seen": 28998848, "step": 49960 }, { "epoch": 7.441912421805183, "grad_norm": 1.089730143547058, "learning_rate": 3.9546817055198385e-05, "loss": 0.5942, "num_input_tokens_seen": 29001536, "step": 49965 }, { "epoch": 7.442657134346143, "grad_norm": 1.3891764879226685, "learning_rate": 3.954417424320092e-05, "loss": 0.7349, "num_input_tokens_seen": 29004256, "step": 49970 }, { "epoch": 7.4434018468871015, "grad_norm": 0.7151590585708618, "learning_rate": 3.954153118549442e-05, "loss": 0.5945, "num_input_tokens_seen": 29007232, "step": 49975 }, { "epoch": 7.444146559428061, "grad_norm": 0.5168401598930359, "learning_rate": 3.953888788212353e-05, "loss": 0.6595, "num_input_tokens_seen": 29010272, "step": 49980 }, { "epoch": 7.44489127196902, "grad_norm": 0.9505693912506104, "learning_rate": 3.953624433313291e-05, "loss": 0.4878, "num_input_tokens_seen": 29013280, "step": 49985 }, { "epoch": 7.4456359845099795, "grad_norm": 1.0815556049346924, "learning_rate": 3.9533600538567214e-05, "loss": 0.6913, "num_input_tokens_seen": 29016160, "step": 49990 }, { "epoch": 7.446380697050938, "grad_norm": 1.4691922664642334, "learning_rate": 3.953095649847111e-05, "loss": 0.6347, "num_input_tokens_seen": 29019072, "step": 49995 }, { "epoch": 7.447125409591898, "grad_norm": 1.0964363813400269, "learning_rate": 3.952831221288926e-05, "loss": 0.6635, "num_input_tokens_seen": 29021920, "step": 50000 }, { "epoch": 7.447870122132857, "grad_norm": 0.9365902543067932, "learning_rate": 3.9525667681866344e-05, "loss": 0.5735, "num_input_tokens_seen": 29024832, "step": 50005 }, { "epoch": 7.4486148346738155, "grad_norm": 1.6841392517089844, "learning_rate": 3.952302290544704e-05, "loss": 0.5351, "num_input_tokens_seen": 29027712, "step": 50010 }, { "epoch": 7.449359547214775, "grad_norm": 1.472152829170227, "learning_rate": 3.952037788367602e-05, "loss": 0.7408, "num_input_tokens_seen": 29030624, "step": 50015 }, { "epoch": 7.450104259755734, "grad_norm": 1.5020475387573242, "learning_rate": 3.951773261659797e-05, "loss": 0.7533, "num_input_tokens_seen": 29033824, "step": 50020 }, { "epoch": 7.4508489722966935, "grad_norm": 0.7656655311584473, "learning_rate": 3.951508710425758e-05, "loss": 0.6113, "num_input_tokens_seen": 29036704, "step": 50025 }, { "epoch": 7.451593684837652, "grad_norm": 0.8091799020767212, "learning_rate": 3.9512441346699554e-05, "loss": 0.6302, "num_input_tokens_seen": 29039712, "step": 50030 }, { "epoch": 7.452338397378612, "grad_norm": 1.0891923904418945, "learning_rate": 3.950979534396858e-05, "loss": 0.6587, "num_input_tokens_seen": 29042688, "step": 50035 }, { "epoch": 7.453083109919571, "grad_norm": 0.7336135506629944, "learning_rate": 3.9507149096109366e-05, "loss": 0.3457, "num_input_tokens_seen": 29045344, "step": 50040 }, { "epoch": 7.45382782246053, "grad_norm": 1.0626561641693115, "learning_rate": 3.95045026031666e-05, "loss": 0.5115, "num_input_tokens_seen": 29048160, "step": 50045 }, { "epoch": 7.454572535001489, "grad_norm": 1.2853572368621826, "learning_rate": 3.950185586518501e-05, "loss": 0.5088, "num_input_tokens_seen": 29050752, "step": 50050 }, { "epoch": 7.455317247542449, "grad_norm": 0.8320876955986023, "learning_rate": 3.94992088822093e-05, "loss": 0.6794, "num_input_tokens_seen": 29053792, "step": 50055 }, { "epoch": 7.4560619600834075, "grad_norm": 0.7962669134140015, "learning_rate": 3.94965616542842e-05, "loss": 0.7934, "num_input_tokens_seen": 29056576, "step": 50060 }, { "epoch": 7.456806672624367, "grad_norm": 0.9522790908813477, "learning_rate": 3.949391418145442e-05, "loss": 0.6871, "num_input_tokens_seen": 29059328, "step": 50065 }, { "epoch": 7.457551385165326, "grad_norm": 0.9412288069725037, "learning_rate": 3.9491266463764694e-05, "loss": 0.5232, "num_input_tokens_seen": 29062240, "step": 50070 }, { "epoch": 7.4582960977062855, "grad_norm": 1.270904541015625, "learning_rate": 3.948861850125974e-05, "loss": 0.4915, "num_input_tokens_seen": 29064928, "step": 50075 }, { "epoch": 7.459040810247244, "grad_norm": 2.9675402641296387, "learning_rate": 3.948597029398432e-05, "loss": 0.8649, "num_input_tokens_seen": 29067840, "step": 50080 }, { "epoch": 7.459785522788204, "grad_norm": 1.5139402151107788, "learning_rate": 3.9483321841983146e-05, "loss": 0.5916, "num_input_tokens_seen": 29070528, "step": 50085 }, { "epoch": 7.460530235329163, "grad_norm": 1.0940486192703247, "learning_rate": 3.948067314530096e-05, "loss": 0.6578, "num_input_tokens_seen": 29073280, "step": 50090 }, { "epoch": 7.461274947870122, "grad_norm": 2.1680426597595215, "learning_rate": 3.947802420398253e-05, "loss": 0.8912, "num_input_tokens_seen": 29076128, "step": 50095 }, { "epoch": 7.462019660411081, "grad_norm": 0.922279953956604, "learning_rate": 3.947537501807259e-05, "loss": 0.659, "num_input_tokens_seen": 29079392, "step": 50100 }, { "epoch": 7.462764372952041, "grad_norm": 1.2722245454788208, "learning_rate": 3.947272558761591e-05, "loss": 0.6377, "num_input_tokens_seen": 29082272, "step": 50105 }, { "epoch": 7.4635090854929995, "grad_norm": 0.9714254140853882, "learning_rate": 3.947007591265723e-05, "loss": 0.6286, "num_input_tokens_seen": 29085056, "step": 50110 }, { "epoch": 7.464253798033959, "grad_norm": 1.3985484838485718, "learning_rate": 3.9467425993241326e-05, "loss": 0.7524, "num_input_tokens_seen": 29088032, "step": 50115 }, { "epoch": 7.464998510574918, "grad_norm": 1.02155601978302, "learning_rate": 3.946477582941297e-05, "loss": 0.497, "num_input_tokens_seen": 29090880, "step": 50120 }, { "epoch": 7.465743223115878, "grad_norm": 1.4981069564819336, "learning_rate": 3.946212542121692e-05, "loss": 0.5157, "num_input_tokens_seen": 29093824, "step": 50125 }, { "epoch": 7.466487935656836, "grad_norm": 1.535190463066101, "learning_rate": 3.945947476869797e-05, "loss": 0.6281, "num_input_tokens_seen": 29096736, "step": 50130 }, { "epoch": 7.467232648197796, "grad_norm": 0.9646791815757751, "learning_rate": 3.945682387190088e-05, "loss": 0.6085, "num_input_tokens_seen": 29099776, "step": 50135 }, { "epoch": 7.467977360738755, "grad_norm": 0.8266769051551819, "learning_rate": 3.9454172730870445e-05, "loss": 0.6438, "num_input_tokens_seen": 29103008, "step": 50140 }, { "epoch": 7.468722073279714, "grad_norm": 0.8822413682937622, "learning_rate": 3.9451521345651456e-05, "loss": 0.6076, "num_input_tokens_seen": 29106144, "step": 50145 }, { "epoch": 7.469466785820673, "grad_norm": 1.2187575101852417, "learning_rate": 3.94488697162887e-05, "loss": 0.5576, "num_input_tokens_seen": 29108800, "step": 50150 }, { "epoch": 7.470211498361633, "grad_norm": 1.0265334844589233, "learning_rate": 3.944621784282697e-05, "loss": 0.7269, "num_input_tokens_seen": 29111872, "step": 50155 }, { "epoch": 7.4709562109025915, "grad_norm": 1.600683331489563, "learning_rate": 3.944356572531108e-05, "loss": 0.6764, "num_input_tokens_seen": 29114688, "step": 50160 }, { "epoch": 7.471700923443551, "grad_norm": 1.2393211126327515, "learning_rate": 3.944091336378583e-05, "loss": 0.5929, "num_input_tokens_seen": 29117856, "step": 50165 }, { "epoch": 7.47244563598451, "grad_norm": 1.2255371809005737, "learning_rate": 3.943826075829602e-05, "loss": 0.5183, "num_input_tokens_seen": 29120640, "step": 50170 }, { "epoch": 7.473190348525469, "grad_norm": 1.0833665132522583, "learning_rate": 3.943560790888647e-05, "loss": 0.5301, "num_input_tokens_seen": 29123712, "step": 50175 }, { "epoch": 7.473935061066428, "grad_norm": 1.300445556640625, "learning_rate": 3.9432954815601995e-05, "loss": 0.5022, "num_input_tokens_seen": 29126656, "step": 50180 }, { "epoch": 7.474679773607388, "grad_norm": 1.5546598434448242, "learning_rate": 3.943030147848742e-05, "loss": 0.5494, "num_input_tokens_seen": 29129344, "step": 50185 }, { "epoch": 7.475424486148347, "grad_norm": 1.0452097654342651, "learning_rate": 3.9427647897587564e-05, "loss": 0.5963, "num_input_tokens_seen": 29131968, "step": 50190 }, { "epoch": 7.4761691986893055, "grad_norm": 1.294724464416504, "learning_rate": 3.9424994072947256e-05, "loss": 0.6597, "num_input_tokens_seen": 29134880, "step": 50195 }, { "epoch": 7.476913911230265, "grad_norm": 0.8551366925239563, "learning_rate": 3.942234000461135e-05, "loss": 0.4713, "num_input_tokens_seen": 29137664, "step": 50200 }, { "epoch": 7.477658623771224, "grad_norm": 1.828795313835144, "learning_rate": 3.941968569262465e-05, "loss": 0.6663, "num_input_tokens_seen": 29140416, "step": 50205 }, { "epoch": 7.478403336312184, "grad_norm": 1.4092509746551514, "learning_rate": 3.9417031137032025e-05, "loss": 0.7058, "num_input_tokens_seen": 29143648, "step": 50210 }, { "epoch": 7.479148048853142, "grad_norm": 0.9575617909431458, "learning_rate": 3.941437633787831e-05, "loss": 0.6562, "num_input_tokens_seen": 29146464, "step": 50215 }, { "epoch": 7.479892761394102, "grad_norm": 1.8036911487579346, "learning_rate": 3.941172129520836e-05, "loss": 0.4737, "num_input_tokens_seen": 29148896, "step": 50220 }, { "epoch": 7.480637473935061, "grad_norm": 0.769875705242157, "learning_rate": 3.940906600906702e-05, "loss": 0.5331, "num_input_tokens_seen": 29151840, "step": 50225 }, { "epoch": 7.48138218647602, "grad_norm": 1.8403446674346924, "learning_rate": 3.9406410479499155e-05, "loss": 0.5911, "num_input_tokens_seen": 29154720, "step": 50230 }, { "epoch": 7.482126899016979, "grad_norm": 0.8250600099563599, "learning_rate": 3.940375470654963e-05, "loss": 0.5974, "num_input_tokens_seen": 29157504, "step": 50235 }, { "epoch": 7.482871611557939, "grad_norm": 1.560735821723938, "learning_rate": 3.9401098690263316e-05, "loss": 0.5802, "num_input_tokens_seen": 29160480, "step": 50240 }, { "epoch": 7.4836163240988975, "grad_norm": 1.4854203462600708, "learning_rate": 3.939844243068507e-05, "loss": 0.7031, "num_input_tokens_seen": 29163552, "step": 50245 }, { "epoch": 7.484361036639857, "grad_norm": 1.0218288898468018, "learning_rate": 3.939578592785977e-05, "loss": 0.5596, "num_input_tokens_seen": 29166336, "step": 50250 }, { "epoch": 7.485105749180816, "grad_norm": 1.0093806982040405, "learning_rate": 3.93931291818323e-05, "loss": 0.61, "num_input_tokens_seen": 29169216, "step": 50255 }, { "epoch": 7.485850461721776, "grad_norm": 1.2366493940353394, "learning_rate": 3.939047219264754e-05, "loss": 0.6226, "num_input_tokens_seen": 29172064, "step": 50260 }, { "epoch": 7.486595174262734, "grad_norm": 0.7108727097511292, "learning_rate": 3.938781496035038e-05, "loss": 0.5836, "num_input_tokens_seen": 29174848, "step": 50265 }, { "epoch": 7.487339886803694, "grad_norm": 1.9623056650161743, "learning_rate": 3.93851574849857e-05, "loss": 0.5176, "num_input_tokens_seen": 29178176, "step": 50270 }, { "epoch": 7.488084599344653, "grad_norm": 0.979856550693512, "learning_rate": 3.9382499766598416e-05, "loss": 0.5716, "num_input_tokens_seen": 29181184, "step": 50275 }, { "epoch": 7.488829311885612, "grad_norm": 1.0225176811218262, "learning_rate": 3.937984180523342e-05, "loss": 0.5832, "num_input_tokens_seen": 29184256, "step": 50280 }, { "epoch": 7.489574024426571, "grad_norm": 1.4237391948699951, "learning_rate": 3.9377183600935595e-05, "loss": 0.5523, "num_input_tokens_seen": 29187296, "step": 50285 }, { "epoch": 7.490318736967531, "grad_norm": 1.4026838541030884, "learning_rate": 3.937452515374987e-05, "loss": 0.6949, "num_input_tokens_seen": 29190368, "step": 50290 }, { "epoch": 7.49106344950849, "grad_norm": 1.0927903652191162, "learning_rate": 3.9371866463721165e-05, "loss": 0.4959, "num_input_tokens_seen": 29193472, "step": 50295 }, { "epoch": 7.491808162049449, "grad_norm": 0.8240058422088623, "learning_rate": 3.9369207530894374e-05, "loss": 0.6831, "num_input_tokens_seen": 29196352, "step": 50300 }, { "epoch": 7.492552874590408, "grad_norm": 1.1768869161605835, "learning_rate": 3.9366548355314426e-05, "loss": 0.5955, "num_input_tokens_seen": 29199392, "step": 50305 }, { "epoch": 7.493297587131368, "grad_norm": 1.977604866027832, "learning_rate": 3.936388893702625e-05, "loss": 0.5165, "num_input_tokens_seen": 29202272, "step": 50310 }, { "epoch": 7.494042299672326, "grad_norm": 1.1021718978881836, "learning_rate": 3.936122927607476e-05, "loss": 0.5091, "num_input_tokens_seen": 29204960, "step": 50315 }, { "epoch": 7.494787012213286, "grad_norm": 1.205838918685913, "learning_rate": 3.935856937250491e-05, "loss": 0.5485, "num_input_tokens_seen": 29207808, "step": 50320 }, { "epoch": 7.495531724754245, "grad_norm": 1.2697904109954834, "learning_rate": 3.935590922636161e-05, "loss": 0.4856, "num_input_tokens_seen": 29210592, "step": 50325 }, { "epoch": 7.496276437295204, "grad_norm": 1.3228222131729126, "learning_rate": 3.935324883768983e-05, "loss": 0.6811, "num_input_tokens_seen": 29213536, "step": 50330 }, { "epoch": 7.497021149836163, "grad_norm": 1.7013657093048096, "learning_rate": 3.9350588206534486e-05, "loss": 0.8473, "num_input_tokens_seen": 29216704, "step": 50335 }, { "epoch": 7.497765862377123, "grad_norm": 1.5748755931854248, "learning_rate": 3.934792733294054e-05, "loss": 0.7504, "num_input_tokens_seen": 29219872, "step": 50340 }, { "epoch": 7.498510574918082, "grad_norm": 1.6328964233398438, "learning_rate": 3.9345266216952945e-05, "loss": 0.6412, "num_input_tokens_seen": 29222720, "step": 50345 }, { "epoch": 7.499255287459041, "grad_norm": 1.1658893823623657, "learning_rate": 3.934260485861667e-05, "loss": 0.7217, "num_input_tokens_seen": 29225632, "step": 50350 }, { "epoch": 7.5, "grad_norm": 1.191338062286377, "learning_rate": 3.933994325797665e-05, "loss": 0.533, "num_input_tokens_seen": 29228704, "step": 50355 }, { "epoch": 7.500744712540959, "grad_norm": 0.9816061854362488, "learning_rate": 3.9337281415077866e-05, "loss": 0.5939, "num_input_tokens_seen": 29231712, "step": 50360 }, { "epoch": 7.501489425081918, "grad_norm": 0.8480657339096069, "learning_rate": 3.933461932996528e-05, "loss": 0.4625, "num_input_tokens_seen": 29234560, "step": 50365 }, { "epoch": 7.502234137622878, "grad_norm": 0.8000054955482483, "learning_rate": 3.933195700268388e-05, "loss": 0.5174, "num_input_tokens_seen": 29237376, "step": 50370 }, { "epoch": 7.502978850163837, "grad_norm": 1.6450928449630737, "learning_rate": 3.932929443327862e-05, "loss": 0.6435, "num_input_tokens_seen": 29240352, "step": 50375 }, { "epoch": 7.503723562704796, "grad_norm": 1.6185038089752197, "learning_rate": 3.932663162179451e-05, "loss": 0.7817, "num_input_tokens_seen": 29243040, "step": 50380 }, { "epoch": 7.504468275245755, "grad_norm": 0.7939472794532776, "learning_rate": 3.93239685682765e-05, "loss": 0.5542, "num_input_tokens_seen": 29246080, "step": 50385 }, { "epoch": 7.505212987786714, "grad_norm": 1.9410876035690308, "learning_rate": 3.932130527276961e-05, "loss": 0.706, "num_input_tokens_seen": 29248992, "step": 50390 }, { "epoch": 7.505957700327674, "grad_norm": 0.9358459115028381, "learning_rate": 3.931864173531883e-05, "loss": 0.683, "num_input_tokens_seen": 29251680, "step": 50395 }, { "epoch": 7.506702412868632, "grad_norm": 1.937404990196228, "learning_rate": 3.931597795596914e-05, "loss": 0.6781, "num_input_tokens_seen": 29254528, "step": 50400 }, { "epoch": 7.507447125409592, "grad_norm": 0.9975799918174744, "learning_rate": 3.931331393476556e-05, "loss": 0.706, "num_input_tokens_seen": 29257408, "step": 50405 }, { "epoch": 7.508191837950551, "grad_norm": 0.7317049503326416, "learning_rate": 3.931064967175309e-05, "loss": 0.5877, "num_input_tokens_seen": 29260192, "step": 50410 }, { "epoch": 7.50893655049151, "grad_norm": 1.486113429069519, "learning_rate": 3.9307985166976726e-05, "loss": 0.6904, "num_input_tokens_seen": 29263168, "step": 50415 }, { "epoch": 7.509681263032469, "grad_norm": 0.7600506544113159, "learning_rate": 3.93053204204815e-05, "loss": 0.6281, "num_input_tokens_seen": 29266208, "step": 50420 }, { "epoch": 7.510425975573429, "grad_norm": 0.9562033414840698, "learning_rate": 3.930265543231243e-05, "loss": 0.5833, "num_input_tokens_seen": 29269376, "step": 50425 }, { "epoch": 7.511170688114388, "grad_norm": 1.675089955329895, "learning_rate": 3.9299990202514525e-05, "loss": 0.6679, "num_input_tokens_seen": 29271968, "step": 50430 }, { "epoch": 7.511915400655347, "grad_norm": 1.0063116550445557, "learning_rate": 3.9297324731132826e-05, "loss": 0.7145, "num_input_tokens_seen": 29275264, "step": 50435 }, { "epoch": 7.512660113196306, "grad_norm": 1.3449301719665527, "learning_rate": 3.9294659018212356e-05, "loss": 0.4623, "num_input_tokens_seen": 29278080, "step": 50440 }, { "epoch": 7.513404825737266, "grad_norm": 1.3828052282333374, "learning_rate": 3.929199306379815e-05, "loss": 0.7545, "num_input_tokens_seen": 29280704, "step": 50445 }, { "epoch": 7.514149538278224, "grad_norm": 1.156672477722168, "learning_rate": 3.928932686793524e-05, "loss": 0.5326, "num_input_tokens_seen": 29283424, "step": 50450 }, { "epoch": 7.514894250819184, "grad_norm": 1.7121226787567139, "learning_rate": 3.9286660430668686e-05, "loss": 0.5781, "num_input_tokens_seen": 29286432, "step": 50455 }, { "epoch": 7.515638963360143, "grad_norm": 2.0152359008789062, "learning_rate": 3.928399375204352e-05, "loss": 0.5216, "num_input_tokens_seen": 29289504, "step": 50460 }, { "epoch": 7.5163836759011025, "grad_norm": 0.9603279829025269, "learning_rate": 3.9281326832104795e-05, "loss": 0.5611, "num_input_tokens_seen": 29292480, "step": 50465 }, { "epoch": 7.517128388442061, "grad_norm": 1.4175347089767456, "learning_rate": 3.9278659670897564e-05, "loss": 0.6938, "num_input_tokens_seen": 29295488, "step": 50470 }, { "epoch": 7.517873100983021, "grad_norm": 2.7885191440582275, "learning_rate": 3.92759922684669e-05, "loss": 0.7649, "num_input_tokens_seen": 29298208, "step": 50475 }, { "epoch": 7.51861781352398, "grad_norm": 1.4677648544311523, "learning_rate": 3.927332462485785e-05, "loss": 0.5678, "num_input_tokens_seen": 29301024, "step": 50480 }, { "epoch": 7.519362526064939, "grad_norm": 1.7560617923736572, "learning_rate": 3.92706567401155e-05, "loss": 0.6159, "num_input_tokens_seen": 29303840, "step": 50485 }, { "epoch": 7.520107238605898, "grad_norm": 0.8141717314720154, "learning_rate": 3.9267988614284886e-05, "loss": 0.6046, "num_input_tokens_seen": 29306784, "step": 50490 }, { "epoch": 7.520851951146858, "grad_norm": 1.2976704835891724, "learning_rate": 3.926532024741113e-05, "loss": 0.6672, "num_input_tokens_seen": 29309536, "step": 50495 }, { "epoch": 7.521596663687816, "grad_norm": 0.8423643112182617, "learning_rate": 3.926265163953927e-05, "loss": 0.7165, "num_input_tokens_seen": 29312896, "step": 50500 }, { "epoch": 7.522341376228776, "grad_norm": 0.6210420727729797, "learning_rate": 3.925998279071441e-05, "loss": 0.6151, "num_input_tokens_seen": 29315840, "step": 50505 }, { "epoch": 7.523086088769735, "grad_norm": 1.8430795669555664, "learning_rate": 3.9257313700981634e-05, "loss": 0.7264, "num_input_tokens_seen": 29318880, "step": 50510 }, { "epoch": 7.5238308013106945, "grad_norm": 1.353185772895813, "learning_rate": 3.9254644370386036e-05, "loss": 0.5909, "num_input_tokens_seen": 29321568, "step": 50515 }, { "epoch": 7.524575513851653, "grad_norm": 1.1919153928756714, "learning_rate": 3.925197479897271e-05, "loss": 0.556, "num_input_tokens_seen": 29324544, "step": 50520 }, { "epoch": 7.525320226392612, "grad_norm": 1.6975903511047363, "learning_rate": 3.924930498678675e-05, "loss": 0.5826, "num_input_tokens_seen": 29327232, "step": 50525 }, { "epoch": 7.526064938933572, "grad_norm": 2.3229830265045166, "learning_rate": 3.924663493387326e-05, "loss": 0.7303, "num_input_tokens_seen": 29330144, "step": 50530 }, { "epoch": 7.526809651474531, "grad_norm": 1.6436498165130615, "learning_rate": 3.924396464027736e-05, "loss": 0.6294, "num_input_tokens_seen": 29333152, "step": 50535 }, { "epoch": 7.52755436401549, "grad_norm": 1.0377737283706665, "learning_rate": 3.924129410604416e-05, "loss": 0.7243, "num_input_tokens_seen": 29335872, "step": 50540 }, { "epoch": 7.528299076556449, "grad_norm": 0.8200943470001221, "learning_rate": 3.923862333121876e-05, "loss": 0.658, "num_input_tokens_seen": 29338720, "step": 50545 }, { "epoch": 7.5290437890974085, "grad_norm": 2.506394863128662, "learning_rate": 3.92359523158463e-05, "loss": 0.8437, "num_input_tokens_seen": 29341600, "step": 50550 }, { "epoch": 7.529788501638367, "grad_norm": 2.491610288619995, "learning_rate": 3.923328105997188e-05, "loss": 0.7001, "num_input_tokens_seen": 29344608, "step": 50555 }, { "epoch": 7.530533214179327, "grad_norm": 1.6294476985931396, "learning_rate": 3.923060956364066e-05, "loss": 0.7247, "num_input_tokens_seen": 29347680, "step": 50560 }, { "epoch": 7.531277926720286, "grad_norm": 1.3963178396224976, "learning_rate": 3.922793782689774e-05, "loss": 0.4179, "num_input_tokens_seen": 29350688, "step": 50565 }, { "epoch": 7.532022639261245, "grad_norm": 1.0409501791000366, "learning_rate": 3.922526584978829e-05, "loss": 0.6657, "num_input_tokens_seen": 29353632, "step": 50570 }, { "epoch": 7.532767351802204, "grad_norm": 1.0814546346664429, "learning_rate": 3.922259363235741e-05, "loss": 0.5616, "num_input_tokens_seen": 29356448, "step": 50575 }, { "epoch": 7.533512064343164, "grad_norm": 2.25596284866333, "learning_rate": 3.921992117465028e-05, "loss": 0.5903, "num_input_tokens_seen": 29359488, "step": 50580 }, { "epoch": 7.534256776884122, "grad_norm": 1.3621671199798584, "learning_rate": 3.921724847671202e-05, "loss": 0.7543, "num_input_tokens_seen": 29362176, "step": 50585 }, { "epoch": 7.535001489425082, "grad_norm": 2.2398922443389893, "learning_rate": 3.9214575538587804e-05, "loss": 0.6649, "num_input_tokens_seen": 29364704, "step": 50590 }, { "epoch": 7.535746201966041, "grad_norm": 1.045780062675476, "learning_rate": 3.921190236032278e-05, "loss": 0.4162, "num_input_tokens_seen": 29367328, "step": 50595 }, { "epoch": 7.5364909145070005, "grad_norm": 1.163966417312622, "learning_rate": 3.920922894196212e-05, "loss": 0.5171, "num_input_tokens_seen": 29370464, "step": 50600 }, { "epoch": 7.537235627047959, "grad_norm": 1.499220609664917, "learning_rate": 3.920655528355097e-05, "loss": 0.5706, "num_input_tokens_seen": 29373344, "step": 50605 }, { "epoch": 7.537980339588919, "grad_norm": 0.9048068523406982, "learning_rate": 3.920388138513451e-05, "loss": 0.6115, "num_input_tokens_seen": 29376256, "step": 50610 }, { "epoch": 7.538725052129878, "grad_norm": 2.957019567489624, "learning_rate": 3.920120724675791e-05, "loss": 0.7386, "num_input_tokens_seen": 29378944, "step": 50615 }, { "epoch": 7.539469764670837, "grad_norm": 1.1199177503585815, "learning_rate": 3.9198532868466345e-05, "loss": 0.6914, "num_input_tokens_seen": 29381888, "step": 50620 }, { "epoch": 7.540214477211796, "grad_norm": 1.1246284246444702, "learning_rate": 3.919585825030499e-05, "loss": 0.6035, "num_input_tokens_seen": 29384768, "step": 50625 }, { "epoch": 7.540959189752756, "grad_norm": 1.9236726760864258, "learning_rate": 3.9193183392319054e-05, "loss": 0.7055, "num_input_tokens_seen": 29387648, "step": 50630 }, { "epoch": 7.5417039022937145, "grad_norm": 1.0347418785095215, "learning_rate": 3.9190508294553694e-05, "loss": 0.5572, "num_input_tokens_seen": 29390816, "step": 50635 }, { "epoch": 7.542448614834674, "grad_norm": 1.385054588317871, "learning_rate": 3.918783295705414e-05, "loss": 0.524, "num_input_tokens_seen": 29393632, "step": 50640 }, { "epoch": 7.543193327375633, "grad_norm": 1.1240969896316528, "learning_rate": 3.9185157379865553e-05, "loss": 0.6104, "num_input_tokens_seen": 29396576, "step": 50645 }, { "epoch": 7.5439380399165925, "grad_norm": 1.1673904657363892, "learning_rate": 3.9182481563033155e-05, "loss": 0.5135, "num_input_tokens_seen": 29399584, "step": 50650 }, { "epoch": 7.544682752457551, "grad_norm": 0.8380643725395203, "learning_rate": 3.917980550660214e-05, "loss": 0.4585, "num_input_tokens_seen": 29402336, "step": 50655 }, { "epoch": 7.545427464998511, "grad_norm": 2.954551935195923, "learning_rate": 3.9177129210617725e-05, "loss": 0.8788, "num_input_tokens_seen": 29405216, "step": 50660 }, { "epoch": 7.54617217753947, "grad_norm": 2.3073253631591797, "learning_rate": 3.9174452675125115e-05, "loss": 0.7073, "num_input_tokens_seen": 29407968, "step": 50665 }, { "epoch": 7.546916890080429, "grad_norm": 1.784289002418518, "learning_rate": 3.917177590016954e-05, "loss": 0.5615, "num_input_tokens_seen": 29410976, "step": 50670 }, { "epoch": 7.547661602621388, "grad_norm": 2.8086299896240234, "learning_rate": 3.9169098885796216e-05, "loss": 0.5297, "num_input_tokens_seen": 29413568, "step": 50675 }, { "epoch": 7.548406315162348, "grad_norm": 2.2366127967834473, "learning_rate": 3.916642163205036e-05, "loss": 0.6843, "num_input_tokens_seen": 29416224, "step": 50680 }, { "epoch": 7.5491510277033065, "grad_norm": 1.4906513690948486, "learning_rate": 3.916374413897722e-05, "loss": 0.7608, "num_input_tokens_seen": 29419328, "step": 50685 }, { "epoch": 7.549895740244265, "grad_norm": 1.4549106359481812, "learning_rate": 3.916106640662201e-05, "loss": 0.6271, "num_input_tokens_seen": 29422240, "step": 50690 }, { "epoch": 7.550640452785225, "grad_norm": 1.1257569789886475, "learning_rate": 3.915838843502998e-05, "loss": 0.6102, "num_input_tokens_seen": 29425056, "step": 50695 }, { "epoch": 7.5513851653261845, "grad_norm": 1.624337911605835, "learning_rate": 3.9155710224246365e-05, "loss": 0.5587, "num_input_tokens_seen": 29427680, "step": 50700 }, { "epoch": 7.552129877867143, "grad_norm": 1.1291375160217285, "learning_rate": 3.915303177431641e-05, "loss": 0.6542, "num_input_tokens_seen": 29430304, "step": 50705 }, { "epoch": 7.552874590408102, "grad_norm": 0.9506531357765198, "learning_rate": 3.915035308528537e-05, "loss": 0.6117, "num_input_tokens_seen": 29433056, "step": 50710 }, { "epoch": 7.553619302949062, "grad_norm": 2.335803270339966, "learning_rate": 3.91476741571985e-05, "loss": 0.7137, "num_input_tokens_seen": 29435776, "step": 50715 }, { "epoch": 7.554364015490021, "grad_norm": 1.3093700408935547, "learning_rate": 3.914499499010105e-05, "loss": 0.6458, "num_input_tokens_seen": 29438432, "step": 50720 }, { "epoch": 7.55510872803098, "grad_norm": 1.0930038690567017, "learning_rate": 3.9142315584038284e-05, "loss": 0.7753, "num_input_tokens_seen": 29441568, "step": 50725 }, { "epoch": 7.555853440571939, "grad_norm": 1.0682148933410645, "learning_rate": 3.913963593905548e-05, "loss": 0.6557, "num_input_tokens_seen": 29444320, "step": 50730 }, { "epoch": 7.5565981531128985, "grad_norm": 1.5313286781311035, "learning_rate": 3.913695605519788e-05, "loss": 0.6187, "num_input_tokens_seen": 29447104, "step": 50735 }, { "epoch": 7.557342865653857, "grad_norm": 1.3080421686172485, "learning_rate": 3.913427593251079e-05, "loss": 0.6131, "num_input_tokens_seen": 29450176, "step": 50740 }, { "epoch": 7.558087578194817, "grad_norm": 0.9814611077308655, "learning_rate": 3.913159557103947e-05, "loss": 0.7596, "num_input_tokens_seen": 29452992, "step": 50745 }, { "epoch": 7.558832290735776, "grad_norm": 0.7661518454551697, "learning_rate": 3.912891497082921e-05, "loss": 0.6032, "num_input_tokens_seen": 29455680, "step": 50750 }, { "epoch": 7.559577003276735, "grad_norm": 1.3518668413162231, "learning_rate": 3.9126234131925285e-05, "loss": 0.5984, "num_input_tokens_seen": 29458400, "step": 50755 }, { "epoch": 7.560321715817694, "grad_norm": 2.8582074642181396, "learning_rate": 3.9123553054372994e-05, "loss": 0.6252, "num_input_tokens_seen": 29461600, "step": 50760 }, { "epoch": 7.561066428358654, "grad_norm": 1.0491728782653809, "learning_rate": 3.912087173821762e-05, "loss": 0.5563, "num_input_tokens_seen": 29464448, "step": 50765 }, { "epoch": 7.5618111408996125, "grad_norm": 1.4016492366790771, "learning_rate": 3.911819018350449e-05, "loss": 0.7453, "num_input_tokens_seen": 29467424, "step": 50770 }, { "epoch": 7.562555853440572, "grad_norm": 2.259864091873169, "learning_rate": 3.9115508390278864e-05, "loss": 0.6955, "num_input_tokens_seen": 29469952, "step": 50775 }, { "epoch": 7.563300565981531, "grad_norm": 1.6466976404190063, "learning_rate": 3.9112826358586086e-05, "loss": 0.7237, "num_input_tokens_seen": 29473056, "step": 50780 }, { "epoch": 7.5640452785224905, "grad_norm": 1.3822307586669922, "learning_rate": 3.9110144088471437e-05, "loss": 0.7295, "num_input_tokens_seen": 29475840, "step": 50785 }, { "epoch": 7.564789991063449, "grad_norm": 1.1074923276901245, "learning_rate": 3.9107461579980255e-05, "loss": 0.7731, "num_input_tokens_seen": 29478784, "step": 50790 }, { "epoch": 7.565534703604409, "grad_norm": 2.114750385284424, "learning_rate": 3.910477883315785e-05, "loss": 0.6083, "num_input_tokens_seen": 29481792, "step": 50795 }, { "epoch": 7.566279416145368, "grad_norm": 2.0707333087921143, "learning_rate": 3.910209584804953e-05, "loss": 0.6404, "num_input_tokens_seen": 29484608, "step": 50800 }, { "epoch": 7.567024128686327, "grad_norm": 1.5843528509140015, "learning_rate": 3.909941262470064e-05, "loss": 0.4857, "num_input_tokens_seen": 29487456, "step": 50805 }, { "epoch": 7.567768841227286, "grad_norm": 1.2160111665725708, "learning_rate": 3.909672916315651e-05, "loss": 0.748, "num_input_tokens_seen": 29490176, "step": 50810 }, { "epoch": 7.568513553768246, "grad_norm": 1.1077944040298462, "learning_rate": 3.909404546346246e-05, "loss": 0.5646, "num_input_tokens_seen": 29493536, "step": 50815 }, { "epoch": 7.5692582663092045, "grad_norm": 1.8695268630981445, "learning_rate": 3.909136152566384e-05, "loss": 0.5663, "num_input_tokens_seen": 29496480, "step": 50820 }, { "epoch": 7.570002978850164, "grad_norm": 0.8962607383728027, "learning_rate": 3.908867734980599e-05, "loss": 0.7196, "num_input_tokens_seen": 29499136, "step": 50825 }, { "epoch": 7.570747691391123, "grad_norm": 0.6314185261726379, "learning_rate": 3.908599293593425e-05, "loss": 0.7427, "num_input_tokens_seen": 29502208, "step": 50830 }, { "epoch": 7.571492403932083, "grad_norm": 0.7513493299484253, "learning_rate": 3.908330828409397e-05, "loss": 0.5304, "num_input_tokens_seen": 29505216, "step": 50835 }, { "epoch": 7.572237116473041, "grad_norm": 1.6177680492401123, "learning_rate": 3.908062339433052e-05, "loss": 0.8497, "num_input_tokens_seen": 29508288, "step": 50840 }, { "epoch": 7.572981829014001, "grad_norm": 1.4523532390594482, "learning_rate": 3.907793826668925e-05, "loss": 0.6051, "num_input_tokens_seen": 29511200, "step": 50845 }, { "epoch": 7.57372654155496, "grad_norm": 1.4263447523117065, "learning_rate": 3.907525290121552e-05, "loss": 0.6113, "num_input_tokens_seen": 29514112, "step": 50850 }, { "epoch": 7.5744712540959185, "grad_norm": 1.1897422075271606, "learning_rate": 3.9072567297954694e-05, "loss": 0.527, "num_input_tokens_seen": 29517056, "step": 50855 }, { "epoch": 7.575215966636878, "grad_norm": 1.2856686115264893, "learning_rate": 3.906988145695215e-05, "loss": 0.6783, "num_input_tokens_seen": 29519968, "step": 50860 }, { "epoch": 7.575960679177838, "grad_norm": 2.202939510345459, "learning_rate": 3.906719537825325e-05, "loss": 0.7006, "num_input_tokens_seen": 29522784, "step": 50865 }, { "epoch": 7.5767053917187965, "grad_norm": 0.7340768575668335, "learning_rate": 3.906450906190339e-05, "loss": 0.4935, "num_input_tokens_seen": 29525440, "step": 50870 }, { "epoch": 7.577450104259755, "grad_norm": 1.2370672225952148, "learning_rate": 3.9061822507947945e-05, "loss": 0.4756, "num_input_tokens_seen": 29528352, "step": 50875 }, { "epoch": 7.578194816800715, "grad_norm": 2.2291259765625, "learning_rate": 3.9059135716432294e-05, "loss": 0.6319, "num_input_tokens_seen": 29531392, "step": 50880 }, { "epoch": 7.578939529341675, "grad_norm": 0.858473539352417, "learning_rate": 3.905644868740184e-05, "loss": 0.6365, "num_input_tokens_seen": 29534240, "step": 50885 }, { "epoch": 7.579684241882633, "grad_norm": 0.9794531464576721, "learning_rate": 3.905376142090197e-05, "loss": 0.6845, "num_input_tokens_seen": 29537472, "step": 50890 }, { "epoch": 7.580428954423592, "grad_norm": 1.5645862817764282, "learning_rate": 3.9051073916978084e-05, "loss": 0.5965, "num_input_tokens_seen": 29540224, "step": 50895 }, { "epoch": 7.581173666964552, "grad_norm": 1.2069783210754395, "learning_rate": 3.904838617567558e-05, "loss": 0.7351, "num_input_tokens_seen": 29543200, "step": 50900 }, { "epoch": 7.5819183795055105, "grad_norm": 1.4714422225952148, "learning_rate": 3.904569819703988e-05, "loss": 0.4899, "num_input_tokens_seen": 29545664, "step": 50905 }, { "epoch": 7.58266309204647, "grad_norm": 1.8216798305511475, "learning_rate": 3.9043009981116376e-05, "loss": 0.5568, "num_input_tokens_seen": 29548576, "step": 50910 }, { "epoch": 7.583407804587429, "grad_norm": 1.542565107345581, "learning_rate": 3.9040321527950497e-05, "loss": 0.6245, "num_input_tokens_seen": 29551648, "step": 50915 }, { "epoch": 7.584152517128389, "grad_norm": 0.7428631782531738, "learning_rate": 3.903763283758765e-05, "loss": 0.6528, "num_input_tokens_seen": 29554656, "step": 50920 }, { "epoch": 7.584897229669347, "grad_norm": 1.4336352348327637, "learning_rate": 3.903494391007327e-05, "loss": 0.783, "num_input_tokens_seen": 29557440, "step": 50925 }, { "epoch": 7.585641942210307, "grad_norm": 1.2144224643707275, "learning_rate": 3.9032254745452775e-05, "loss": 0.602, "num_input_tokens_seen": 29560544, "step": 50930 }, { "epoch": 7.586386654751266, "grad_norm": 1.7505109310150146, "learning_rate": 3.902956534377159e-05, "loss": 0.6469, "num_input_tokens_seen": 29563328, "step": 50935 }, { "epoch": 7.587131367292225, "grad_norm": 2.4331212043762207, "learning_rate": 3.902687570507517e-05, "loss": 0.6017, "num_input_tokens_seen": 29566464, "step": 50940 }, { "epoch": 7.587876079833184, "grad_norm": 1.6548399925231934, "learning_rate": 3.902418582940893e-05, "loss": 0.567, "num_input_tokens_seen": 29569280, "step": 50945 }, { "epoch": 7.588620792374144, "grad_norm": 1.4569205045700073, "learning_rate": 3.902149571681833e-05, "loss": 0.6851, "num_input_tokens_seen": 29572256, "step": 50950 }, { "epoch": 7.5893655049151025, "grad_norm": 1.6342887878417969, "learning_rate": 3.901880536734881e-05, "loss": 0.6967, "num_input_tokens_seen": 29575200, "step": 50955 }, { "epoch": 7.590110217456062, "grad_norm": 1.228027105331421, "learning_rate": 3.901611478104582e-05, "loss": 0.551, "num_input_tokens_seen": 29578176, "step": 50960 }, { "epoch": 7.590854929997021, "grad_norm": 1.3476322889328003, "learning_rate": 3.901342395795482e-05, "loss": 0.6337, "num_input_tokens_seen": 29580960, "step": 50965 }, { "epoch": 7.591599642537981, "grad_norm": 0.7893253564834595, "learning_rate": 3.901073289812126e-05, "loss": 0.6203, "num_input_tokens_seen": 29583776, "step": 50970 }, { "epoch": 7.592344355078939, "grad_norm": 1.2087855339050293, "learning_rate": 3.900804160159061e-05, "loss": 0.5604, "num_input_tokens_seen": 29586720, "step": 50975 }, { "epoch": 7.593089067619899, "grad_norm": 0.5894708633422852, "learning_rate": 3.900535006840833e-05, "loss": 0.7364, "num_input_tokens_seen": 29589600, "step": 50980 }, { "epoch": 7.593833780160858, "grad_norm": 1.1764225959777832, "learning_rate": 3.90026582986199e-05, "loss": 0.6672, "num_input_tokens_seen": 29592288, "step": 50985 }, { "epoch": 7.594578492701817, "grad_norm": 1.297550082206726, "learning_rate": 3.899996629227079e-05, "loss": 0.4599, "num_input_tokens_seen": 29595456, "step": 50990 }, { "epoch": 7.595323205242776, "grad_norm": 1.1097478866577148, "learning_rate": 3.899727404940647e-05, "loss": 0.6925, "num_input_tokens_seen": 29598624, "step": 50995 }, { "epoch": 7.596067917783736, "grad_norm": 1.335701823234558, "learning_rate": 3.899458157007244e-05, "loss": 0.4711, "num_input_tokens_seen": 29601472, "step": 51000 }, { "epoch": 7.596812630324695, "grad_norm": 1.2904430627822876, "learning_rate": 3.899188885431419e-05, "loss": 0.6891, "num_input_tokens_seen": 29604416, "step": 51005 }, { "epoch": 7.597557342865654, "grad_norm": 0.6257110238075256, "learning_rate": 3.898919590217718e-05, "loss": 0.7641, "num_input_tokens_seen": 29607264, "step": 51010 }, { "epoch": 7.598302055406613, "grad_norm": 1.0010114908218384, "learning_rate": 3.898650271370692e-05, "loss": 0.5512, "num_input_tokens_seen": 29609952, "step": 51015 }, { "epoch": 7.599046767947573, "grad_norm": 1.302230954170227, "learning_rate": 3.898380928894892e-05, "loss": 0.6087, "num_input_tokens_seen": 29613152, "step": 51020 }, { "epoch": 7.599791480488531, "grad_norm": 0.8790989518165588, "learning_rate": 3.8981115627948675e-05, "loss": 0.7378, "num_input_tokens_seen": 29616160, "step": 51025 }, { "epoch": 7.600536193029491, "grad_norm": 2.4015743732452393, "learning_rate": 3.897842173075169e-05, "loss": 0.6343, "num_input_tokens_seen": 29618976, "step": 51030 }, { "epoch": 7.60128090557045, "grad_norm": 1.9378637075424194, "learning_rate": 3.8975727597403475e-05, "loss": 0.63, "num_input_tokens_seen": 29621952, "step": 51035 }, { "epoch": 7.6020256181114085, "grad_norm": 1.3197085857391357, "learning_rate": 3.8973033227949554e-05, "loss": 0.5046, "num_input_tokens_seen": 29624736, "step": 51040 }, { "epoch": 7.602770330652368, "grad_norm": 2.292799234390259, "learning_rate": 3.897033862243543e-05, "loss": 0.5737, "num_input_tokens_seen": 29627840, "step": 51045 }, { "epoch": 7.603515043193328, "grad_norm": 1.0124170780181885, "learning_rate": 3.896764378090664e-05, "loss": 0.5565, "num_input_tokens_seen": 29630752, "step": 51050 }, { "epoch": 7.604259755734287, "grad_norm": 0.8031513690948486, "learning_rate": 3.89649487034087e-05, "loss": 0.5585, "num_input_tokens_seen": 29633600, "step": 51055 }, { "epoch": 7.605004468275245, "grad_norm": 1.1974239349365234, "learning_rate": 3.8962253389987145e-05, "loss": 0.6902, "num_input_tokens_seen": 29636640, "step": 51060 }, { "epoch": 7.605749180816205, "grad_norm": 1.3071619272232056, "learning_rate": 3.895955784068751e-05, "loss": 0.6549, "num_input_tokens_seen": 29639904, "step": 51065 }, { "epoch": 7.606493893357164, "grad_norm": 1.2097105979919434, "learning_rate": 3.8956862055555335e-05, "loss": 0.619, "num_input_tokens_seen": 29642592, "step": 51070 }, { "epoch": 7.607238605898123, "grad_norm": 1.7828787565231323, "learning_rate": 3.895416603463616e-05, "loss": 0.7364, "num_input_tokens_seen": 29645472, "step": 51075 }, { "epoch": 7.607983318439082, "grad_norm": 1.0008609294891357, "learning_rate": 3.895146977797553e-05, "loss": 0.4619, "num_input_tokens_seen": 29648800, "step": 51080 }, { "epoch": 7.608728030980042, "grad_norm": 0.7436990141868591, "learning_rate": 3.8948773285619e-05, "loss": 0.5773, "num_input_tokens_seen": 29651712, "step": 51085 }, { "epoch": 7.609472743521001, "grad_norm": 0.6587536931037903, "learning_rate": 3.894607655761212e-05, "loss": 0.574, "num_input_tokens_seen": 29654624, "step": 51090 }, { "epoch": 7.61021745606196, "grad_norm": 1.6495658159255981, "learning_rate": 3.894337959400045e-05, "loss": 0.6987, "num_input_tokens_seen": 29657312, "step": 51095 }, { "epoch": 7.610962168602919, "grad_norm": 0.7396696209907532, "learning_rate": 3.894068239482956e-05, "loss": 0.6481, "num_input_tokens_seen": 29660064, "step": 51100 }, { "epoch": 7.611706881143879, "grad_norm": 2.1908955574035645, "learning_rate": 3.8937984960145004e-05, "loss": 0.5144, "num_input_tokens_seen": 29663072, "step": 51105 }, { "epoch": 7.612451593684837, "grad_norm": 1.0780998468399048, "learning_rate": 3.893528728999236e-05, "loss": 0.539, "num_input_tokens_seen": 29666240, "step": 51110 }, { "epoch": 7.613196306225797, "grad_norm": 2.043503522872925, "learning_rate": 3.893258938441719e-05, "loss": 0.6336, "num_input_tokens_seen": 29669152, "step": 51115 }, { "epoch": 7.613941018766756, "grad_norm": 1.3830506801605225, "learning_rate": 3.89298912434651e-05, "loss": 0.6361, "num_input_tokens_seen": 29671776, "step": 51120 }, { "epoch": 7.614685731307715, "grad_norm": 1.4753202199935913, "learning_rate": 3.892719286718165e-05, "loss": 0.8193, "num_input_tokens_seen": 29674496, "step": 51125 }, { "epoch": 7.615430443848674, "grad_norm": 1.2874212265014648, "learning_rate": 3.892449425561243e-05, "loss": 0.7689, "num_input_tokens_seen": 29676896, "step": 51130 }, { "epoch": 7.616175156389634, "grad_norm": 1.0689222812652588, "learning_rate": 3.892179540880303e-05, "loss": 0.6276, "num_input_tokens_seen": 29679712, "step": 51135 }, { "epoch": 7.616919868930593, "grad_norm": 1.120426893234253, "learning_rate": 3.891909632679904e-05, "loss": 0.7075, "num_input_tokens_seen": 29682496, "step": 51140 }, { "epoch": 7.617664581471552, "grad_norm": 0.9151118993759155, "learning_rate": 3.8916397009646076e-05, "loss": 0.6261, "num_input_tokens_seen": 29685472, "step": 51145 }, { "epoch": 7.618409294012511, "grad_norm": 1.0647894144058228, "learning_rate": 3.891369745738972e-05, "loss": 0.5807, "num_input_tokens_seen": 29688160, "step": 51150 }, { "epoch": 7.619154006553471, "grad_norm": 0.8552068471908569, "learning_rate": 3.8910997670075593e-05, "loss": 0.5862, "num_input_tokens_seen": 29690976, "step": 51155 }, { "epoch": 7.619898719094429, "grad_norm": 3.474078893661499, "learning_rate": 3.890829764774929e-05, "loss": 0.7889, "num_input_tokens_seen": 29693888, "step": 51160 }, { "epoch": 7.620643431635389, "grad_norm": 1.0239709615707397, "learning_rate": 3.8905597390456446e-05, "loss": 0.4751, "num_input_tokens_seen": 29696320, "step": 51165 }, { "epoch": 7.621388144176348, "grad_norm": 0.8863353729248047, "learning_rate": 3.890289689824266e-05, "loss": 0.616, "num_input_tokens_seen": 29699232, "step": 51170 }, { "epoch": 7.6221328567173074, "grad_norm": 1.7228543758392334, "learning_rate": 3.890019617115357e-05, "loss": 0.7053, "num_input_tokens_seen": 29702176, "step": 51175 }, { "epoch": 7.622877569258266, "grad_norm": 1.0954989194869995, "learning_rate": 3.889749520923478e-05, "loss": 0.6989, "num_input_tokens_seen": 29704832, "step": 51180 }, { "epoch": 7.623622281799226, "grad_norm": 1.706511378288269, "learning_rate": 3.889479401253194e-05, "loss": 0.6484, "num_input_tokens_seen": 29707808, "step": 51185 }, { "epoch": 7.624366994340185, "grad_norm": 1.3554487228393555, "learning_rate": 3.8892092581090675e-05, "loss": 0.6497, "num_input_tokens_seen": 29710688, "step": 51190 }, { "epoch": 7.625111706881144, "grad_norm": 0.7727800011634827, "learning_rate": 3.888939091495663e-05, "loss": 0.7421, "num_input_tokens_seen": 29713472, "step": 51195 }, { "epoch": 7.625856419422103, "grad_norm": 1.0120570659637451, "learning_rate": 3.888668901417544e-05, "loss": 0.7314, "num_input_tokens_seen": 29716480, "step": 51200 }, { "epoch": 7.626601131963062, "grad_norm": 1.6563557386398315, "learning_rate": 3.888398687879274e-05, "loss": 0.7536, "num_input_tokens_seen": 29719232, "step": 51205 }, { "epoch": 7.627345844504021, "grad_norm": 1.5545953512191772, "learning_rate": 3.888128450885421e-05, "loss": 0.579, "num_input_tokens_seen": 29722016, "step": 51210 }, { "epoch": 7.628090557044981, "grad_norm": 1.031182885169983, "learning_rate": 3.887858190440549e-05, "loss": 0.6432, "num_input_tokens_seen": 29724832, "step": 51215 }, { "epoch": 7.62883526958594, "grad_norm": 1.4208651781082153, "learning_rate": 3.8875879065492216e-05, "loss": 0.6142, "num_input_tokens_seen": 29727680, "step": 51220 }, { "epoch": 7.629579982126899, "grad_norm": 0.9951719045639038, "learning_rate": 3.887317599216008e-05, "loss": 0.7463, "num_input_tokens_seen": 29730624, "step": 51225 }, { "epoch": 7.630324694667858, "grad_norm": 1.4068621397018433, "learning_rate": 3.887047268445473e-05, "loss": 0.7239, "num_input_tokens_seen": 29733536, "step": 51230 }, { "epoch": 7.631069407208818, "grad_norm": 1.1390031576156616, "learning_rate": 3.8867769142421844e-05, "loss": 0.7675, "num_input_tokens_seen": 29736608, "step": 51235 }, { "epoch": 7.631814119749777, "grad_norm": 1.690317988395691, "learning_rate": 3.886506536610709e-05, "loss": 0.7797, "num_input_tokens_seen": 29739648, "step": 51240 }, { "epoch": 7.632558832290735, "grad_norm": 1.1231918334960938, "learning_rate": 3.8862361355556156e-05, "loss": 0.7379, "num_input_tokens_seen": 29742528, "step": 51245 }, { "epoch": 7.633303544831695, "grad_norm": 0.9586109519004822, "learning_rate": 3.8859657110814704e-05, "loss": 0.6657, "num_input_tokens_seen": 29745504, "step": 51250 }, { "epoch": 7.634048257372654, "grad_norm": 0.6896604895591736, "learning_rate": 3.885695263192844e-05, "loss": 0.6266, "num_input_tokens_seen": 29748384, "step": 51255 }, { "epoch": 7.6347929699136134, "grad_norm": 2.1841752529144287, "learning_rate": 3.885424791894305e-05, "loss": 0.6915, "num_input_tokens_seen": 29751328, "step": 51260 }, { "epoch": 7.635537682454572, "grad_norm": 1.471580147743225, "learning_rate": 3.885154297190421e-05, "loss": 0.6963, "num_input_tokens_seen": 29754176, "step": 51265 }, { "epoch": 7.636282394995532, "grad_norm": 0.8185604214668274, "learning_rate": 3.884883779085764e-05, "loss": 0.5258, "num_input_tokens_seen": 29757024, "step": 51270 }, { "epoch": 7.637027107536491, "grad_norm": 1.0399234294891357, "learning_rate": 3.884613237584902e-05, "loss": 0.5578, "num_input_tokens_seen": 29760128, "step": 51275 }, { "epoch": 7.63777182007745, "grad_norm": 1.9664102792739868, "learning_rate": 3.884342672692407e-05, "loss": 0.5489, "num_input_tokens_seen": 29762752, "step": 51280 }, { "epoch": 7.638516532618409, "grad_norm": 1.121927261352539, "learning_rate": 3.88407208441285e-05, "loss": 0.5385, "num_input_tokens_seen": 29765984, "step": 51285 }, { "epoch": 7.639261245159369, "grad_norm": 1.8559006452560425, "learning_rate": 3.883801472750802e-05, "loss": 0.7818, "num_input_tokens_seen": 29768768, "step": 51290 }, { "epoch": 7.640005957700327, "grad_norm": 1.0494788885116577, "learning_rate": 3.8835308377108344e-05, "loss": 0.6352, "num_input_tokens_seen": 29771520, "step": 51295 }, { "epoch": 7.640750670241287, "grad_norm": 1.6441187858581543, "learning_rate": 3.883260179297519e-05, "loss": 0.7781, "num_input_tokens_seen": 29774592, "step": 51300 }, { "epoch": 7.641495382782246, "grad_norm": 1.3510884046554565, "learning_rate": 3.882989497515429e-05, "loss": 0.7022, "num_input_tokens_seen": 29777344, "step": 51305 }, { "epoch": 7.6422400953232055, "grad_norm": 1.7380074262619019, "learning_rate": 3.8827187923691365e-05, "loss": 0.7005, "num_input_tokens_seen": 29780256, "step": 51310 }, { "epoch": 7.642984807864164, "grad_norm": 1.5315968990325928, "learning_rate": 3.882448063863216e-05, "loss": 0.5237, "num_input_tokens_seen": 29783296, "step": 51315 }, { "epoch": 7.643729520405124, "grad_norm": 0.9835026860237122, "learning_rate": 3.882177312002241e-05, "loss": 0.738, "num_input_tokens_seen": 29786240, "step": 51320 }, { "epoch": 7.644474232946083, "grad_norm": 1.0032479763031006, "learning_rate": 3.881906536790784e-05, "loss": 0.5375, "num_input_tokens_seen": 29789120, "step": 51325 }, { "epoch": 7.645218945487042, "grad_norm": 0.9746387004852295, "learning_rate": 3.881635738233421e-05, "loss": 0.5886, "num_input_tokens_seen": 29791840, "step": 51330 }, { "epoch": 7.645963658028001, "grad_norm": 0.8567095398902893, "learning_rate": 3.8813649163347266e-05, "loss": 0.6745, "num_input_tokens_seen": 29794624, "step": 51335 }, { "epoch": 7.646708370568961, "grad_norm": 1.7821764945983887, "learning_rate": 3.881094071099276e-05, "loss": 0.6733, "num_input_tokens_seen": 29797376, "step": 51340 }, { "epoch": 7.6474530831099194, "grad_norm": 1.0686280727386475, "learning_rate": 3.880823202531644e-05, "loss": 0.6721, "num_input_tokens_seen": 29800256, "step": 51345 }, { "epoch": 7.648197795650879, "grad_norm": 1.1211978197097778, "learning_rate": 3.880552310636408e-05, "loss": 0.7603, "num_input_tokens_seen": 29803360, "step": 51350 }, { "epoch": 7.648942508191838, "grad_norm": 1.6198205947875977, "learning_rate": 3.880281395418144e-05, "loss": 0.7223, "num_input_tokens_seen": 29806720, "step": 51355 }, { "epoch": 7.6496872207327975, "grad_norm": 0.6697562336921692, "learning_rate": 3.8800104568814275e-05, "loss": 0.7546, "num_input_tokens_seen": 29809408, "step": 51360 }, { "epoch": 7.650431933273756, "grad_norm": 1.4927808046340942, "learning_rate": 3.879739495030839e-05, "loss": 0.6178, "num_input_tokens_seen": 29812352, "step": 51365 }, { "epoch": 7.651176645814716, "grad_norm": 0.9592649936676025, "learning_rate": 3.879468509870953e-05, "loss": 0.5214, "num_input_tokens_seen": 29815424, "step": 51370 }, { "epoch": 7.651921358355675, "grad_norm": 0.8896023631095886, "learning_rate": 3.879197501406347e-05, "loss": 0.583, "num_input_tokens_seen": 29818016, "step": 51375 }, { "epoch": 7.652666070896634, "grad_norm": 1.2722976207733154, "learning_rate": 3.878926469641603e-05, "loss": 0.6395, "num_input_tokens_seen": 29821024, "step": 51380 }, { "epoch": 7.653410783437593, "grad_norm": 1.39303719997406, "learning_rate": 3.878655414581297e-05, "loss": 0.6734, "num_input_tokens_seen": 29824128, "step": 51385 }, { "epoch": 7.654155495978552, "grad_norm": 2.5637590885162354, "learning_rate": 3.878384336230009e-05, "loss": 0.7771, "num_input_tokens_seen": 29827008, "step": 51390 }, { "epoch": 7.6549002085195115, "grad_norm": 0.8079811930656433, "learning_rate": 3.878113234592319e-05, "loss": 0.6821, "num_input_tokens_seen": 29829920, "step": 51395 }, { "epoch": 7.655644921060471, "grad_norm": 1.0946271419525146, "learning_rate": 3.8778421096728065e-05, "loss": 0.6875, "num_input_tokens_seen": 29832800, "step": 51400 }, { "epoch": 7.65638963360143, "grad_norm": 1.058219313621521, "learning_rate": 3.8775709614760514e-05, "loss": 0.5998, "num_input_tokens_seen": 29835584, "step": 51405 }, { "epoch": 7.657134346142389, "grad_norm": 1.6779522895812988, "learning_rate": 3.877299790006635e-05, "loss": 0.7938, "num_input_tokens_seen": 29838784, "step": 51410 }, { "epoch": 7.657879058683348, "grad_norm": 1.338655948638916, "learning_rate": 3.877028595269139e-05, "loss": 0.6491, "num_input_tokens_seen": 29841760, "step": 51415 }, { "epoch": 7.658623771224307, "grad_norm": 0.6438277959823608, "learning_rate": 3.876757377268144e-05, "loss": 0.5787, "num_input_tokens_seen": 29844608, "step": 51420 }, { "epoch": 7.659368483765267, "grad_norm": 1.3099349737167358, "learning_rate": 3.8764861360082324e-05, "loss": 0.6015, "num_input_tokens_seen": 29847488, "step": 51425 }, { "epoch": 7.6601131963062254, "grad_norm": 0.8953874707221985, "learning_rate": 3.876214871493987e-05, "loss": 0.6824, "num_input_tokens_seen": 29850560, "step": 51430 }, { "epoch": 7.660857908847185, "grad_norm": 0.7858549356460571, "learning_rate": 3.8759435837299904e-05, "loss": 0.5694, "num_input_tokens_seen": 29853632, "step": 51435 }, { "epoch": 7.661602621388144, "grad_norm": 1.4309799671173096, "learning_rate": 3.8756722727208246e-05, "loss": 0.5572, "num_input_tokens_seen": 29856512, "step": 51440 }, { "epoch": 7.6623473339291035, "grad_norm": 0.7962716817855835, "learning_rate": 3.8754009384710736e-05, "loss": 0.7096, "num_input_tokens_seen": 29859424, "step": 51445 }, { "epoch": 7.663092046470062, "grad_norm": 0.9975802898406982, "learning_rate": 3.8751295809853225e-05, "loss": 0.6403, "num_input_tokens_seen": 29862176, "step": 51450 }, { "epoch": 7.663836759011022, "grad_norm": 0.9162741303443909, "learning_rate": 3.8748582002681545e-05, "loss": 0.5351, "num_input_tokens_seen": 29865120, "step": 51455 }, { "epoch": 7.664581471551981, "grad_norm": 2.1813714504241943, "learning_rate": 3.8745867963241545e-05, "loss": 0.7725, "num_input_tokens_seen": 29868096, "step": 51460 }, { "epoch": 7.66532618409294, "grad_norm": 1.1798007488250732, "learning_rate": 3.874315369157907e-05, "loss": 0.5057, "num_input_tokens_seen": 29870880, "step": 51465 }, { "epoch": 7.666070896633899, "grad_norm": 1.768982172012329, "learning_rate": 3.8740439187739993e-05, "loss": 0.6651, "num_input_tokens_seen": 29873632, "step": 51470 }, { "epoch": 7.666815609174859, "grad_norm": 1.420591950416565, "learning_rate": 3.873772445177015e-05, "loss": 0.6757, "num_input_tokens_seen": 29876448, "step": 51475 }, { "epoch": 7.6675603217158175, "grad_norm": 1.6884281635284424, "learning_rate": 3.873500948371542e-05, "loss": 0.783, "num_input_tokens_seen": 29879424, "step": 51480 }, { "epoch": 7.668305034256777, "grad_norm": 1.7035813331604004, "learning_rate": 3.873229428362167e-05, "loss": 0.6769, "num_input_tokens_seen": 29882336, "step": 51485 }, { "epoch": 7.669049746797736, "grad_norm": 1.321421504020691, "learning_rate": 3.872957885153476e-05, "loss": 0.6017, "num_input_tokens_seen": 29885088, "step": 51490 }, { "epoch": 7.6697944593386955, "grad_norm": 0.8705435991287231, "learning_rate": 3.8726863187500564e-05, "loss": 0.6006, "num_input_tokens_seen": 29888032, "step": 51495 }, { "epoch": 7.670539171879654, "grad_norm": 1.96678626537323, "learning_rate": 3.872414729156497e-05, "loss": 0.6335, "num_input_tokens_seen": 29890880, "step": 51500 }, { "epoch": 7.671283884420614, "grad_norm": 0.8808736801147461, "learning_rate": 3.872143116377386e-05, "loss": 0.5766, "num_input_tokens_seen": 29893952, "step": 51505 }, { "epoch": 7.672028596961573, "grad_norm": 1.202449917793274, "learning_rate": 3.871871480417311e-05, "loss": 0.7059, "num_input_tokens_seen": 29896960, "step": 51510 }, { "epoch": 7.672773309502532, "grad_norm": 0.8571066856384277, "learning_rate": 3.871599821280863e-05, "loss": 0.4626, "num_input_tokens_seen": 29900288, "step": 51515 }, { "epoch": 7.673518022043491, "grad_norm": 1.0856832265853882, "learning_rate": 3.8713281389726285e-05, "loss": 0.632, "num_input_tokens_seen": 29903520, "step": 51520 }, { "epoch": 7.674262734584451, "grad_norm": 0.7983782887458801, "learning_rate": 3.871056433497199e-05, "loss": 0.6639, "num_input_tokens_seen": 29906368, "step": 51525 }, { "epoch": 7.6750074471254095, "grad_norm": 1.183609962463379, "learning_rate": 3.870784704859165e-05, "loss": 0.7396, "num_input_tokens_seen": 29909376, "step": 51530 }, { "epoch": 7.675752159666369, "grad_norm": 2.2834131717681885, "learning_rate": 3.8705129530631165e-05, "loss": 0.6785, "num_input_tokens_seen": 29912544, "step": 51535 }, { "epoch": 7.676496872207328, "grad_norm": 1.0564255714416504, "learning_rate": 3.870241178113645e-05, "loss": 0.5218, "num_input_tokens_seen": 29915168, "step": 51540 }, { "epoch": 7.6772415847482876, "grad_norm": 1.4103050231933594, "learning_rate": 3.86996938001534e-05, "loss": 0.6188, "num_input_tokens_seen": 29918272, "step": 51545 }, { "epoch": 7.677986297289246, "grad_norm": 0.8848995566368103, "learning_rate": 3.869697558772796e-05, "loss": 0.5644, "num_input_tokens_seen": 29921440, "step": 51550 }, { "epoch": 7.678731009830205, "grad_norm": 0.8946012258529663, "learning_rate": 3.8694257143906035e-05, "loss": 0.7074, "num_input_tokens_seen": 29924352, "step": 51555 }, { "epoch": 7.679475722371165, "grad_norm": 1.0645018815994263, "learning_rate": 3.869153846873356e-05, "loss": 0.7583, "num_input_tokens_seen": 29927488, "step": 51560 }, { "epoch": 7.680220434912124, "grad_norm": 1.7065545320510864, "learning_rate": 3.868881956225645e-05, "loss": 0.7064, "num_input_tokens_seen": 29930272, "step": 51565 }, { "epoch": 7.680965147453083, "grad_norm": 0.7546919584274292, "learning_rate": 3.868610042452065e-05, "loss": 0.5348, "num_input_tokens_seen": 29933152, "step": 51570 }, { "epoch": 7.681709859994042, "grad_norm": 1.8665027618408203, "learning_rate": 3.8683381055572095e-05, "loss": 0.7778, "num_input_tokens_seen": 29935936, "step": 51575 }, { "epoch": 7.6824545725350015, "grad_norm": 1.0856859683990479, "learning_rate": 3.868066145545672e-05, "loss": 0.5857, "num_input_tokens_seen": 29938688, "step": 51580 }, { "epoch": 7.683199285075961, "grad_norm": 1.1600030660629272, "learning_rate": 3.867794162422047e-05, "loss": 0.5981, "num_input_tokens_seen": 29941472, "step": 51585 }, { "epoch": 7.68394399761692, "grad_norm": 1.2656917572021484, "learning_rate": 3.86752215619093e-05, "loss": 0.5452, "num_input_tokens_seen": 29944224, "step": 51590 }, { "epoch": 7.684688710157879, "grad_norm": 0.7782050967216492, "learning_rate": 3.867250126856917e-05, "loss": 0.6434, "num_input_tokens_seen": 29947296, "step": 51595 }, { "epoch": 7.685433422698838, "grad_norm": 1.7314902544021606, "learning_rate": 3.866978074424602e-05, "loss": 0.7289, "num_input_tokens_seen": 29950336, "step": 51600 }, { "epoch": 7.686178135239797, "grad_norm": 0.8387249112129211, "learning_rate": 3.866705998898582e-05, "loss": 0.5057, "num_input_tokens_seen": 29953184, "step": 51605 }, { "epoch": 7.686922847780757, "grad_norm": 0.9272893071174622, "learning_rate": 3.866433900283453e-05, "loss": 0.6941, "num_input_tokens_seen": 29956000, "step": 51610 }, { "epoch": 7.6876675603217155, "grad_norm": 0.9772170186042786, "learning_rate": 3.866161778583812e-05, "loss": 0.5437, "num_input_tokens_seen": 29958944, "step": 51615 }, { "epoch": 7.688412272862675, "grad_norm": 1.1663316488265991, "learning_rate": 3.865889633804257e-05, "loss": 0.5361, "num_input_tokens_seen": 29962016, "step": 51620 }, { "epoch": 7.689156985403634, "grad_norm": 1.6472326517105103, "learning_rate": 3.8656174659493835e-05, "loss": 0.7372, "num_input_tokens_seen": 29964928, "step": 51625 }, { "epoch": 7.6899016979445936, "grad_norm": 1.6543104648590088, "learning_rate": 3.865345275023792e-05, "loss": 0.6398, "num_input_tokens_seen": 29968000, "step": 51630 }, { "epoch": 7.690646410485552, "grad_norm": 1.060522437095642, "learning_rate": 3.8650730610320796e-05, "loss": 0.6765, "num_input_tokens_seen": 29970912, "step": 51635 }, { "epoch": 7.691391123026512, "grad_norm": 1.80040442943573, "learning_rate": 3.864800823978845e-05, "loss": 0.6508, "num_input_tokens_seen": 29973824, "step": 51640 }, { "epoch": 7.692135835567471, "grad_norm": 1.9335284233093262, "learning_rate": 3.864528563868687e-05, "loss": 0.6587, "num_input_tokens_seen": 29976736, "step": 51645 }, { "epoch": 7.69288054810843, "grad_norm": 2.3914361000061035, "learning_rate": 3.864256280706206e-05, "loss": 0.6467, "num_input_tokens_seen": 29979712, "step": 51650 }, { "epoch": 7.693625260649389, "grad_norm": 1.016241431236267, "learning_rate": 3.8639839744960025e-05, "loss": 0.6402, "num_input_tokens_seen": 29982720, "step": 51655 }, { "epoch": 7.694369973190349, "grad_norm": 1.0115704536437988, "learning_rate": 3.863711645242676e-05, "loss": 0.4618, "num_input_tokens_seen": 29985632, "step": 51660 }, { "epoch": 7.6951146857313075, "grad_norm": 0.9531733393669128, "learning_rate": 3.863439292950827e-05, "loss": 0.6105, "num_input_tokens_seen": 29989120, "step": 51665 }, { "epoch": 7.695859398272267, "grad_norm": 1.2042149305343628, "learning_rate": 3.863166917625056e-05, "loss": 0.6568, "num_input_tokens_seen": 29991968, "step": 51670 }, { "epoch": 7.696604110813226, "grad_norm": 1.9096487760543823, "learning_rate": 3.862894519269966e-05, "loss": 0.7837, "num_input_tokens_seen": 29994720, "step": 51675 }, { "epoch": 7.697348823354186, "grad_norm": 1.1030359268188477, "learning_rate": 3.8626220978901585e-05, "loss": 0.578, "num_input_tokens_seen": 29997728, "step": 51680 }, { "epoch": 7.698093535895144, "grad_norm": 1.2872061729431152, "learning_rate": 3.862349653490236e-05, "loss": 0.6805, "num_input_tokens_seen": 30000480, "step": 51685 }, { "epoch": 7.698838248436104, "grad_norm": 0.6313443779945374, "learning_rate": 3.8620771860748005e-05, "loss": 0.6648, "num_input_tokens_seen": 30003648, "step": 51690 }, { "epoch": 7.699582960977063, "grad_norm": 0.8789545297622681, "learning_rate": 3.861804695648455e-05, "loss": 0.4746, "num_input_tokens_seen": 30006432, "step": 51695 }, { "epoch": 7.700327673518022, "grad_norm": 1.9054045677185059, "learning_rate": 3.861532182215802e-05, "loss": 0.6586, "num_input_tokens_seen": 30009248, "step": 51700 }, { "epoch": 7.701072386058981, "grad_norm": 1.1883833408355713, "learning_rate": 3.861259645781449e-05, "loss": 0.6129, "num_input_tokens_seen": 30011968, "step": 51705 }, { "epoch": 7.701817098599941, "grad_norm": 1.0018742084503174, "learning_rate": 3.860987086349996e-05, "loss": 0.6307, "num_input_tokens_seen": 30014816, "step": 51710 }, { "epoch": 7.7025618111408996, "grad_norm": 1.0660356283187866, "learning_rate": 3.86071450392605e-05, "loss": 0.5098, "num_input_tokens_seen": 30017696, "step": 51715 }, { "epoch": 7.703306523681858, "grad_norm": 0.978858470916748, "learning_rate": 3.860441898514215e-05, "loss": 0.6069, "num_input_tokens_seen": 30020608, "step": 51720 }, { "epoch": 7.704051236222818, "grad_norm": 1.2491000890731812, "learning_rate": 3.8601692701190975e-05, "loss": 0.6511, "num_input_tokens_seen": 30023744, "step": 51725 }, { "epoch": 7.704795948763778, "grad_norm": 0.9261216521263123, "learning_rate": 3.8598966187453034e-05, "loss": 0.555, "num_input_tokens_seen": 30026880, "step": 51730 }, { "epoch": 7.705540661304736, "grad_norm": 1.1190117597579956, "learning_rate": 3.859623944397437e-05, "loss": 0.7606, "num_input_tokens_seen": 30029888, "step": 51735 }, { "epoch": 7.706285373845695, "grad_norm": 2.2132983207702637, "learning_rate": 3.859351247080106e-05, "loss": 0.6136, "num_input_tokens_seen": 30033024, "step": 51740 }, { "epoch": 7.707030086386655, "grad_norm": 1.0707008838653564, "learning_rate": 3.859078526797917e-05, "loss": 0.6038, "num_input_tokens_seen": 30035744, "step": 51745 }, { "epoch": 7.707774798927614, "grad_norm": 1.2664637565612793, "learning_rate": 3.8588057835554776e-05, "loss": 0.5404, "num_input_tokens_seen": 30038496, "step": 51750 }, { "epoch": 7.708519511468573, "grad_norm": 1.7628393173217773, "learning_rate": 3.858533017357396e-05, "loss": 0.7121, "num_input_tokens_seen": 30041376, "step": 51755 }, { "epoch": 7.709264224009532, "grad_norm": 1.5059576034545898, "learning_rate": 3.858260228208279e-05, "loss": 0.7002, "num_input_tokens_seen": 30044224, "step": 51760 }, { "epoch": 7.710008936550492, "grad_norm": 0.7433223724365234, "learning_rate": 3.857987416112737e-05, "loss": 0.6244, "num_input_tokens_seen": 30046816, "step": 51765 }, { "epoch": 7.71075364909145, "grad_norm": 1.6936209201812744, "learning_rate": 3.857714581075377e-05, "loss": 0.6285, "num_input_tokens_seen": 30049600, "step": 51770 }, { "epoch": 7.71149836163241, "grad_norm": 1.2099837064743042, "learning_rate": 3.85744172310081e-05, "loss": 0.6736, "num_input_tokens_seen": 30052736, "step": 51775 }, { "epoch": 7.712243074173369, "grad_norm": 3.1301112174987793, "learning_rate": 3.8571688421936434e-05, "loss": 0.6911, "num_input_tokens_seen": 30055520, "step": 51780 }, { "epoch": 7.712987786714328, "grad_norm": 1.0643969774246216, "learning_rate": 3.85689593835849e-05, "loss": 0.709, "num_input_tokens_seen": 30058528, "step": 51785 }, { "epoch": 7.713732499255287, "grad_norm": 2.259852409362793, "learning_rate": 3.8566230115999575e-05, "loss": 0.67, "num_input_tokens_seen": 30061312, "step": 51790 }, { "epoch": 7.714477211796247, "grad_norm": 0.897920548915863, "learning_rate": 3.856350061922659e-05, "loss": 0.6362, "num_input_tokens_seen": 30064320, "step": 51795 }, { "epoch": 7.7152219243372056, "grad_norm": 1.2875477075576782, "learning_rate": 3.856077089331204e-05, "loss": 0.7239, "num_input_tokens_seen": 30067168, "step": 51800 }, { "epoch": 7.715966636878165, "grad_norm": 1.2169015407562256, "learning_rate": 3.855804093830205e-05, "loss": 0.7539, "num_input_tokens_seen": 30070144, "step": 51805 }, { "epoch": 7.716711349419124, "grad_norm": 1.0713859796524048, "learning_rate": 3.855531075424274e-05, "loss": 0.6656, "num_input_tokens_seen": 30073088, "step": 51810 }, { "epoch": 7.717456061960084, "grad_norm": 2.0644497871398926, "learning_rate": 3.8552580341180236e-05, "loss": 0.6907, "num_input_tokens_seen": 30075872, "step": 51815 }, { "epoch": 7.718200774501042, "grad_norm": 1.3744045495986938, "learning_rate": 3.8549849699160655e-05, "loss": 0.6625, "num_input_tokens_seen": 30078816, "step": 51820 }, { "epoch": 7.718945487042002, "grad_norm": 1.0371782779693604, "learning_rate": 3.8547118828230135e-05, "loss": 0.4269, "num_input_tokens_seen": 30081472, "step": 51825 }, { "epoch": 7.719690199582961, "grad_norm": 1.1879639625549316, "learning_rate": 3.854438772843482e-05, "loss": 0.7116, "num_input_tokens_seen": 30084352, "step": 51830 }, { "epoch": 7.72043491212392, "grad_norm": 0.7375251650810242, "learning_rate": 3.8541656399820825e-05, "loss": 0.64, "num_input_tokens_seen": 30087296, "step": 51835 }, { "epoch": 7.721179624664879, "grad_norm": 1.219139575958252, "learning_rate": 3.853892484243432e-05, "loss": 0.7286, "num_input_tokens_seen": 30090304, "step": 51840 }, { "epoch": 7.721924337205839, "grad_norm": 2.1927788257598877, "learning_rate": 3.8536193056321436e-05, "loss": 0.8939, "num_input_tokens_seen": 30092928, "step": 51845 }, { "epoch": 7.722669049746798, "grad_norm": 1.3206297159194946, "learning_rate": 3.853346104152833e-05, "loss": 0.6111, "num_input_tokens_seen": 30096032, "step": 51850 }, { "epoch": 7.723413762287757, "grad_norm": 0.7930057048797607, "learning_rate": 3.853072879810115e-05, "loss": 0.5291, "num_input_tokens_seen": 30098592, "step": 51855 }, { "epoch": 7.724158474828716, "grad_norm": 1.0350475311279297, "learning_rate": 3.8527996326086065e-05, "loss": 0.7267, "num_input_tokens_seen": 30101632, "step": 51860 }, { "epoch": 7.724903187369676, "grad_norm": 1.1598883867263794, "learning_rate": 3.852526362552923e-05, "loss": 0.5979, "num_input_tokens_seen": 30104608, "step": 51865 }, { "epoch": 7.725647899910634, "grad_norm": 1.8635512590408325, "learning_rate": 3.852253069647681e-05, "loss": 0.6013, "num_input_tokens_seen": 30107200, "step": 51870 }, { "epoch": 7.726392612451594, "grad_norm": 1.143721103668213, "learning_rate": 3.851979753897498e-05, "loss": 0.6355, "num_input_tokens_seen": 30110112, "step": 51875 }, { "epoch": 7.727137324992553, "grad_norm": 1.5254279375076294, "learning_rate": 3.8517064153069905e-05, "loss": 0.5409, "num_input_tokens_seen": 30112928, "step": 51880 }, { "epoch": 7.727882037533512, "grad_norm": 1.3372851610183716, "learning_rate": 3.8514330538807775e-05, "loss": 0.6684, "num_input_tokens_seen": 30116096, "step": 51885 }, { "epoch": 7.728626750074471, "grad_norm": 1.0581618547439575, "learning_rate": 3.8511596696234765e-05, "loss": 0.6009, "num_input_tokens_seen": 30119552, "step": 51890 }, { "epoch": 7.729371462615431, "grad_norm": 1.788699746131897, "learning_rate": 3.8508862625397055e-05, "loss": 0.7241, "num_input_tokens_seen": 30122336, "step": 51895 }, { "epoch": 7.73011617515639, "grad_norm": 1.287680983543396, "learning_rate": 3.850612832634085e-05, "loss": 0.6757, "num_input_tokens_seen": 30125536, "step": 51900 }, { "epoch": 7.730860887697348, "grad_norm": 2.687375783920288, "learning_rate": 3.850339379911233e-05, "loss": 0.6119, "num_input_tokens_seen": 30128352, "step": 51905 }, { "epoch": 7.731605600238308, "grad_norm": 1.0933963060379028, "learning_rate": 3.8500659043757705e-05, "loss": 0.6309, "num_input_tokens_seen": 30131072, "step": 51910 }, { "epoch": 7.732350312779268, "grad_norm": 1.2311997413635254, "learning_rate": 3.8497924060323154e-05, "loss": 0.5832, "num_input_tokens_seen": 30133856, "step": 51915 }, { "epoch": 7.733095025320226, "grad_norm": 1.3213465213775635, "learning_rate": 3.84951888488549e-05, "loss": 0.747, "num_input_tokens_seen": 30136864, "step": 51920 }, { "epoch": 7.733839737861185, "grad_norm": 1.1945704221725464, "learning_rate": 3.849245340939914e-05, "loss": 0.5478, "num_input_tokens_seen": 30139808, "step": 51925 }, { "epoch": 7.734584450402145, "grad_norm": 1.0172816514968872, "learning_rate": 3.84897177420021e-05, "loss": 0.648, "num_input_tokens_seen": 30142624, "step": 51930 }, { "epoch": 7.735329162943104, "grad_norm": 0.9362760186195374, "learning_rate": 3.848698184670999e-05, "loss": 0.5039, "num_input_tokens_seen": 30145472, "step": 51935 }, { "epoch": 7.736073875484063, "grad_norm": 2.0816574096679688, "learning_rate": 3.848424572356902e-05, "loss": 0.5523, "num_input_tokens_seen": 30148032, "step": 51940 }, { "epoch": 7.736818588025022, "grad_norm": 0.8763095140457153, "learning_rate": 3.848150937262544e-05, "loss": 0.6297, "num_input_tokens_seen": 30150848, "step": 51945 }, { "epoch": 7.737563300565982, "grad_norm": 1.2324992418289185, "learning_rate": 3.847877279392546e-05, "loss": 0.6009, "num_input_tokens_seen": 30153728, "step": 51950 }, { "epoch": 7.73830801310694, "grad_norm": 1.3658864498138428, "learning_rate": 3.847603598751529e-05, "loss": 0.6901, "num_input_tokens_seen": 30156512, "step": 51955 }, { "epoch": 7.7390527256479, "grad_norm": 2.146336078643799, "learning_rate": 3.847329895344121e-05, "loss": 0.7229, "num_input_tokens_seen": 30159360, "step": 51960 }, { "epoch": 7.739797438188859, "grad_norm": 1.7615420818328857, "learning_rate": 3.847056169174942e-05, "loss": 0.7682, "num_input_tokens_seen": 30162208, "step": 51965 }, { "epoch": 7.740542150729818, "grad_norm": 0.8853272199630737, "learning_rate": 3.846782420248619e-05, "loss": 0.7305, "num_input_tokens_seen": 30165216, "step": 51970 }, { "epoch": 7.741286863270777, "grad_norm": 1.2067712545394897, "learning_rate": 3.8465086485697766e-05, "loss": 0.5656, "num_input_tokens_seen": 30168256, "step": 51975 }, { "epoch": 7.742031575811737, "grad_norm": 1.184190273284912, "learning_rate": 3.8462348541430396e-05, "loss": 0.6268, "num_input_tokens_seen": 30171104, "step": 51980 }, { "epoch": 7.742776288352696, "grad_norm": 1.8176239728927612, "learning_rate": 3.8459610369730316e-05, "loss": 0.6576, "num_input_tokens_seen": 30174112, "step": 51985 }, { "epoch": 7.743521000893655, "grad_norm": 1.4479880332946777, "learning_rate": 3.8456871970643794e-05, "loss": 0.6262, "num_input_tokens_seen": 30176928, "step": 51990 }, { "epoch": 7.744265713434614, "grad_norm": 0.949795663356781, "learning_rate": 3.8454133344217105e-05, "loss": 0.6397, "num_input_tokens_seen": 30179872, "step": 51995 }, { "epoch": 7.745010425975574, "grad_norm": 0.9315348863601685, "learning_rate": 3.8451394490496505e-05, "loss": 0.5748, "num_input_tokens_seen": 30182912, "step": 52000 }, { "epoch": 7.745755138516532, "grad_norm": 1.5830179452896118, "learning_rate": 3.8448655409528274e-05, "loss": 0.716, "num_input_tokens_seen": 30186272, "step": 52005 }, { "epoch": 7.746499851057492, "grad_norm": 0.9808546900749207, "learning_rate": 3.844591610135867e-05, "loss": 0.608, "num_input_tokens_seen": 30189152, "step": 52010 }, { "epoch": 7.747244563598451, "grad_norm": 1.4914699792861938, "learning_rate": 3.844317656603398e-05, "loss": 0.5786, "num_input_tokens_seen": 30192128, "step": 52015 }, { "epoch": 7.7479892761394105, "grad_norm": 1.2249162197113037, "learning_rate": 3.844043680360049e-05, "loss": 0.6274, "num_input_tokens_seen": 30194848, "step": 52020 }, { "epoch": 7.748733988680369, "grad_norm": 1.4421545267105103, "learning_rate": 3.8437696814104476e-05, "loss": 0.7018, "num_input_tokens_seen": 30197760, "step": 52025 }, { "epoch": 7.749478701221329, "grad_norm": 1.262107253074646, "learning_rate": 3.8434956597592234e-05, "loss": 0.6152, "num_input_tokens_seen": 30200704, "step": 52030 }, { "epoch": 7.750223413762288, "grad_norm": 0.9234206676483154, "learning_rate": 3.8432216154110053e-05, "loss": 0.5837, "num_input_tokens_seen": 30203904, "step": 52035 }, { "epoch": 7.750968126303247, "grad_norm": 0.9342339038848877, "learning_rate": 3.8429475483704236e-05, "loss": 0.6544, "num_input_tokens_seen": 30207104, "step": 52040 }, { "epoch": 7.751712838844206, "grad_norm": 1.495039463043213, "learning_rate": 3.842673458642108e-05, "loss": 0.6239, "num_input_tokens_seen": 30209888, "step": 52045 }, { "epoch": 7.752457551385166, "grad_norm": 2.219806432723999, "learning_rate": 3.842399346230688e-05, "loss": 0.7126, "num_input_tokens_seen": 30212672, "step": 52050 }, { "epoch": 7.753202263926124, "grad_norm": 0.7981657385826111, "learning_rate": 3.842125211140796e-05, "loss": 0.5228, "num_input_tokens_seen": 30215616, "step": 52055 }, { "epoch": 7.753946976467084, "grad_norm": 1.3169044256210327, "learning_rate": 3.8418510533770624e-05, "loss": 0.6748, "num_input_tokens_seen": 30219456, "step": 52060 }, { "epoch": 7.754691689008043, "grad_norm": 1.058746337890625, "learning_rate": 3.841576872944119e-05, "loss": 0.5058, "num_input_tokens_seen": 30222656, "step": 52065 }, { "epoch": 7.755436401549002, "grad_norm": 1.4927270412445068, "learning_rate": 3.841302669846599e-05, "loss": 0.7153, "num_input_tokens_seen": 30225504, "step": 52070 }, { "epoch": 7.756181114089961, "grad_norm": 1.0884730815887451, "learning_rate": 3.841028444089133e-05, "loss": 0.7871, "num_input_tokens_seen": 30228256, "step": 52075 }, { "epoch": 7.756925826630921, "grad_norm": 1.103976845741272, "learning_rate": 3.840754195676354e-05, "loss": 0.5927, "num_input_tokens_seen": 30231200, "step": 52080 }, { "epoch": 7.75767053917188, "grad_norm": 1.303669810295105, "learning_rate": 3.8404799246128956e-05, "loss": 0.6199, "num_input_tokens_seen": 30234272, "step": 52085 }, { "epoch": 7.758415251712838, "grad_norm": 1.3724665641784668, "learning_rate": 3.8402056309033915e-05, "loss": 0.6708, "num_input_tokens_seen": 30236992, "step": 52090 }, { "epoch": 7.759159964253798, "grad_norm": 1.2098838090896606, "learning_rate": 3.839931314552475e-05, "loss": 0.6162, "num_input_tokens_seen": 30240064, "step": 52095 }, { "epoch": 7.759904676794758, "grad_norm": 1.9605532884597778, "learning_rate": 3.8396569755647816e-05, "loss": 0.6391, "num_input_tokens_seen": 30243424, "step": 52100 }, { "epoch": 7.7606493893357165, "grad_norm": 0.8538742661476135, "learning_rate": 3.839382613944944e-05, "loss": 0.4691, "num_input_tokens_seen": 30246048, "step": 52105 }, { "epoch": 7.761394101876675, "grad_norm": 1.1731798648834229, "learning_rate": 3.8391082296976e-05, "loss": 0.791, "num_input_tokens_seen": 30248832, "step": 52110 }, { "epoch": 7.762138814417635, "grad_norm": 1.0573921203613281, "learning_rate": 3.8388338228273824e-05, "loss": 0.6578, "num_input_tokens_seen": 30251552, "step": 52115 }, { "epoch": 7.762883526958594, "grad_norm": 0.9711741209030151, "learning_rate": 3.838559393338927e-05, "loss": 0.5563, "num_input_tokens_seen": 30254304, "step": 52120 }, { "epoch": 7.763628239499553, "grad_norm": 1.5621367692947388, "learning_rate": 3.838284941236873e-05, "loss": 0.6045, "num_input_tokens_seen": 30257312, "step": 52125 }, { "epoch": 7.764372952040512, "grad_norm": 1.5574373006820679, "learning_rate": 3.8380104665258545e-05, "loss": 0.624, "num_input_tokens_seen": 30260160, "step": 52130 }, { "epoch": 7.765117664581472, "grad_norm": 1.458718180656433, "learning_rate": 3.837735969210509e-05, "loss": 0.6467, "num_input_tokens_seen": 30263200, "step": 52135 }, { "epoch": 7.76586237712243, "grad_norm": 0.9668581485748291, "learning_rate": 3.837461449295474e-05, "loss": 0.605, "num_input_tokens_seen": 30266048, "step": 52140 }, { "epoch": 7.76660708966339, "grad_norm": 0.8464290499687195, "learning_rate": 3.837186906785387e-05, "loss": 0.6265, "num_input_tokens_seen": 30269024, "step": 52145 }, { "epoch": 7.767351802204349, "grad_norm": 1.143006682395935, "learning_rate": 3.836912341684886e-05, "loss": 0.6643, "num_input_tokens_seen": 30271904, "step": 52150 }, { "epoch": 7.7680965147453085, "grad_norm": 1.1053874492645264, "learning_rate": 3.83663775399861e-05, "loss": 0.731, "num_input_tokens_seen": 30274752, "step": 52155 }, { "epoch": 7.768841227286267, "grad_norm": 0.8292887210845947, "learning_rate": 3.836363143731198e-05, "loss": 0.5798, "num_input_tokens_seen": 30277696, "step": 52160 }, { "epoch": 7.769585939827227, "grad_norm": 1.5706923007965088, "learning_rate": 3.8360885108872885e-05, "loss": 0.7417, "num_input_tokens_seen": 30280832, "step": 52165 }, { "epoch": 7.770330652368186, "grad_norm": 1.0529849529266357, "learning_rate": 3.8358138554715215e-05, "loss": 0.6817, "num_input_tokens_seen": 30283872, "step": 52170 }, { "epoch": 7.771075364909145, "grad_norm": 1.3785209655761719, "learning_rate": 3.8355391774885375e-05, "loss": 0.4847, "num_input_tokens_seen": 30286944, "step": 52175 }, { "epoch": 7.771820077450104, "grad_norm": 1.1218764781951904, "learning_rate": 3.835264476942977e-05, "loss": 0.7866, "num_input_tokens_seen": 30289856, "step": 52180 }, { "epoch": 7.772564789991064, "grad_norm": 1.3468488454818726, "learning_rate": 3.834989753839479e-05, "loss": 0.5945, "num_input_tokens_seen": 30292736, "step": 52185 }, { "epoch": 7.7733095025320225, "grad_norm": 1.2000007629394531, "learning_rate": 3.834715008182687e-05, "loss": 0.7434, "num_input_tokens_seen": 30295520, "step": 52190 }, { "epoch": 7.774054215072982, "grad_norm": 1.0313639640808105, "learning_rate": 3.83444023997724e-05, "loss": 0.672, "num_input_tokens_seen": 30298976, "step": 52195 }, { "epoch": 7.774798927613941, "grad_norm": 0.9238858819007874, "learning_rate": 3.834165449227782e-05, "loss": 0.6174, "num_input_tokens_seen": 30301792, "step": 52200 }, { "epoch": 7.7755436401549005, "grad_norm": 2.0720160007476807, "learning_rate": 3.833890635938956e-05, "loss": 0.7288, "num_input_tokens_seen": 30304416, "step": 52205 }, { "epoch": 7.776288352695859, "grad_norm": 1.7041056156158447, "learning_rate": 3.8336158001154024e-05, "loss": 0.6078, "num_input_tokens_seen": 30307552, "step": 52210 }, { "epoch": 7.777033065236819, "grad_norm": 1.0057904720306396, "learning_rate": 3.8333409417617654e-05, "loss": 0.6345, "num_input_tokens_seen": 30310560, "step": 52215 }, { "epoch": 7.777777777777778, "grad_norm": 1.4739264249801636, "learning_rate": 3.8330660608826885e-05, "loss": 0.609, "num_input_tokens_seen": 30313472, "step": 52220 }, { "epoch": 7.778522490318737, "grad_norm": 1.1089671850204468, "learning_rate": 3.832791157482815e-05, "loss": 0.8225, "num_input_tokens_seen": 30316480, "step": 52225 }, { "epoch": 7.779267202859696, "grad_norm": 1.6860135793685913, "learning_rate": 3.8325162315667895e-05, "loss": 0.6312, "num_input_tokens_seen": 30319776, "step": 52230 }, { "epoch": 7.780011915400656, "grad_norm": 1.0786155462265015, "learning_rate": 3.832241283139256e-05, "loss": 0.6027, "num_input_tokens_seen": 30322752, "step": 52235 }, { "epoch": 7.7807566279416145, "grad_norm": 0.8494980931282043, "learning_rate": 3.831966312204861e-05, "loss": 0.5456, "num_input_tokens_seen": 30325472, "step": 52240 }, { "epoch": 7.781501340482574, "grad_norm": 0.8452039361000061, "learning_rate": 3.831691318768249e-05, "loss": 0.7175, "num_input_tokens_seen": 30328320, "step": 52245 }, { "epoch": 7.782246053023533, "grad_norm": 2.2232284545898438, "learning_rate": 3.831416302834065e-05, "loss": 0.4093, "num_input_tokens_seen": 30331168, "step": 52250 }, { "epoch": 7.782990765564492, "grad_norm": 1.5269728899002075, "learning_rate": 3.831141264406957e-05, "loss": 0.5484, "num_input_tokens_seen": 30333856, "step": 52255 }, { "epoch": 7.783735478105451, "grad_norm": 1.541347622871399, "learning_rate": 3.8308662034915685e-05, "loss": 0.5524, "num_input_tokens_seen": 30336640, "step": 52260 }, { "epoch": 7.784480190646411, "grad_norm": 1.7238044738769531, "learning_rate": 3.830591120092549e-05, "loss": 0.695, "num_input_tokens_seen": 30339968, "step": 52265 }, { "epoch": 7.78522490318737, "grad_norm": 1.1168216466903687, "learning_rate": 3.8303160142145444e-05, "loss": 0.5072, "num_input_tokens_seen": 30342688, "step": 52270 }, { "epoch": 7.7859696157283285, "grad_norm": 1.3436038494110107, "learning_rate": 3.830040885862204e-05, "loss": 0.5392, "num_input_tokens_seen": 30345856, "step": 52275 }, { "epoch": 7.786714328269288, "grad_norm": 1.0639985799789429, "learning_rate": 3.8297657350401735e-05, "loss": 0.6774, "num_input_tokens_seen": 30348896, "step": 52280 }, { "epoch": 7.787459040810247, "grad_norm": 0.8883997797966003, "learning_rate": 3.829490561753103e-05, "loss": 0.5804, "num_input_tokens_seen": 30351872, "step": 52285 }, { "epoch": 7.7882037533512065, "grad_norm": 2.0055887699127197, "learning_rate": 3.82921536600564e-05, "loss": 0.6176, "num_input_tokens_seen": 30354784, "step": 52290 }, { "epoch": 7.788948465892165, "grad_norm": 0.8382315635681152, "learning_rate": 3.828940147802435e-05, "loss": 0.5854, "num_input_tokens_seen": 30357600, "step": 52295 }, { "epoch": 7.789693178433125, "grad_norm": 0.9788637757301331, "learning_rate": 3.828664907148137e-05, "loss": 0.5126, "num_input_tokens_seen": 30360384, "step": 52300 }, { "epoch": 7.790437890974084, "grad_norm": 2.248185873031616, "learning_rate": 3.828389644047395e-05, "loss": 0.7474, "num_input_tokens_seen": 30363008, "step": 52305 }, { "epoch": 7.791182603515043, "grad_norm": 1.4922372102737427, "learning_rate": 3.8281143585048604e-05, "loss": 0.5615, "num_input_tokens_seen": 30366016, "step": 52310 }, { "epoch": 7.791927316056002, "grad_norm": 0.9172883629798889, "learning_rate": 3.8278390505251835e-05, "loss": 0.581, "num_input_tokens_seen": 30368928, "step": 52315 }, { "epoch": 7.792672028596962, "grad_norm": 1.3393425941467285, "learning_rate": 3.827563720113016e-05, "loss": 0.5841, "num_input_tokens_seen": 30372224, "step": 52320 }, { "epoch": 7.7934167411379205, "grad_norm": 1.0999915599822998, "learning_rate": 3.827288367273008e-05, "loss": 0.5153, "num_input_tokens_seen": 30374880, "step": 52325 }, { "epoch": 7.79416145367888, "grad_norm": 0.8061578273773193, "learning_rate": 3.827012992009812e-05, "loss": 0.501, "num_input_tokens_seen": 30377824, "step": 52330 }, { "epoch": 7.794906166219839, "grad_norm": 0.8939370512962341, "learning_rate": 3.826737594328082e-05, "loss": 0.6111, "num_input_tokens_seen": 30380896, "step": 52335 }, { "epoch": 7.7956508787607985, "grad_norm": 1.2078990936279297, "learning_rate": 3.826462174232467e-05, "loss": 0.6588, "num_input_tokens_seen": 30383616, "step": 52340 }, { "epoch": 7.796395591301757, "grad_norm": 1.03184974193573, "learning_rate": 3.8261867317276225e-05, "loss": 0.5975, "num_input_tokens_seen": 30386624, "step": 52345 }, { "epoch": 7.797140303842717, "grad_norm": 1.3071695566177368, "learning_rate": 3.8259112668181995e-05, "loss": 0.61, "num_input_tokens_seen": 30389728, "step": 52350 }, { "epoch": 7.797885016383676, "grad_norm": 1.311885952949524, "learning_rate": 3.825635779508855e-05, "loss": 0.7754, "num_input_tokens_seen": 30392608, "step": 52355 }, { "epoch": 7.798629728924635, "grad_norm": 1.3071459531784058, "learning_rate": 3.82536026980424e-05, "loss": 0.5905, "num_input_tokens_seen": 30395712, "step": 52360 }, { "epoch": 7.799374441465594, "grad_norm": 0.8528749942779541, "learning_rate": 3.825084737709011e-05, "loss": 0.6541, "num_input_tokens_seen": 30398560, "step": 52365 }, { "epoch": 7.800119154006554, "grad_norm": 0.8889904022216797, "learning_rate": 3.824809183227822e-05, "loss": 0.6274, "num_input_tokens_seen": 30401280, "step": 52370 }, { "epoch": 7.8008638665475125, "grad_norm": 1.33997642993927, "learning_rate": 3.824533606365329e-05, "loss": 0.5989, "num_input_tokens_seen": 30403840, "step": 52375 }, { "epoch": 7.801608579088472, "grad_norm": 1.3304208517074585, "learning_rate": 3.824258007126186e-05, "loss": 0.7184, "num_input_tokens_seen": 30406848, "step": 52380 }, { "epoch": 7.802353291629431, "grad_norm": 1.0163159370422363, "learning_rate": 3.82398238551505e-05, "loss": 0.6865, "num_input_tokens_seen": 30409600, "step": 52385 }, { "epoch": 7.803098004170391, "grad_norm": 1.5840154886245728, "learning_rate": 3.823706741536578e-05, "loss": 0.5711, "num_input_tokens_seen": 30412416, "step": 52390 }, { "epoch": 7.803842716711349, "grad_norm": 1.833100438117981, "learning_rate": 3.823431075195425e-05, "loss": 0.7213, "num_input_tokens_seen": 30415200, "step": 52395 }, { "epoch": 7.804587429252309, "grad_norm": 1.3637229204177856, "learning_rate": 3.8231553864962486e-05, "loss": 0.5453, "num_input_tokens_seen": 30418080, "step": 52400 }, { "epoch": 7.805332141793268, "grad_norm": 0.8569368124008179, "learning_rate": 3.8228796754437086e-05, "loss": 0.7105, "num_input_tokens_seen": 30420896, "step": 52405 }, { "epoch": 7.806076854334227, "grad_norm": 1.6443865299224854, "learning_rate": 3.8226039420424596e-05, "loss": 0.5362, "num_input_tokens_seen": 30423904, "step": 52410 }, { "epoch": 7.806821566875186, "grad_norm": 0.9138069152832031, "learning_rate": 3.822328186297162e-05, "loss": 0.6732, "num_input_tokens_seen": 30426720, "step": 52415 }, { "epoch": 7.807566279416145, "grad_norm": 1.0638850927352905, "learning_rate": 3.822052408212473e-05, "loss": 0.6258, "num_input_tokens_seen": 30429504, "step": 52420 }, { "epoch": 7.8083109919571045, "grad_norm": 1.2073111534118652, "learning_rate": 3.8217766077930527e-05, "loss": 0.6799, "num_input_tokens_seen": 30432640, "step": 52425 }, { "epoch": 7.809055704498064, "grad_norm": 1.4653390645980835, "learning_rate": 3.82150078504356e-05, "loss": 0.6373, "num_input_tokens_seen": 30435520, "step": 52430 }, { "epoch": 7.809800417039023, "grad_norm": 1.520181655883789, "learning_rate": 3.821224939968654e-05, "loss": 0.5368, "num_input_tokens_seen": 30438272, "step": 52435 }, { "epoch": 7.810545129579982, "grad_norm": 2.1925723552703857, "learning_rate": 3.820949072572996e-05, "loss": 0.6223, "num_input_tokens_seen": 30441312, "step": 52440 }, { "epoch": 7.811289842120941, "grad_norm": 1.216838002204895, "learning_rate": 3.820673182861246e-05, "loss": 0.6124, "num_input_tokens_seen": 30444384, "step": 52445 }, { "epoch": 7.812034554661901, "grad_norm": 0.9211708307266235, "learning_rate": 3.820397270838064e-05, "loss": 0.5263, "num_input_tokens_seen": 30447296, "step": 52450 }, { "epoch": 7.81277926720286, "grad_norm": 1.309697151184082, "learning_rate": 3.820121336508113e-05, "loss": 0.6593, "num_input_tokens_seen": 30450624, "step": 52455 }, { "epoch": 7.8135239797438185, "grad_norm": 0.9123666882514954, "learning_rate": 3.819845379876054e-05, "loss": 0.7281, "num_input_tokens_seen": 30453248, "step": 52460 }, { "epoch": 7.814268692284778, "grad_norm": 1.4982202053070068, "learning_rate": 3.8195694009465486e-05, "loss": 0.4959, "num_input_tokens_seen": 30456096, "step": 52465 }, { "epoch": 7.815013404825737, "grad_norm": 1.6833069324493408, "learning_rate": 3.819293399724259e-05, "loss": 0.7583, "num_input_tokens_seen": 30459008, "step": 52470 }, { "epoch": 7.815758117366697, "grad_norm": 1.342748761177063, "learning_rate": 3.819017376213848e-05, "loss": 0.4959, "num_input_tokens_seen": 30461824, "step": 52475 }, { "epoch": 7.816502829907655, "grad_norm": 1.2582532167434692, "learning_rate": 3.8187413304199796e-05, "loss": 0.5879, "num_input_tokens_seen": 30464704, "step": 52480 }, { "epoch": 7.817247542448615, "grad_norm": 1.9235628843307495, "learning_rate": 3.818465262347316e-05, "loss": 0.7277, "num_input_tokens_seen": 30467584, "step": 52485 }, { "epoch": 7.817992254989574, "grad_norm": 1.1516354084014893, "learning_rate": 3.818189172000522e-05, "loss": 0.6758, "num_input_tokens_seen": 30470560, "step": 52490 }, { "epoch": 7.818736967530533, "grad_norm": 1.0651432275772095, "learning_rate": 3.8179130593842626e-05, "loss": 0.5429, "num_input_tokens_seen": 30473568, "step": 52495 }, { "epoch": 7.819481680071492, "grad_norm": 0.5367913842201233, "learning_rate": 3.8176369245032006e-05, "loss": 0.6965, "num_input_tokens_seen": 30476384, "step": 52500 }, { "epoch": 7.820226392612452, "grad_norm": 1.0553345680236816, "learning_rate": 3.817360767362003e-05, "loss": 0.8065, "num_input_tokens_seen": 30479360, "step": 52505 }, { "epoch": 7.8209711051534105, "grad_norm": 1.0605878829956055, "learning_rate": 3.817084587965333e-05, "loss": 0.5021, "num_input_tokens_seen": 30482400, "step": 52510 }, { "epoch": 7.82171581769437, "grad_norm": 2.1210379600524902, "learning_rate": 3.8168083863178586e-05, "loss": 0.6576, "num_input_tokens_seen": 30485344, "step": 52515 }, { "epoch": 7.822460530235329, "grad_norm": 1.3381309509277344, "learning_rate": 3.8165321624242434e-05, "loss": 0.6067, "num_input_tokens_seen": 30488384, "step": 52520 }, { "epoch": 7.823205242776289, "grad_norm": 1.039962887763977, "learning_rate": 3.816255916289156e-05, "loss": 0.7522, "num_input_tokens_seen": 30491424, "step": 52525 }, { "epoch": 7.823949955317247, "grad_norm": 0.5400413274765015, "learning_rate": 3.8159796479172626e-05, "loss": 0.4486, "num_input_tokens_seen": 30494304, "step": 52530 }, { "epoch": 7.824694667858207, "grad_norm": 1.1284739971160889, "learning_rate": 3.815703357313231e-05, "loss": 0.5367, "num_input_tokens_seen": 30497312, "step": 52535 }, { "epoch": 7.825439380399166, "grad_norm": 1.5748672485351562, "learning_rate": 3.8154270444817285e-05, "loss": 0.6142, "num_input_tokens_seen": 30500160, "step": 52540 }, { "epoch": 7.826184092940125, "grad_norm": 1.428483247756958, "learning_rate": 3.815150709427423e-05, "loss": 0.6069, "num_input_tokens_seen": 30502944, "step": 52545 }, { "epoch": 7.826928805481084, "grad_norm": 1.2788351774215698, "learning_rate": 3.8148743521549824e-05, "loss": 0.7009, "num_input_tokens_seen": 30505824, "step": 52550 }, { "epoch": 7.827673518022044, "grad_norm": 1.0228132009506226, "learning_rate": 3.814597972669076e-05, "loss": 0.4986, "num_input_tokens_seen": 30508832, "step": 52555 }, { "epoch": 7.828418230563003, "grad_norm": 1.2347289323806763, "learning_rate": 3.814321570974373e-05, "loss": 0.729, "num_input_tokens_seen": 30511776, "step": 52560 }, { "epoch": 7.829162943103962, "grad_norm": 0.8095167279243469, "learning_rate": 3.814045147075543e-05, "loss": 0.4995, "num_input_tokens_seen": 30514592, "step": 52565 }, { "epoch": 7.829907655644921, "grad_norm": 1.283494234085083, "learning_rate": 3.813768700977256e-05, "loss": 0.6182, "num_input_tokens_seen": 30517504, "step": 52570 }, { "epoch": 7.830652368185881, "grad_norm": 0.569593071937561, "learning_rate": 3.813492232684182e-05, "loss": 0.5664, "num_input_tokens_seen": 30520096, "step": 52575 }, { "epoch": 7.831397080726839, "grad_norm": 1.2560170888900757, "learning_rate": 3.813215742200992e-05, "loss": 0.8228, "num_input_tokens_seen": 30523264, "step": 52580 }, { "epoch": 7.832141793267798, "grad_norm": 1.575595498085022, "learning_rate": 3.8129392295323566e-05, "loss": 0.5816, "num_input_tokens_seen": 30526048, "step": 52585 }, { "epoch": 7.832886505808758, "grad_norm": 1.804955005645752, "learning_rate": 3.812662694682946e-05, "loss": 0.6374, "num_input_tokens_seen": 30529088, "step": 52590 }, { "epoch": 7.833631218349717, "grad_norm": 0.8751593232154846, "learning_rate": 3.8123861376574344e-05, "loss": 0.685, "num_input_tokens_seen": 30531936, "step": 52595 }, { "epoch": 7.834375930890676, "grad_norm": 1.3724453449249268, "learning_rate": 3.8121095584604925e-05, "loss": 0.6275, "num_input_tokens_seen": 30534912, "step": 52600 }, { "epoch": 7.835120643431635, "grad_norm": 1.2406816482543945, "learning_rate": 3.811832957096794e-05, "loss": 0.6518, "num_input_tokens_seen": 30538944, "step": 52605 }, { "epoch": 7.835865355972595, "grad_norm": 1.5036530494689941, "learning_rate": 3.81155633357101e-05, "loss": 0.5112, "num_input_tokens_seen": 30541888, "step": 52610 }, { "epoch": 7.836610068513554, "grad_norm": 1.3323789834976196, "learning_rate": 3.8112796878878155e-05, "loss": 0.624, "num_input_tokens_seen": 30544576, "step": 52615 }, { "epoch": 7.837354781054513, "grad_norm": 0.7082791924476624, "learning_rate": 3.811003020051883e-05, "loss": 0.6004, "num_input_tokens_seen": 30547360, "step": 52620 }, { "epoch": 7.838099493595472, "grad_norm": 1.246826410293579, "learning_rate": 3.8107263300678874e-05, "loss": 0.6369, "num_input_tokens_seen": 30550816, "step": 52625 }, { "epoch": 7.838844206136431, "grad_norm": 1.2312113046646118, "learning_rate": 3.810449617940502e-05, "loss": 0.5171, "num_input_tokens_seen": 30553536, "step": 52630 }, { "epoch": 7.83958891867739, "grad_norm": 1.0913841724395752, "learning_rate": 3.810172883674402e-05, "loss": 0.7153, "num_input_tokens_seen": 30556032, "step": 52635 }, { "epoch": 7.84033363121835, "grad_norm": 2.6103992462158203, "learning_rate": 3.809896127274264e-05, "loss": 0.7736, "num_input_tokens_seen": 30558784, "step": 52640 }, { "epoch": 7.841078343759309, "grad_norm": 1.413910150527954, "learning_rate": 3.8096193487447604e-05, "loss": 0.5634, "num_input_tokens_seen": 30561920, "step": 52645 }, { "epoch": 7.841823056300268, "grad_norm": 1.533742904663086, "learning_rate": 3.8093425480905706e-05, "loss": 0.6314, "num_input_tokens_seen": 30564800, "step": 52650 }, { "epoch": 7.842567768841227, "grad_norm": 0.8089976906776428, "learning_rate": 3.809065725316368e-05, "loss": 0.572, "num_input_tokens_seen": 30567968, "step": 52655 }, { "epoch": 7.843312481382187, "grad_norm": 1.373314619064331, "learning_rate": 3.808788880426831e-05, "loss": 0.7189, "num_input_tokens_seen": 30570592, "step": 52660 }, { "epoch": 7.844057193923145, "grad_norm": 1.3376609086990356, "learning_rate": 3.8085120134266364e-05, "loss": 0.6491, "num_input_tokens_seen": 30573440, "step": 52665 }, { "epoch": 7.844801906464105, "grad_norm": 1.077850580215454, "learning_rate": 3.8082351243204605e-05, "loss": 0.6668, "num_input_tokens_seen": 30576128, "step": 52670 }, { "epoch": 7.845546619005064, "grad_norm": 1.2801904678344727, "learning_rate": 3.8079582131129826e-05, "loss": 0.5813, "num_input_tokens_seen": 30579072, "step": 52675 }, { "epoch": 7.846291331546023, "grad_norm": 1.1253230571746826, "learning_rate": 3.8076812798088796e-05, "loss": 0.5338, "num_input_tokens_seen": 30582624, "step": 52680 }, { "epoch": 7.847036044086982, "grad_norm": 1.2084358930587769, "learning_rate": 3.80740432441283e-05, "loss": 0.62, "num_input_tokens_seen": 30585344, "step": 52685 }, { "epoch": 7.847780756627942, "grad_norm": 1.188594937324524, "learning_rate": 3.807127346929514e-05, "loss": 0.6257, "num_input_tokens_seen": 30587968, "step": 52690 }, { "epoch": 7.848525469168901, "grad_norm": 0.6211334466934204, "learning_rate": 3.806850347363609e-05, "loss": 0.6866, "num_input_tokens_seen": 30590976, "step": 52695 }, { "epoch": 7.84927018170986, "grad_norm": 1.436390995979309, "learning_rate": 3.8065733257197964e-05, "loss": 0.8261, "num_input_tokens_seen": 30593888, "step": 52700 }, { "epoch": 7.850014894250819, "grad_norm": 0.7763235569000244, "learning_rate": 3.806296282002756e-05, "loss": 0.4596, "num_input_tokens_seen": 30596480, "step": 52705 }, { "epoch": 7.850759606791779, "grad_norm": 1.0351357460021973, "learning_rate": 3.8060192162171664e-05, "loss": 0.7324, "num_input_tokens_seen": 30599744, "step": 52710 }, { "epoch": 7.851504319332737, "grad_norm": 0.861719012260437, "learning_rate": 3.80574212836771e-05, "loss": 0.7349, "num_input_tokens_seen": 30602656, "step": 52715 }, { "epoch": 7.852249031873697, "grad_norm": 1.6202772855758667, "learning_rate": 3.805465018459067e-05, "loss": 0.6339, "num_input_tokens_seen": 30605280, "step": 52720 }, { "epoch": 7.852993744414656, "grad_norm": 2.1736855506896973, "learning_rate": 3.8051878864959194e-05, "loss": 0.5578, "num_input_tokens_seen": 30608608, "step": 52725 }, { "epoch": 7.8537384569556155, "grad_norm": 0.8265146613121033, "learning_rate": 3.804910732482949e-05, "loss": 0.4304, "num_input_tokens_seen": 30612096, "step": 52730 }, { "epoch": 7.854483169496574, "grad_norm": 0.7743984460830688, "learning_rate": 3.804633556424839e-05, "loss": 0.5763, "num_input_tokens_seen": 30614848, "step": 52735 }, { "epoch": 7.855227882037534, "grad_norm": 0.759177565574646, "learning_rate": 3.804356358326271e-05, "loss": 0.669, "num_input_tokens_seen": 30617632, "step": 52740 }, { "epoch": 7.855972594578493, "grad_norm": 1.906569242477417, "learning_rate": 3.804079138191927e-05, "loss": 0.8155, "num_input_tokens_seen": 30620480, "step": 52745 }, { "epoch": 7.856717307119452, "grad_norm": 1.375097393989563, "learning_rate": 3.803801896026491e-05, "loss": 0.86, "num_input_tokens_seen": 30623392, "step": 52750 }, { "epoch": 7.857462019660411, "grad_norm": 1.0025179386138916, "learning_rate": 3.803524631834648e-05, "loss": 0.5328, "num_input_tokens_seen": 30626144, "step": 52755 }, { "epoch": 7.858206732201371, "grad_norm": 1.4578927755355835, "learning_rate": 3.8032473456210805e-05, "loss": 0.7489, "num_input_tokens_seen": 30628928, "step": 52760 }, { "epoch": 7.858951444742329, "grad_norm": 2.5909876823425293, "learning_rate": 3.8029700373904744e-05, "loss": 0.6564, "num_input_tokens_seen": 30632000, "step": 52765 }, { "epoch": 7.859696157283288, "grad_norm": 1.4157278537750244, "learning_rate": 3.802692707147514e-05, "loss": 0.7366, "num_input_tokens_seen": 30634944, "step": 52770 }, { "epoch": 7.860440869824248, "grad_norm": 1.7672340869903564, "learning_rate": 3.8024153548968835e-05, "loss": 0.682, "num_input_tokens_seen": 30637472, "step": 52775 }, { "epoch": 7.8611855823652075, "grad_norm": 1.0355778932571411, "learning_rate": 3.80213798064327e-05, "loss": 0.7187, "num_input_tokens_seen": 30640512, "step": 52780 }, { "epoch": 7.861930294906166, "grad_norm": 0.9446369409561157, "learning_rate": 3.801860584391358e-05, "loss": 0.6383, "num_input_tokens_seen": 30643264, "step": 52785 }, { "epoch": 7.862675007447125, "grad_norm": 1.5410236120224, "learning_rate": 3.801583166145835e-05, "loss": 0.6961, "num_input_tokens_seen": 30646080, "step": 52790 }, { "epoch": 7.863419719988085, "grad_norm": 0.8107097148895264, "learning_rate": 3.801305725911387e-05, "loss": 0.6252, "num_input_tokens_seen": 30648928, "step": 52795 }, { "epoch": 7.864164432529043, "grad_norm": 1.228050947189331, "learning_rate": 3.8010282636927016e-05, "loss": 0.649, "num_input_tokens_seen": 30652000, "step": 52800 }, { "epoch": 7.864909145070003, "grad_norm": 1.461769700050354, "learning_rate": 3.800750779494466e-05, "loss": 0.6389, "num_input_tokens_seen": 30654752, "step": 52805 }, { "epoch": 7.865653857610962, "grad_norm": 1.7663187980651855, "learning_rate": 3.8004732733213674e-05, "loss": 0.6415, "num_input_tokens_seen": 30657568, "step": 52810 }, { "epoch": 7.8663985701519215, "grad_norm": 1.388342261314392, "learning_rate": 3.8001957451780956e-05, "loss": 0.5895, "num_input_tokens_seen": 30660288, "step": 52815 }, { "epoch": 7.86714328269288, "grad_norm": 1.126677393913269, "learning_rate": 3.799918195069338e-05, "loss": 0.5866, "num_input_tokens_seen": 30663296, "step": 52820 }, { "epoch": 7.86788799523384, "grad_norm": 1.2305107116699219, "learning_rate": 3.799640622999784e-05, "loss": 0.6369, "num_input_tokens_seen": 30666144, "step": 52825 }, { "epoch": 7.868632707774799, "grad_norm": 2.0143795013427734, "learning_rate": 3.799363028974121e-05, "loss": 0.6799, "num_input_tokens_seen": 30669120, "step": 52830 }, { "epoch": 7.869377420315758, "grad_norm": 1.3130556344985962, "learning_rate": 3.799085412997041e-05, "loss": 0.5409, "num_input_tokens_seen": 30672608, "step": 52835 }, { "epoch": 7.870122132856717, "grad_norm": 1.0541280508041382, "learning_rate": 3.798807775073234e-05, "loss": 0.6362, "num_input_tokens_seen": 30675360, "step": 52840 }, { "epoch": 7.870866845397677, "grad_norm": 1.1263129711151123, "learning_rate": 3.7985301152073896e-05, "loss": 0.6315, "num_input_tokens_seen": 30678304, "step": 52845 }, { "epoch": 7.871611557938635, "grad_norm": 0.8310675024986267, "learning_rate": 3.798252433404198e-05, "loss": 0.6044, "num_input_tokens_seen": 30681440, "step": 52850 }, { "epoch": 7.872356270479595, "grad_norm": 0.9165559411048889, "learning_rate": 3.797974729668351e-05, "loss": 0.7676, "num_input_tokens_seen": 30684480, "step": 52855 }, { "epoch": 7.873100983020554, "grad_norm": 1.0878382921218872, "learning_rate": 3.7976970040045404e-05, "loss": 0.7044, "num_input_tokens_seen": 30687456, "step": 52860 }, { "epoch": 7.8738456955615135, "grad_norm": 0.7868936061859131, "learning_rate": 3.797419256417458e-05, "loss": 0.5806, "num_input_tokens_seen": 30690560, "step": 52865 }, { "epoch": 7.874590408102472, "grad_norm": 2.3050150871276855, "learning_rate": 3.797141486911796e-05, "loss": 0.6196, "num_input_tokens_seen": 30693408, "step": 52870 }, { "epoch": 7.875335120643432, "grad_norm": 1.2478392124176025, "learning_rate": 3.796863695492247e-05, "loss": 0.583, "num_input_tokens_seen": 30696352, "step": 52875 }, { "epoch": 7.876079833184391, "grad_norm": 0.9672490358352661, "learning_rate": 3.796585882163503e-05, "loss": 0.689, "num_input_tokens_seen": 30699456, "step": 52880 }, { "epoch": 7.87682454572535, "grad_norm": 1.340332269668579, "learning_rate": 3.796308046930258e-05, "loss": 0.849, "num_input_tokens_seen": 30702368, "step": 52885 }, { "epoch": 7.877569258266309, "grad_norm": 1.2978134155273438, "learning_rate": 3.796030189797207e-05, "loss": 0.46, "num_input_tokens_seen": 30705024, "step": 52890 }, { "epoch": 7.878313970807269, "grad_norm": 0.9182813167572021, "learning_rate": 3.795752310769044e-05, "loss": 0.7185, "num_input_tokens_seen": 30707936, "step": 52895 }, { "epoch": 7.8790586833482275, "grad_norm": 1.5005229711532593, "learning_rate": 3.795474409850462e-05, "loss": 0.663, "num_input_tokens_seen": 30710848, "step": 52900 }, { "epoch": 7.879803395889187, "grad_norm": 1.2227839231491089, "learning_rate": 3.795196487046157e-05, "loss": 0.7128, "num_input_tokens_seen": 30713664, "step": 52905 }, { "epoch": 7.880548108430146, "grad_norm": 1.96206796169281, "learning_rate": 3.794918542360822e-05, "loss": 0.5841, "num_input_tokens_seen": 30716512, "step": 52910 }, { "epoch": 7.8812928209711055, "grad_norm": 2.1300673484802246, "learning_rate": 3.7946405757991556e-05, "loss": 0.7211, "num_input_tokens_seen": 30719520, "step": 52915 }, { "epoch": 7.882037533512064, "grad_norm": 0.927255392074585, "learning_rate": 3.7943625873658515e-05, "loss": 0.7259, "num_input_tokens_seen": 30722752, "step": 52920 }, { "epoch": 7.882782246053024, "grad_norm": 1.934085726737976, "learning_rate": 3.7940845770656085e-05, "loss": 0.592, "num_input_tokens_seen": 30725824, "step": 52925 }, { "epoch": 7.883526958593983, "grad_norm": 1.383164644241333, "learning_rate": 3.7938065449031206e-05, "loss": 0.4961, "num_input_tokens_seen": 30728672, "step": 52930 }, { "epoch": 7.884271671134941, "grad_norm": 1.7553950548171997, "learning_rate": 3.793528490883087e-05, "loss": 0.5685, "num_input_tokens_seen": 30731776, "step": 52935 }, { "epoch": 7.885016383675901, "grad_norm": 1.1557812690734863, "learning_rate": 3.7932504150102045e-05, "loss": 0.6305, "num_input_tokens_seen": 30734880, "step": 52940 }, { "epoch": 7.885761096216861, "grad_norm": 1.5628647804260254, "learning_rate": 3.7929723172891696e-05, "loss": 0.761, "num_input_tokens_seen": 30737632, "step": 52945 }, { "epoch": 7.8865058087578195, "grad_norm": 1.5946317911148071, "learning_rate": 3.792694197724682e-05, "loss": 0.732, "num_input_tokens_seen": 30740256, "step": 52950 }, { "epoch": 7.887250521298778, "grad_norm": 1.022908329963684, "learning_rate": 3.7924160563214395e-05, "loss": 0.5342, "num_input_tokens_seen": 30743008, "step": 52955 }, { "epoch": 7.887995233839738, "grad_norm": 1.1626763343811035, "learning_rate": 3.792137893084141e-05, "loss": 0.3671, "num_input_tokens_seen": 30745664, "step": 52960 }, { "epoch": 7.8887399463806975, "grad_norm": 1.4202104806900024, "learning_rate": 3.791859708017486e-05, "loss": 0.5689, "num_input_tokens_seen": 30748416, "step": 52965 }, { "epoch": 7.889484658921656, "grad_norm": 1.7173813581466675, "learning_rate": 3.791581501126175e-05, "loss": 0.6429, "num_input_tokens_seen": 30751776, "step": 52970 }, { "epoch": 7.890229371462615, "grad_norm": 1.3464818000793457, "learning_rate": 3.791303272414907e-05, "loss": 0.7786, "num_input_tokens_seen": 30754720, "step": 52975 }, { "epoch": 7.890974084003575, "grad_norm": 0.6591389775276184, "learning_rate": 3.791025021888382e-05, "loss": 0.6189, "num_input_tokens_seen": 30757504, "step": 52980 }, { "epoch": 7.8917187965445335, "grad_norm": 1.2730854749679565, "learning_rate": 3.7907467495513026e-05, "loss": 0.6004, "num_input_tokens_seen": 30760416, "step": 52985 }, { "epoch": 7.892463509085493, "grad_norm": 0.828516960144043, "learning_rate": 3.790468455408368e-05, "loss": 0.695, "num_input_tokens_seen": 30763264, "step": 52990 }, { "epoch": 7.893208221626452, "grad_norm": 2.168534994125366, "learning_rate": 3.79019013946428e-05, "loss": 0.7643, "num_input_tokens_seen": 30765952, "step": 52995 }, { "epoch": 7.8939529341674115, "grad_norm": 1.4583812952041626, "learning_rate": 3.789911801723742e-05, "loss": 0.7071, "num_input_tokens_seen": 30768864, "step": 53000 }, { "epoch": 7.89469764670837, "grad_norm": 1.144473910331726, "learning_rate": 3.789633442191455e-05, "loss": 0.5652, "num_input_tokens_seen": 30771680, "step": 53005 }, { "epoch": 7.89544235924933, "grad_norm": 2.798396110534668, "learning_rate": 3.7893550608721206e-05, "loss": 0.6444, "num_input_tokens_seen": 30774720, "step": 53010 }, { "epoch": 7.896187071790289, "grad_norm": 1.9904295206069946, "learning_rate": 3.789076657770444e-05, "loss": 0.572, "num_input_tokens_seen": 30777696, "step": 53015 }, { "epoch": 7.896931784331248, "grad_norm": 0.9516871571540833, "learning_rate": 3.788798232891127e-05, "loss": 0.5673, "num_input_tokens_seen": 30780448, "step": 53020 }, { "epoch": 7.897676496872207, "grad_norm": 1.2286744117736816, "learning_rate": 3.788519786238873e-05, "loss": 0.685, "num_input_tokens_seen": 30783104, "step": 53025 }, { "epoch": 7.898421209413167, "grad_norm": 1.0556234121322632, "learning_rate": 3.788241317818388e-05, "loss": 0.5024, "num_input_tokens_seen": 30786176, "step": 53030 }, { "epoch": 7.8991659219541255, "grad_norm": 1.274909257888794, "learning_rate": 3.7879628276343746e-05, "loss": 0.5558, "num_input_tokens_seen": 30788992, "step": 53035 }, { "epoch": 7.899910634495085, "grad_norm": 1.2303402423858643, "learning_rate": 3.787684315691539e-05, "loss": 0.6223, "num_input_tokens_seen": 30792000, "step": 53040 }, { "epoch": 7.900655347036044, "grad_norm": 0.6888096928596497, "learning_rate": 3.787405781994584e-05, "loss": 0.737, "num_input_tokens_seen": 30794880, "step": 53045 }, { "epoch": 7.9014000595770035, "grad_norm": 2.0123815536499023, "learning_rate": 3.7871272265482184e-05, "loss": 0.5478, "num_input_tokens_seen": 30797856, "step": 53050 }, { "epoch": 7.902144772117962, "grad_norm": 1.0671885013580322, "learning_rate": 3.786848649357145e-05, "loss": 0.6039, "num_input_tokens_seen": 30800608, "step": 53055 }, { "epoch": 7.902889484658922, "grad_norm": 1.047520399093628, "learning_rate": 3.786570050426073e-05, "loss": 0.6632, "num_input_tokens_seen": 30803616, "step": 53060 }, { "epoch": 7.903634197199881, "grad_norm": 1.4904464483261108, "learning_rate": 3.7862914297597075e-05, "loss": 0.6911, "num_input_tokens_seen": 30806528, "step": 53065 }, { "epoch": 7.90437890974084, "grad_norm": 0.9470510482788086, "learning_rate": 3.7860127873627546e-05, "loss": 0.6425, "num_input_tokens_seen": 30809344, "step": 53070 }, { "epoch": 7.905123622281799, "grad_norm": 1.050796627998352, "learning_rate": 3.785734123239924e-05, "loss": 0.5814, "num_input_tokens_seen": 30812224, "step": 53075 }, { "epoch": 7.905868334822759, "grad_norm": 1.4123717546463013, "learning_rate": 3.785455437395921e-05, "loss": 0.5765, "num_input_tokens_seen": 30815296, "step": 53080 }, { "epoch": 7.9066130473637175, "grad_norm": 1.1095571517944336, "learning_rate": 3.7851767298354554e-05, "loss": 0.5256, "num_input_tokens_seen": 30818240, "step": 53085 }, { "epoch": 7.907357759904677, "grad_norm": 1.7983797788619995, "learning_rate": 3.7848980005632344e-05, "loss": 0.783, "num_input_tokens_seen": 30821216, "step": 53090 }, { "epoch": 7.908102472445636, "grad_norm": 0.9591997265815735, "learning_rate": 3.7846192495839686e-05, "loss": 0.5256, "num_input_tokens_seen": 30823872, "step": 53095 }, { "epoch": 7.908847184986596, "grad_norm": 0.9525596499443054, "learning_rate": 3.784340476902366e-05, "loss": 0.6491, "num_input_tokens_seen": 30826720, "step": 53100 }, { "epoch": 7.909591897527554, "grad_norm": 1.2160385847091675, "learning_rate": 3.7840616825231365e-05, "loss": 0.6362, "num_input_tokens_seen": 30829536, "step": 53105 }, { "epoch": 7.910336610068514, "grad_norm": 1.2701597213745117, "learning_rate": 3.783782866450989e-05, "loss": 0.5849, "num_input_tokens_seen": 30832352, "step": 53110 }, { "epoch": 7.911081322609473, "grad_norm": 0.9154493808746338, "learning_rate": 3.783504028690635e-05, "loss": 0.5544, "num_input_tokens_seen": 30835296, "step": 53115 }, { "epoch": 7.9118260351504315, "grad_norm": 1.1153004169464111, "learning_rate": 3.783225169246786e-05, "loss": 0.5819, "num_input_tokens_seen": 30838240, "step": 53120 }, { "epoch": 7.912570747691391, "grad_norm": 1.2801504135131836, "learning_rate": 3.782946288124151e-05, "loss": 0.6319, "num_input_tokens_seen": 30841120, "step": 53125 }, { "epoch": 7.913315460232351, "grad_norm": 0.9772573113441467, "learning_rate": 3.782667385327442e-05, "loss": 0.6889, "num_input_tokens_seen": 30843904, "step": 53130 }, { "epoch": 7.9140601727733095, "grad_norm": 1.4766724109649658, "learning_rate": 3.782388460861372e-05, "loss": 0.6462, "num_input_tokens_seen": 30846816, "step": 53135 }, { "epoch": 7.914804885314268, "grad_norm": 1.498864769935608, "learning_rate": 3.7821095147306527e-05, "loss": 0.7366, "num_input_tokens_seen": 30849696, "step": 53140 }, { "epoch": 7.915549597855228, "grad_norm": 1.6033906936645508, "learning_rate": 3.781830546939996e-05, "loss": 0.592, "num_input_tokens_seen": 30852480, "step": 53145 }, { "epoch": 7.916294310396187, "grad_norm": 2.560459852218628, "learning_rate": 3.781551557494115e-05, "loss": 0.6749, "num_input_tokens_seen": 30855264, "step": 53150 }, { "epoch": 7.917039022937146, "grad_norm": 1.2074978351593018, "learning_rate": 3.7812725463977225e-05, "loss": 0.7577, "num_input_tokens_seen": 30857888, "step": 53155 }, { "epoch": 7.917783735478105, "grad_norm": 1.0710684061050415, "learning_rate": 3.7809935136555326e-05, "loss": 0.5921, "num_input_tokens_seen": 30860832, "step": 53160 }, { "epoch": 7.918528448019065, "grad_norm": 3.4463789463043213, "learning_rate": 3.780714459272259e-05, "loss": 0.7842, "num_input_tokens_seen": 30863584, "step": 53165 }, { "epoch": 7.9192731605600235, "grad_norm": 1.1456589698791504, "learning_rate": 3.780435383252617e-05, "loss": 0.7214, "num_input_tokens_seen": 30866720, "step": 53170 }, { "epoch": 7.920017873100983, "grad_norm": 1.7246313095092773, "learning_rate": 3.78015628560132e-05, "loss": 0.6751, "num_input_tokens_seen": 30869600, "step": 53175 }, { "epoch": 7.920762585641942, "grad_norm": 1.067232370376587, "learning_rate": 3.779877166323084e-05, "loss": 0.6196, "num_input_tokens_seen": 30872480, "step": 53180 }, { "epoch": 7.921507298182902, "grad_norm": 1.1811439990997314, "learning_rate": 3.779598025422624e-05, "loss": 0.4861, "num_input_tokens_seen": 30875168, "step": 53185 }, { "epoch": 7.92225201072386, "grad_norm": 1.0078562498092651, "learning_rate": 3.779318862904656e-05, "loss": 0.7063, "num_input_tokens_seen": 30878176, "step": 53190 }, { "epoch": 7.92299672326482, "grad_norm": 1.6783047914505005, "learning_rate": 3.779039678773896e-05, "loss": 0.6322, "num_input_tokens_seen": 30881408, "step": 53195 }, { "epoch": 7.923741435805779, "grad_norm": 1.65333092212677, "learning_rate": 3.77876047303506e-05, "loss": 0.735, "num_input_tokens_seen": 30884224, "step": 53200 }, { "epoch": 7.924486148346738, "grad_norm": 0.777342677116394, "learning_rate": 3.778481245692866e-05, "loss": 0.4551, "num_input_tokens_seen": 30887072, "step": 53205 }, { "epoch": 7.925230860887697, "grad_norm": 1.073270320892334, "learning_rate": 3.7782019967520305e-05, "loss": 0.79, "num_input_tokens_seen": 30889824, "step": 53210 }, { "epoch": 7.925975573428657, "grad_norm": 1.2969621419906616, "learning_rate": 3.777922726217271e-05, "loss": 0.5898, "num_input_tokens_seen": 30892736, "step": 53215 }, { "epoch": 7.9267202859696155, "grad_norm": 1.3638322353363037, "learning_rate": 3.7776434340933065e-05, "loss": 0.6486, "num_input_tokens_seen": 30895744, "step": 53220 }, { "epoch": 7.927464998510575, "grad_norm": 0.7325578331947327, "learning_rate": 3.7773641203848554e-05, "loss": 0.5292, "num_input_tokens_seen": 30898528, "step": 53225 }, { "epoch": 7.928209711051534, "grad_norm": 0.9109122157096863, "learning_rate": 3.7770847850966354e-05, "loss": 0.5513, "num_input_tokens_seen": 30901440, "step": 53230 }, { "epoch": 7.928954423592494, "grad_norm": 1.9696189165115356, "learning_rate": 3.7768054282333655e-05, "loss": 0.6517, "num_input_tokens_seen": 30904384, "step": 53235 }, { "epoch": 7.929699136133452, "grad_norm": 2.682661533355713, "learning_rate": 3.776526049799765e-05, "loss": 0.634, "num_input_tokens_seen": 30907264, "step": 53240 }, { "epoch": 7.930443848674412, "grad_norm": 1.3642518520355225, "learning_rate": 3.7762466498005544e-05, "loss": 0.4929, "num_input_tokens_seen": 30910304, "step": 53245 }, { "epoch": 7.931188561215371, "grad_norm": 0.7837597131729126, "learning_rate": 3.7759672282404546e-05, "loss": 0.5963, "num_input_tokens_seen": 30913120, "step": 53250 }, { "epoch": 7.93193327375633, "grad_norm": 0.8010066747665405, "learning_rate": 3.775687785124185e-05, "loss": 0.5681, "num_input_tokens_seen": 30916096, "step": 53255 }, { "epoch": 7.932677986297289, "grad_norm": 0.8730800151824951, "learning_rate": 3.775408320456466e-05, "loss": 0.4413, "num_input_tokens_seen": 30919200, "step": 53260 }, { "epoch": 7.933422698838249, "grad_norm": 1.1070610284805298, "learning_rate": 3.775128834242021e-05, "loss": 0.5661, "num_input_tokens_seen": 30922048, "step": 53265 }, { "epoch": 7.934167411379208, "grad_norm": 1.0865992307662964, "learning_rate": 3.77484932648557e-05, "loss": 0.4482, "num_input_tokens_seen": 30925120, "step": 53270 }, { "epoch": 7.934912123920167, "grad_norm": 0.8642128705978394, "learning_rate": 3.774569797191835e-05, "loss": 0.5995, "num_input_tokens_seen": 30927872, "step": 53275 }, { "epoch": 7.935656836461126, "grad_norm": 1.3614877462387085, "learning_rate": 3.774290246365539e-05, "loss": 0.5288, "num_input_tokens_seen": 30930784, "step": 53280 }, { "epoch": 7.936401549002085, "grad_norm": 1.298169732093811, "learning_rate": 3.774010674011404e-05, "loss": 0.6457, "num_input_tokens_seen": 30933760, "step": 53285 }, { "epoch": 7.937146261543044, "grad_norm": 0.6997523307800293, "learning_rate": 3.773731080134154e-05, "loss": 0.5503, "num_input_tokens_seen": 30937088, "step": 53290 }, { "epoch": 7.937890974084004, "grad_norm": 2.8742599487304688, "learning_rate": 3.7734514647385114e-05, "loss": 0.7525, "num_input_tokens_seen": 30939744, "step": 53295 }, { "epoch": 7.938635686624963, "grad_norm": 1.0055056810379028, "learning_rate": 3.773171827829201e-05, "loss": 0.6776, "num_input_tokens_seen": 30942528, "step": 53300 }, { "epoch": 7.9393803991659215, "grad_norm": 1.0809370279312134, "learning_rate": 3.772892169410947e-05, "loss": 0.623, "num_input_tokens_seen": 30945664, "step": 53305 }, { "epoch": 7.940125111706881, "grad_norm": 1.7305625677108765, "learning_rate": 3.772612489488473e-05, "loss": 0.6102, "num_input_tokens_seen": 30948416, "step": 53310 }, { "epoch": 7.940869824247841, "grad_norm": 0.617012619972229, "learning_rate": 3.772332788066504e-05, "loss": 0.6243, "num_input_tokens_seen": 30951264, "step": 53315 }, { "epoch": 7.9416145367888, "grad_norm": 1.3114296197891235, "learning_rate": 3.772053065149766e-05, "loss": 0.811, "num_input_tokens_seen": 30954176, "step": 53320 }, { "epoch": 7.942359249329758, "grad_norm": 1.48833167552948, "learning_rate": 3.771773320742984e-05, "loss": 0.82, "num_input_tokens_seen": 30957088, "step": 53325 }, { "epoch": 7.943103961870718, "grad_norm": 1.151304006576538, "learning_rate": 3.7714935548508846e-05, "loss": 0.8274, "num_input_tokens_seen": 30959936, "step": 53330 }, { "epoch": 7.943848674411677, "grad_norm": 1.0884184837341309, "learning_rate": 3.771213767478194e-05, "loss": 0.6011, "num_input_tokens_seen": 30962752, "step": 53335 }, { "epoch": 7.944593386952636, "grad_norm": 0.9290961027145386, "learning_rate": 3.770933958629639e-05, "loss": 0.6846, "num_input_tokens_seen": 30965920, "step": 53340 }, { "epoch": 7.945338099493595, "grad_norm": 1.0910459756851196, "learning_rate": 3.7706541283099466e-05, "loss": 0.7564, "num_input_tokens_seen": 30968704, "step": 53345 }, { "epoch": 7.946082812034555, "grad_norm": 1.274212121963501, "learning_rate": 3.7703742765238436e-05, "loss": 0.701, "num_input_tokens_seen": 30971424, "step": 53350 }, { "epoch": 7.946827524575514, "grad_norm": 0.7761458158493042, "learning_rate": 3.770094403276059e-05, "loss": 0.712, "num_input_tokens_seen": 30974176, "step": 53355 }, { "epoch": 7.947572237116473, "grad_norm": 1.6452292203903198, "learning_rate": 3.7698145085713196e-05, "loss": 0.6489, "num_input_tokens_seen": 30977120, "step": 53360 }, { "epoch": 7.948316949657432, "grad_norm": 1.114324688911438, "learning_rate": 3.7695345924143555e-05, "loss": 0.7599, "num_input_tokens_seen": 30979936, "step": 53365 }, { "epoch": 7.949061662198392, "grad_norm": 0.5676306486129761, "learning_rate": 3.769254654809894e-05, "loss": 0.4528, "num_input_tokens_seen": 30982944, "step": 53370 }, { "epoch": 7.94980637473935, "grad_norm": 0.9060440063476562, "learning_rate": 3.768974695762665e-05, "loss": 0.5474, "num_input_tokens_seen": 30985888, "step": 53375 }, { "epoch": 7.95055108728031, "grad_norm": 1.0725990533828735, "learning_rate": 3.768694715277398e-05, "loss": 0.4892, "num_input_tokens_seen": 30989056, "step": 53380 }, { "epoch": 7.951295799821269, "grad_norm": 1.6792082786560059, "learning_rate": 3.7684147133588245e-05, "loss": 0.6675, "num_input_tokens_seen": 30991968, "step": 53385 }, { "epoch": 7.952040512362228, "grad_norm": 3.183640718460083, "learning_rate": 3.7681346900116726e-05, "loss": 0.6952, "num_input_tokens_seen": 30994816, "step": 53390 }, { "epoch": 7.952785224903187, "grad_norm": 1.9356515407562256, "learning_rate": 3.7678546452406736e-05, "loss": 0.6227, "num_input_tokens_seen": 30997760, "step": 53395 }, { "epoch": 7.953529937444147, "grad_norm": 1.4123446941375732, "learning_rate": 3.76757457905056e-05, "loss": 0.5587, "num_input_tokens_seen": 31000640, "step": 53400 }, { "epoch": 7.954274649985106, "grad_norm": 1.2817504405975342, "learning_rate": 3.767294491446062e-05, "loss": 0.4877, "num_input_tokens_seen": 31003808, "step": 53405 }, { "epoch": 7.955019362526065, "grad_norm": 1.0551776885986328, "learning_rate": 3.7670143824319116e-05, "loss": 0.632, "num_input_tokens_seen": 31006720, "step": 53410 }, { "epoch": 7.955764075067024, "grad_norm": 2.5327491760253906, "learning_rate": 3.76673425201284e-05, "loss": 0.5623, "num_input_tokens_seen": 31009568, "step": 53415 }, { "epoch": 7.956508787607984, "grad_norm": 1.1611442565917969, "learning_rate": 3.766454100193581e-05, "loss": 0.6443, "num_input_tokens_seen": 31012288, "step": 53420 }, { "epoch": 7.957253500148942, "grad_norm": 1.1322561502456665, "learning_rate": 3.7661739269788687e-05, "loss": 0.6372, "num_input_tokens_seen": 31015296, "step": 53425 }, { "epoch": 7.957998212689902, "grad_norm": 1.5756804943084717, "learning_rate": 3.765893732373433e-05, "loss": 0.6887, "num_input_tokens_seen": 31018048, "step": 53430 }, { "epoch": 7.958742925230861, "grad_norm": 1.4792448282241821, "learning_rate": 3.7656135163820105e-05, "loss": 0.7377, "num_input_tokens_seen": 31020960, "step": 53435 }, { "epoch": 7.9594876377718204, "grad_norm": 1.033547043800354, "learning_rate": 3.7653332790093334e-05, "loss": 0.6013, "num_input_tokens_seen": 31024064, "step": 53440 }, { "epoch": 7.960232350312779, "grad_norm": 1.3683056831359863, "learning_rate": 3.765053020260137e-05, "loss": 0.6954, "num_input_tokens_seen": 31026688, "step": 53445 }, { "epoch": 7.960977062853738, "grad_norm": 1.8737505674362183, "learning_rate": 3.764772740139154e-05, "loss": 0.6407, "num_input_tokens_seen": 31029536, "step": 53450 }, { "epoch": 7.961721775394698, "grad_norm": 1.106041669845581, "learning_rate": 3.7644924386511225e-05, "loss": 0.5834, "num_input_tokens_seen": 31032160, "step": 53455 }, { "epoch": 7.962466487935657, "grad_norm": 1.5113197565078735, "learning_rate": 3.7642121158007756e-05, "loss": 0.6841, "num_input_tokens_seen": 31035200, "step": 53460 }, { "epoch": 7.963211200476616, "grad_norm": 1.7019011974334717, "learning_rate": 3.7639317715928514e-05, "loss": 0.529, "num_input_tokens_seen": 31038240, "step": 53465 }, { "epoch": 7.963955913017575, "grad_norm": 1.4943710565567017, "learning_rate": 3.763651406032083e-05, "loss": 0.6484, "num_input_tokens_seen": 31041504, "step": 53470 }, { "epoch": 7.964700625558534, "grad_norm": 1.9877440929412842, "learning_rate": 3.763371019123209e-05, "loss": 0.564, "num_input_tokens_seen": 31044448, "step": 53475 }, { "epoch": 7.965445338099494, "grad_norm": 1.296258568763733, "learning_rate": 3.7630906108709654e-05, "loss": 0.614, "num_input_tokens_seen": 31047904, "step": 53480 }, { "epoch": 7.966190050640453, "grad_norm": 1.6230427026748657, "learning_rate": 3.76281018128009e-05, "loss": 0.7603, "num_input_tokens_seen": 31050784, "step": 53485 }, { "epoch": 7.966934763181412, "grad_norm": 1.413596510887146, "learning_rate": 3.7625297303553195e-05, "loss": 0.6017, "num_input_tokens_seen": 31053536, "step": 53490 }, { "epoch": 7.967679475722371, "grad_norm": 0.9882829189300537, "learning_rate": 3.762249258101392e-05, "loss": 0.6397, "num_input_tokens_seen": 31056672, "step": 53495 }, { "epoch": 7.96842418826333, "grad_norm": 1.5521308183670044, "learning_rate": 3.761968764523048e-05, "loss": 0.6559, "num_input_tokens_seen": 31059808, "step": 53500 }, { "epoch": 7.96916890080429, "grad_norm": 1.9494613409042358, "learning_rate": 3.761688249625024e-05, "loss": 0.6777, "num_input_tokens_seen": 31062656, "step": 53505 }, { "epoch": 7.969913613345248, "grad_norm": 1.243857741355896, "learning_rate": 3.761407713412058e-05, "loss": 0.6908, "num_input_tokens_seen": 31065312, "step": 53510 }, { "epoch": 7.970658325886208, "grad_norm": 1.9050679206848145, "learning_rate": 3.761127155888891e-05, "loss": 0.6838, "num_input_tokens_seen": 31068736, "step": 53515 }, { "epoch": 7.971403038427167, "grad_norm": 1.621951937675476, "learning_rate": 3.760846577060263e-05, "loss": 0.5246, "num_input_tokens_seen": 31071456, "step": 53520 }, { "epoch": 7.9721477509681264, "grad_norm": 1.1783486604690552, "learning_rate": 3.760565976930913e-05, "loss": 0.7989, "num_input_tokens_seen": 31074080, "step": 53525 }, { "epoch": 7.972892463509085, "grad_norm": 0.9652889966964722, "learning_rate": 3.760285355505583e-05, "loss": 0.6865, "num_input_tokens_seen": 31076960, "step": 53530 }, { "epoch": 7.973637176050045, "grad_norm": 1.545312762260437, "learning_rate": 3.760004712789012e-05, "loss": 0.7007, "num_input_tokens_seen": 31080000, "step": 53535 }, { "epoch": 7.974381888591004, "grad_norm": 1.1178326606750488, "learning_rate": 3.759724048785942e-05, "loss": 0.6579, "num_input_tokens_seen": 31083104, "step": 53540 }, { "epoch": 7.975126601131963, "grad_norm": 1.8071891069412231, "learning_rate": 3.759443363501115e-05, "loss": 0.568, "num_input_tokens_seen": 31085888, "step": 53545 }, { "epoch": 7.975871313672922, "grad_norm": 2.5231235027313232, "learning_rate": 3.759162656939271e-05, "loss": 0.7107, "num_input_tokens_seen": 31088576, "step": 53550 }, { "epoch": 7.976616026213882, "grad_norm": 1.044748067855835, "learning_rate": 3.758881929105155e-05, "loss": 0.5997, "num_input_tokens_seen": 31091424, "step": 53555 }, { "epoch": 7.97736073875484, "grad_norm": 2.676673650741577, "learning_rate": 3.758601180003508e-05, "loss": 0.6561, "num_input_tokens_seen": 31094432, "step": 53560 }, { "epoch": 7.9781054512958, "grad_norm": 0.9335111379623413, "learning_rate": 3.758320409639074e-05, "loss": 0.4602, "num_input_tokens_seen": 31097280, "step": 53565 }, { "epoch": 7.978850163836759, "grad_norm": 1.1423656940460205, "learning_rate": 3.758039618016595e-05, "loss": 0.5105, "num_input_tokens_seen": 31099904, "step": 53570 }, { "epoch": 7.9795948763777185, "grad_norm": 0.9652023911476135, "learning_rate": 3.757758805140814e-05, "loss": 0.6532, "num_input_tokens_seen": 31102976, "step": 53575 }, { "epoch": 7.980339588918677, "grad_norm": 1.7347819805145264, "learning_rate": 3.757477971016478e-05, "loss": 0.5973, "num_input_tokens_seen": 31105632, "step": 53580 }, { "epoch": 7.981084301459637, "grad_norm": 1.0912855863571167, "learning_rate": 3.7571971156483285e-05, "loss": 0.6902, "num_input_tokens_seen": 31108480, "step": 53585 }, { "epoch": 7.981829014000596, "grad_norm": 1.313176155090332, "learning_rate": 3.756916239041113e-05, "loss": 0.4828, "num_input_tokens_seen": 31111328, "step": 53590 }, { "epoch": 7.982573726541555, "grad_norm": 1.0816582441329956, "learning_rate": 3.756635341199574e-05, "loss": 0.6334, "num_input_tokens_seen": 31113888, "step": 53595 }, { "epoch": 7.983318439082514, "grad_norm": 1.3217006921768188, "learning_rate": 3.756354422128459e-05, "loss": 0.657, "num_input_tokens_seen": 31117056, "step": 53600 }, { "epoch": 7.984063151623474, "grad_norm": 2.350538730621338, "learning_rate": 3.756073481832512e-05, "loss": 0.6535, "num_input_tokens_seen": 31119968, "step": 53605 }, { "epoch": 7.9848078641644324, "grad_norm": 1.0912245512008667, "learning_rate": 3.75579252031648e-05, "loss": 0.5957, "num_input_tokens_seen": 31122816, "step": 53610 }, { "epoch": 7.985552576705392, "grad_norm": 0.9669964909553528, "learning_rate": 3.75551153758511e-05, "loss": 0.6358, "num_input_tokens_seen": 31125728, "step": 53615 }, { "epoch": 7.986297289246351, "grad_norm": 0.6398459076881409, "learning_rate": 3.755230533643148e-05, "loss": 0.5523, "num_input_tokens_seen": 31128704, "step": 53620 }, { "epoch": 7.9870420017873105, "grad_norm": 1.735372543334961, "learning_rate": 3.754949508495344e-05, "loss": 0.6113, "num_input_tokens_seen": 31131424, "step": 53625 }, { "epoch": 7.987786714328269, "grad_norm": 1.7502893209457397, "learning_rate": 3.7546684621464415e-05, "loss": 0.5977, "num_input_tokens_seen": 31134176, "step": 53630 }, { "epoch": 7.988531426869228, "grad_norm": 1.7659220695495605, "learning_rate": 3.7543873946011916e-05, "loss": 0.538, "num_input_tokens_seen": 31136960, "step": 53635 }, { "epoch": 7.989276139410188, "grad_norm": 1.202351450920105, "learning_rate": 3.754106305864341e-05, "loss": 0.653, "num_input_tokens_seen": 31139968, "step": 53640 }, { "epoch": 7.990020851951147, "grad_norm": 0.9399142861366272, "learning_rate": 3.753825195940639e-05, "loss": 0.5653, "num_input_tokens_seen": 31142784, "step": 53645 }, { "epoch": 7.990765564492106, "grad_norm": 1.6147700548171997, "learning_rate": 3.753544064834835e-05, "loss": 0.6686, "num_input_tokens_seen": 31145568, "step": 53650 }, { "epoch": 7.991510277033065, "grad_norm": 1.1604986190795898, "learning_rate": 3.753262912551677e-05, "loss": 0.7069, "num_input_tokens_seen": 31148608, "step": 53655 }, { "epoch": 7.9922549895740245, "grad_norm": 0.8306676149368286, "learning_rate": 3.7529817390959164e-05, "loss": 0.5807, "num_input_tokens_seen": 31151424, "step": 53660 }, { "epoch": 7.992999702114983, "grad_norm": 1.3567556142807007, "learning_rate": 3.752700544472304e-05, "loss": 0.6005, "num_input_tokens_seen": 31154368, "step": 53665 }, { "epoch": 7.993744414655943, "grad_norm": 2.7799389362335205, "learning_rate": 3.752419328685588e-05, "loss": 0.6403, "num_input_tokens_seen": 31157440, "step": 53670 }, { "epoch": 7.994489127196902, "grad_norm": 1.029926061630249, "learning_rate": 3.752138091740521e-05, "loss": 0.7139, "num_input_tokens_seen": 31160384, "step": 53675 }, { "epoch": 7.995233839737861, "grad_norm": 1.2906614542007446, "learning_rate": 3.7518568336418525e-05, "loss": 0.5685, "num_input_tokens_seen": 31163008, "step": 53680 }, { "epoch": 7.99597855227882, "grad_norm": 1.4119881391525269, "learning_rate": 3.751575554394336e-05, "loss": 0.5183, "num_input_tokens_seen": 31166048, "step": 53685 }, { "epoch": 7.99672326481978, "grad_norm": 0.71514493227005, "learning_rate": 3.751294254002722e-05, "loss": 0.6153, "num_input_tokens_seen": 31169152, "step": 53690 }, { "epoch": 7.9974679773607384, "grad_norm": 1.1605082750320435, "learning_rate": 3.751012932471764e-05, "loss": 0.5003, "num_input_tokens_seen": 31171936, "step": 53695 }, { "epoch": 7.998212689901698, "grad_norm": 0.6324681639671326, "learning_rate": 3.7507315898062136e-05, "loss": 0.6459, "num_input_tokens_seen": 31174496, "step": 53700 }, { "epoch": 7.998957402442657, "grad_norm": 2.9953370094299316, "learning_rate": 3.7504502260108245e-05, "loss": 0.777, "num_input_tokens_seen": 31177248, "step": 53705 }, { "epoch": 7.9997021149836165, "grad_norm": 1.6510485410690308, "learning_rate": 3.750168841090349e-05, "loss": 0.5407, "num_input_tokens_seen": 31180224, "step": 53710 }, { "epoch": 8.0, "eval_loss": 0.6555541753768921, "eval_runtime": 46.9874, "eval_samples_per_second": 63.506, "eval_steps_per_second": 15.877, "num_input_tokens_seen": 31180904, "step": 53712 }, { "epoch": 8.000446827524575, "grad_norm": 1.3400026559829712, "learning_rate": 3.749887435049541e-05, "loss": 0.6306, "num_input_tokens_seen": 31182472, "step": 53715 }, { "epoch": 8.001191540065534, "grad_norm": 1.0852597951889038, "learning_rate": 3.749606007893157e-05, "loss": 0.6965, "num_input_tokens_seen": 31185448, "step": 53720 }, { "epoch": 8.001936252606495, "grad_norm": 1.1810599565505981, "learning_rate": 3.7493245596259484e-05, "loss": 0.5029, "num_input_tokens_seen": 31188168, "step": 53725 }, { "epoch": 8.002680965147453, "grad_norm": 1.735387921333313, "learning_rate": 3.7490430902526715e-05, "loss": 0.6106, "num_input_tokens_seen": 31190984, "step": 53730 }, { "epoch": 8.003425677688412, "grad_norm": 1.2138563394546509, "learning_rate": 3.7487615997780815e-05, "loss": 0.7312, "num_input_tokens_seen": 31193960, "step": 53735 }, { "epoch": 8.00417039022937, "grad_norm": 1.8315834999084473, "learning_rate": 3.7484800882069324e-05, "loss": 0.7068, "num_input_tokens_seen": 31197192, "step": 53740 }, { "epoch": 8.004915102770331, "grad_norm": 0.9707964658737183, "learning_rate": 3.748198555543981e-05, "loss": 0.7283, "num_input_tokens_seen": 31199944, "step": 53745 }, { "epoch": 8.00565981531129, "grad_norm": 1.247136116027832, "learning_rate": 3.747917001793985e-05, "loss": 0.8363, "num_input_tokens_seen": 31202888, "step": 53750 }, { "epoch": 8.006404527852249, "grad_norm": 0.784345269203186, "learning_rate": 3.7476354269616984e-05, "loss": 0.7064, "num_input_tokens_seen": 31205960, "step": 53755 }, { "epoch": 8.007149240393208, "grad_norm": 1.1349146366119385, "learning_rate": 3.747353831051879e-05, "loss": 0.5652, "num_input_tokens_seen": 31208968, "step": 53760 }, { "epoch": 8.007893952934168, "grad_norm": 1.3185145854949951, "learning_rate": 3.747072214069286e-05, "loss": 0.5389, "num_input_tokens_seen": 31211880, "step": 53765 }, { "epoch": 8.008638665475127, "grad_norm": 1.2074671983718872, "learning_rate": 3.746790576018674e-05, "loss": 0.6532, "num_input_tokens_seen": 31214760, "step": 53770 }, { "epoch": 8.009383378016086, "grad_norm": 1.3113815784454346, "learning_rate": 3.746508916904803e-05, "loss": 0.7396, "num_input_tokens_seen": 31217704, "step": 53775 }, { "epoch": 8.010128090557044, "grad_norm": 2.2206380367279053, "learning_rate": 3.74622723673243e-05, "loss": 0.5502, "num_input_tokens_seen": 31220520, "step": 53780 }, { "epoch": 8.010872803098005, "grad_norm": 1.107617974281311, "learning_rate": 3.745945535506315e-05, "loss": 0.4987, "num_input_tokens_seen": 31223432, "step": 53785 }, { "epoch": 8.011617515638964, "grad_norm": 1.5097386837005615, "learning_rate": 3.7456638132312164e-05, "loss": 0.7111, "num_input_tokens_seen": 31226152, "step": 53790 }, { "epoch": 8.012362228179922, "grad_norm": 1.3539234399795532, "learning_rate": 3.745382069911894e-05, "loss": 0.7246, "num_input_tokens_seen": 31228904, "step": 53795 }, { "epoch": 8.013106940720881, "grad_norm": 0.9333794116973877, "learning_rate": 3.745100305553107e-05, "loss": 0.5295, "num_input_tokens_seen": 31231816, "step": 53800 }, { "epoch": 8.013851653261842, "grad_norm": 1.5733228921890259, "learning_rate": 3.744818520159616e-05, "loss": 0.4998, "num_input_tokens_seen": 31234952, "step": 53805 }, { "epoch": 8.0145963658028, "grad_norm": 1.248123049736023, "learning_rate": 3.744536713736182e-05, "loss": 0.7544, "num_input_tokens_seen": 31238088, "step": 53810 }, { "epoch": 8.01534107834376, "grad_norm": 1.8241479396820068, "learning_rate": 3.744254886287564e-05, "loss": 0.7159, "num_input_tokens_seen": 31240936, "step": 53815 }, { "epoch": 8.016085790884718, "grad_norm": 1.3971571922302246, "learning_rate": 3.743973037818524e-05, "loss": 0.6479, "num_input_tokens_seen": 31243848, "step": 53820 }, { "epoch": 8.016830503425677, "grad_norm": 1.3386298418045044, "learning_rate": 3.7436911683338244e-05, "loss": 0.5877, "num_input_tokens_seen": 31246920, "step": 53825 }, { "epoch": 8.017575215966637, "grad_norm": 1.8281476497650146, "learning_rate": 3.743409277838227e-05, "loss": 0.6134, "num_input_tokens_seen": 31249768, "step": 53830 }, { "epoch": 8.018319928507596, "grad_norm": 1.5464838743209839, "learning_rate": 3.7431273663364926e-05, "loss": 0.6518, "num_input_tokens_seen": 31252328, "step": 53835 }, { "epoch": 8.019064641048555, "grad_norm": 0.637043297290802, "learning_rate": 3.742845433833386e-05, "loss": 0.577, "num_input_tokens_seen": 31255432, "step": 53840 }, { "epoch": 8.019809353589514, "grad_norm": 0.8907098174095154, "learning_rate": 3.742563480333668e-05, "loss": 0.4534, "num_input_tokens_seen": 31258504, "step": 53845 }, { "epoch": 8.020554066130474, "grad_norm": 1.4803533554077148, "learning_rate": 3.742281505842103e-05, "loss": 0.602, "num_input_tokens_seen": 31261480, "step": 53850 }, { "epoch": 8.021298778671433, "grad_norm": 1.3592909574508667, "learning_rate": 3.7419995103634546e-05, "loss": 0.6264, "num_input_tokens_seen": 31264328, "step": 53855 }, { "epoch": 8.022043491212392, "grad_norm": 1.9633928537368774, "learning_rate": 3.741717493902488e-05, "loss": 0.6819, "num_input_tokens_seen": 31267368, "step": 53860 }, { "epoch": 8.02278820375335, "grad_norm": 1.8952147960662842, "learning_rate": 3.741435456463965e-05, "loss": 0.6931, "num_input_tokens_seen": 31270344, "step": 53865 }, { "epoch": 8.023532916294311, "grad_norm": 0.8157370686531067, "learning_rate": 3.741153398052653e-05, "loss": 0.5285, "num_input_tokens_seen": 31273192, "step": 53870 }, { "epoch": 8.02427762883527, "grad_norm": 2.0387165546417236, "learning_rate": 3.740871318673314e-05, "loss": 0.6694, "num_input_tokens_seen": 31275720, "step": 53875 }, { "epoch": 8.025022341376228, "grad_norm": 1.761622428894043, "learning_rate": 3.740589218330716e-05, "loss": 0.7022, "num_input_tokens_seen": 31278600, "step": 53880 }, { "epoch": 8.025767053917187, "grad_norm": 1.463894248008728, "learning_rate": 3.740307097029624e-05, "loss": 0.626, "num_input_tokens_seen": 31281288, "step": 53885 }, { "epoch": 8.026511766458148, "grad_norm": 1.6303119659423828, "learning_rate": 3.740024954774804e-05, "loss": 0.6227, "num_input_tokens_seen": 31284488, "step": 53890 }, { "epoch": 8.027256478999107, "grad_norm": 1.949365258216858, "learning_rate": 3.739742791571023e-05, "loss": 0.6391, "num_input_tokens_seen": 31287624, "step": 53895 }, { "epoch": 8.028001191540065, "grad_norm": 1.1098759174346924, "learning_rate": 3.739460607423048e-05, "loss": 0.8201, "num_input_tokens_seen": 31290760, "step": 53900 }, { "epoch": 8.028745904081024, "grad_norm": 1.6132959127426147, "learning_rate": 3.7391784023356445e-05, "loss": 0.6228, "num_input_tokens_seen": 31293768, "step": 53905 }, { "epoch": 8.029490616621985, "grad_norm": 1.306352972984314, "learning_rate": 3.7388961763135835e-05, "loss": 0.6371, "num_input_tokens_seen": 31296680, "step": 53910 }, { "epoch": 8.030235329162943, "grad_norm": 0.7668256759643555, "learning_rate": 3.7386139293616285e-05, "loss": 0.5281, "num_input_tokens_seen": 31299368, "step": 53915 }, { "epoch": 8.030980041703902, "grad_norm": 1.6816195249557495, "learning_rate": 3.738331661484551e-05, "loss": 0.7796, "num_input_tokens_seen": 31302632, "step": 53920 }, { "epoch": 8.03172475424486, "grad_norm": 1.1537550687789917, "learning_rate": 3.7380493726871186e-05, "loss": 0.5907, "num_input_tokens_seen": 31305608, "step": 53925 }, { "epoch": 8.032469466785821, "grad_norm": 1.0229387283325195, "learning_rate": 3.737767062974101e-05, "loss": 0.6674, "num_input_tokens_seen": 31308360, "step": 53930 }, { "epoch": 8.03321417932678, "grad_norm": 2.436506509780884, "learning_rate": 3.737484732350266e-05, "loss": 0.8792, "num_input_tokens_seen": 31311144, "step": 53935 }, { "epoch": 8.033958891867739, "grad_norm": 1.4790313243865967, "learning_rate": 3.7372023808203836e-05, "loss": 0.5557, "num_input_tokens_seen": 31313960, "step": 53940 }, { "epoch": 8.034703604408698, "grad_norm": 1.2082669734954834, "learning_rate": 3.736920008389225e-05, "loss": 0.5373, "num_input_tokens_seen": 31317160, "step": 53945 }, { "epoch": 8.035448316949658, "grad_norm": 1.794503927230835, "learning_rate": 3.7366376150615614e-05, "loss": 0.5366, "num_input_tokens_seen": 31320424, "step": 53950 }, { "epoch": 8.036193029490617, "grad_norm": 1.1050009727478027, "learning_rate": 3.7363552008421606e-05, "loss": 0.6629, "num_input_tokens_seen": 31323112, "step": 53955 }, { "epoch": 8.036937742031576, "grad_norm": 3.1972391605377197, "learning_rate": 3.7360727657357954e-05, "loss": 0.6351, "num_input_tokens_seen": 31325864, "step": 53960 }, { "epoch": 8.037682454572534, "grad_norm": 1.0455937385559082, "learning_rate": 3.7357903097472376e-05, "loss": 0.5921, "num_input_tokens_seen": 31329032, "step": 53965 }, { "epoch": 8.038427167113495, "grad_norm": 1.8867912292480469, "learning_rate": 3.7355078328812583e-05, "loss": 0.6402, "num_input_tokens_seen": 31332296, "step": 53970 }, { "epoch": 8.039171879654454, "grad_norm": 1.1836133003234863, "learning_rate": 3.7352253351426295e-05, "loss": 0.7258, "num_input_tokens_seen": 31335272, "step": 53975 }, { "epoch": 8.039916592195413, "grad_norm": 1.9702621698379517, "learning_rate": 3.734942816536124e-05, "loss": 0.5903, "num_input_tokens_seen": 31338184, "step": 53980 }, { "epoch": 8.040661304736371, "grad_norm": 1.224959373474121, "learning_rate": 3.734660277066515e-05, "loss": 0.6368, "num_input_tokens_seen": 31341064, "step": 53985 }, { "epoch": 8.041406017277332, "grad_norm": 1.4031842947006226, "learning_rate": 3.734377716738576e-05, "loss": 0.4936, "num_input_tokens_seen": 31344104, "step": 53990 }, { "epoch": 8.04215072981829, "grad_norm": 1.3533769845962524, "learning_rate": 3.73409513555708e-05, "loss": 0.4562, "num_input_tokens_seen": 31347112, "step": 53995 }, { "epoch": 8.04289544235925, "grad_norm": 1.7176655530929565, "learning_rate": 3.733812533526801e-05, "loss": 0.7377, "num_input_tokens_seen": 31350088, "step": 54000 }, { "epoch": 8.043640154900208, "grad_norm": 1.17734694480896, "learning_rate": 3.733529910652513e-05, "loss": 0.6756, "num_input_tokens_seen": 31352872, "step": 54005 }, { "epoch": 8.044384867441167, "grad_norm": 0.8173859119415283, "learning_rate": 3.73324726693899e-05, "loss": 0.7682, "num_input_tokens_seen": 31355720, "step": 54010 }, { "epoch": 8.045129579982127, "grad_norm": 0.97593754529953, "learning_rate": 3.732964602391009e-05, "loss": 0.657, "num_input_tokens_seen": 31358600, "step": 54015 }, { "epoch": 8.045874292523086, "grad_norm": 2.388782262802124, "learning_rate": 3.7326819170133434e-05, "loss": 0.8331, "num_input_tokens_seen": 31361480, "step": 54020 }, { "epoch": 8.046619005064045, "grad_norm": 1.0246002674102783, "learning_rate": 3.7323992108107705e-05, "loss": 0.6211, "num_input_tokens_seen": 31364648, "step": 54025 }, { "epoch": 8.047363717605004, "grad_norm": 1.0554124116897583, "learning_rate": 3.7321164837880654e-05, "loss": 0.6177, "num_input_tokens_seen": 31367272, "step": 54030 }, { "epoch": 8.048108430145964, "grad_norm": 1.6380815505981445, "learning_rate": 3.731833735950004e-05, "loss": 0.693, "num_input_tokens_seen": 31370472, "step": 54035 }, { "epoch": 8.048853142686923, "grad_norm": 1.33966064453125, "learning_rate": 3.731550967301364e-05, "loss": 0.851, "num_input_tokens_seen": 31373608, "step": 54040 }, { "epoch": 8.049597855227882, "grad_norm": 0.9563925266265869, "learning_rate": 3.7312681778469216e-05, "loss": 0.7011, "num_input_tokens_seen": 31376552, "step": 54045 }, { "epoch": 8.05034256776884, "grad_norm": 2.179396629333496, "learning_rate": 3.730985367591455e-05, "loss": 0.738, "num_input_tokens_seen": 31379304, "step": 54050 }, { "epoch": 8.051087280309801, "grad_norm": 1.1606515645980835, "learning_rate": 3.730702536539741e-05, "loss": 0.6408, "num_input_tokens_seen": 31382152, "step": 54055 }, { "epoch": 8.05183199285076, "grad_norm": 1.6892012357711792, "learning_rate": 3.73041968469656e-05, "loss": 0.6969, "num_input_tokens_seen": 31385224, "step": 54060 }, { "epoch": 8.052576705391719, "grad_norm": 1.0520962476730347, "learning_rate": 3.730136812066688e-05, "loss": 0.6567, "num_input_tokens_seen": 31388168, "step": 54065 }, { "epoch": 8.053321417932677, "grad_norm": 1.1524529457092285, "learning_rate": 3.7298539186549054e-05, "loss": 0.4262, "num_input_tokens_seen": 31390888, "step": 54070 }, { "epoch": 8.054066130473638, "grad_norm": 1.11497962474823, "learning_rate": 3.7295710044659904e-05, "loss": 0.5005, "num_input_tokens_seen": 31393640, "step": 54075 }, { "epoch": 8.054810843014597, "grad_norm": 1.1384994983673096, "learning_rate": 3.7292880695047225e-05, "loss": 0.6775, "num_input_tokens_seen": 31396840, "step": 54080 }, { "epoch": 8.055555555555555, "grad_norm": 1.1258374452590942, "learning_rate": 3.729005113775883e-05, "loss": 0.6083, "num_input_tokens_seen": 31399688, "step": 54085 }, { "epoch": 8.056300268096514, "grad_norm": 0.9697370529174805, "learning_rate": 3.7287221372842506e-05, "loss": 0.5533, "num_input_tokens_seen": 31402376, "step": 54090 }, { "epoch": 8.057044980637475, "grad_norm": 1.2286261320114136, "learning_rate": 3.728439140034607e-05, "loss": 0.5923, "num_input_tokens_seen": 31405640, "step": 54095 }, { "epoch": 8.057789693178433, "grad_norm": 1.0958130359649658, "learning_rate": 3.728156122031732e-05, "loss": 0.569, "num_input_tokens_seen": 31408712, "step": 54100 }, { "epoch": 8.058534405719392, "grad_norm": 1.3679020404815674, "learning_rate": 3.7278730832804076e-05, "loss": 0.6083, "num_input_tokens_seen": 31411496, "step": 54105 }, { "epoch": 8.059279118260351, "grad_norm": 1.1324909925460815, "learning_rate": 3.727590023785416e-05, "loss": 0.5785, "num_input_tokens_seen": 31414440, "step": 54110 }, { "epoch": 8.060023830801311, "grad_norm": 0.9442831873893738, "learning_rate": 3.727306943551538e-05, "loss": 0.5883, "num_input_tokens_seen": 31417224, "step": 54115 }, { "epoch": 8.06076854334227, "grad_norm": 1.275571346282959, "learning_rate": 3.727023842583557e-05, "loss": 0.4704, "num_input_tokens_seen": 31420168, "step": 54120 }, { "epoch": 8.061513255883229, "grad_norm": 1.0551804304122925, "learning_rate": 3.726740720886255e-05, "loss": 0.4806, "num_input_tokens_seen": 31423112, "step": 54125 }, { "epoch": 8.062257968424188, "grad_norm": 0.6325231790542603, "learning_rate": 3.726457578464416e-05, "loss": 0.5003, "num_input_tokens_seen": 31426408, "step": 54130 }, { "epoch": 8.063002680965148, "grad_norm": 1.3698959350585938, "learning_rate": 3.726174415322822e-05, "loss": 0.5238, "num_input_tokens_seen": 31429384, "step": 54135 }, { "epoch": 8.063747393506107, "grad_norm": 1.2668397426605225, "learning_rate": 3.725891231466258e-05, "loss": 0.9138, "num_input_tokens_seen": 31432328, "step": 54140 }, { "epoch": 8.064492106047066, "grad_norm": 1.4362393617630005, "learning_rate": 3.725608026899507e-05, "loss": 0.6213, "num_input_tokens_seen": 31435400, "step": 54145 }, { "epoch": 8.065236818588025, "grad_norm": 1.3449808359146118, "learning_rate": 3.7253248016273545e-05, "loss": 0.5371, "num_input_tokens_seen": 31438216, "step": 54150 }, { "epoch": 8.065981531128985, "grad_norm": 1.4284390211105347, "learning_rate": 3.725041555654585e-05, "loss": 0.6833, "num_input_tokens_seen": 31441128, "step": 54155 }, { "epoch": 8.066726243669944, "grad_norm": 2.299286127090454, "learning_rate": 3.7247582889859824e-05, "loss": 0.7858, "num_input_tokens_seen": 31444136, "step": 54160 }, { "epoch": 8.067470956210903, "grad_norm": 1.797550082206726, "learning_rate": 3.724475001626335e-05, "loss": 0.7377, "num_input_tokens_seen": 31447432, "step": 54165 }, { "epoch": 8.068215668751861, "grad_norm": 1.4788955450057983, "learning_rate": 3.7241916935804254e-05, "loss": 0.6034, "num_input_tokens_seen": 31450440, "step": 54170 }, { "epoch": 8.06896038129282, "grad_norm": 1.4926050901412964, "learning_rate": 3.723908364853042e-05, "loss": 0.6213, "num_input_tokens_seen": 31453288, "step": 54175 }, { "epoch": 8.06970509383378, "grad_norm": 0.7694974541664124, "learning_rate": 3.723625015448971e-05, "loss": 0.5135, "num_input_tokens_seen": 31456136, "step": 54180 }, { "epoch": 8.07044980637474, "grad_norm": 0.8055437803268433, "learning_rate": 3.723341645372998e-05, "loss": 0.5111, "num_input_tokens_seen": 31459048, "step": 54185 }, { "epoch": 8.071194518915698, "grad_norm": 1.8114094734191895, "learning_rate": 3.723058254629912e-05, "loss": 0.6109, "num_input_tokens_seen": 31461928, "step": 54190 }, { "epoch": 8.071939231456657, "grad_norm": 1.3411861658096313, "learning_rate": 3.7227748432245e-05, "loss": 0.6239, "num_input_tokens_seen": 31464872, "step": 54195 }, { "epoch": 8.072683943997617, "grad_norm": 1.4652812480926514, "learning_rate": 3.722491411161549e-05, "loss": 0.5333, "num_input_tokens_seen": 31467464, "step": 54200 }, { "epoch": 8.073428656538576, "grad_norm": 1.1354728937149048, "learning_rate": 3.722207958445849e-05, "loss": 0.595, "num_input_tokens_seen": 31470344, "step": 54205 }, { "epoch": 8.074173369079535, "grad_norm": 1.449260950088501, "learning_rate": 3.721924485082187e-05, "loss": 0.6724, "num_input_tokens_seen": 31473352, "step": 54210 }, { "epoch": 8.074918081620494, "grad_norm": 1.4586901664733887, "learning_rate": 3.721640991075354e-05, "loss": 0.888, "num_input_tokens_seen": 31476328, "step": 54215 }, { "epoch": 8.075662794161454, "grad_norm": 1.3018807172775269, "learning_rate": 3.7213574764301363e-05, "loss": 0.7059, "num_input_tokens_seen": 31479656, "step": 54220 }, { "epoch": 8.076407506702413, "grad_norm": 1.1371787786483765, "learning_rate": 3.721073941151327e-05, "loss": 0.7231, "num_input_tokens_seen": 31482536, "step": 54225 }, { "epoch": 8.077152219243372, "grad_norm": 0.7871238589286804, "learning_rate": 3.7207903852437134e-05, "loss": 0.4529, "num_input_tokens_seen": 31485480, "step": 54230 }, { "epoch": 8.07789693178433, "grad_norm": 0.818048357963562, "learning_rate": 3.7205068087120876e-05, "loss": 0.5498, "num_input_tokens_seen": 31488392, "step": 54235 }, { "epoch": 8.078641644325291, "grad_norm": 1.274933099746704, "learning_rate": 3.7202232115612396e-05, "loss": 0.5588, "num_input_tokens_seen": 31491304, "step": 54240 }, { "epoch": 8.07938635686625, "grad_norm": 0.9045366048812866, "learning_rate": 3.7199395937959604e-05, "loss": 0.5487, "num_input_tokens_seen": 31494280, "step": 54245 }, { "epoch": 8.080131069407209, "grad_norm": 1.744572401046753, "learning_rate": 3.7196559554210415e-05, "loss": 0.7165, "num_input_tokens_seen": 31497128, "step": 54250 }, { "epoch": 8.080875781948167, "grad_norm": 1.8923524618148804, "learning_rate": 3.719372296441275e-05, "loss": 0.6958, "num_input_tokens_seen": 31499848, "step": 54255 }, { "epoch": 8.081620494489128, "grad_norm": 1.1596959829330444, "learning_rate": 3.719088616861453e-05, "loss": 0.5513, "num_input_tokens_seen": 31502760, "step": 54260 }, { "epoch": 8.082365207030087, "grad_norm": 1.5763565301895142, "learning_rate": 3.718804916686368e-05, "loss": 0.6636, "num_input_tokens_seen": 31505480, "step": 54265 }, { "epoch": 8.083109919571045, "grad_norm": 1.8671423196792603, "learning_rate": 3.7185211959208124e-05, "loss": 0.6365, "num_input_tokens_seen": 31508104, "step": 54270 }, { "epoch": 8.083854632112004, "grad_norm": 1.912984013557434, "learning_rate": 3.71823745456958e-05, "loss": 0.6708, "num_input_tokens_seen": 31511272, "step": 54275 }, { "epoch": 8.084599344652965, "grad_norm": 1.535288691520691, "learning_rate": 3.7179536926374636e-05, "loss": 0.7637, "num_input_tokens_seen": 31514216, "step": 54280 }, { "epoch": 8.085344057193923, "grad_norm": 1.6481884717941284, "learning_rate": 3.7176699101292574e-05, "loss": 0.629, "num_input_tokens_seen": 31517096, "step": 54285 }, { "epoch": 8.086088769734882, "grad_norm": 1.2847987413406372, "learning_rate": 3.7173861070497556e-05, "loss": 0.3987, "num_input_tokens_seen": 31519912, "step": 54290 }, { "epoch": 8.086833482275841, "grad_norm": 1.1982064247131348, "learning_rate": 3.717102283403753e-05, "loss": 0.5598, "num_input_tokens_seen": 31522824, "step": 54295 }, { "epoch": 8.087578194816802, "grad_norm": 2.0548250675201416, "learning_rate": 3.716818439196045e-05, "loss": 0.5067, "num_input_tokens_seen": 31525960, "step": 54300 }, { "epoch": 8.08832290735776, "grad_norm": 1.5506457090377808, "learning_rate": 3.716534574431425e-05, "loss": 0.5316, "num_input_tokens_seen": 31528936, "step": 54305 }, { "epoch": 8.089067619898719, "grad_norm": 1.3411084413528442, "learning_rate": 3.7162506891146896e-05, "loss": 0.5724, "num_input_tokens_seen": 31531656, "step": 54310 }, { "epoch": 8.089812332439678, "grad_norm": 1.2802505493164062, "learning_rate": 3.7159667832506365e-05, "loss": 0.4696, "num_input_tokens_seen": 31534440, "step": 54315 }, { "epoch": 8.090557044980638, "grad_norm": 1.8776297569274902, "learning_rate": 3.715682856844059e-05, "loss": 0.5498, "num_input_tokens_seen": 31537128, "step": 54320 }, { "epoch": 8.091301757521597, "grad_norm": 0.8112499713897705, "learning_rate": 3.715398909899756e-05, "loss": 0.4458, "num_input_tokens_seen": 31539848, "step": 54325 }, { "epoch": 8.092046470062556, "grad_norm": 1.1660268306732178, "learning_rate": 3.715114942422524e-05, "loss": 0.5679, "num_input_tokens_seen": 31542792, "step": 54330 }, { "epoch": 8.092791182603515, "grad_norm": 1.446174144744873, "learning_rate": 3.71483095441716e-05, "loss": 0.5574, "num_input_tokens_seen": 31545640, "step": 54335 }, { "epoch": 8.093535895144473, "grad_norm": 3.418952226638794, "learning_rate": 3.7145469458884606e-05, "loss": 0.7228, "num_input_tokens_seen": 31548392, "step": 54340 }, { "epoch": 8.094280607685434, "grad_norm": 0.9731821417808533, "learning_rate": 3.714262916841226e-05, "loss": 0.6492, "num_input_tokens_seen": 31551240, "step": 54345 }, { "epoch": 8.095025320226393, "grad_norm": 1.4697731733322144, "learning_rate": 3.7139788672802526e-05, "loss": 0.6862, "num_input_tokens_seen": 31554472, "step": 54350 }, { "epoch": 8.095770032767351, "grad_norm": 1.2504380941390991, "learning_rate": 3.713694797210341e-05, "loss": 0.7431, "num_input_tokens_seen": 31557224, "step": 54355 }, { "epoch": 8.09651474530831, "grad_norm": 2.03056001663208, "learning_rate": 3.713410706636289e-05, "loss": 0.6754, "num_input_tokens_seen": 31560168, "step": 54360 }, { "epoch": 8.09725945784927, "grad_norm": 2.6444013118743896, "learning_rate": 3.713126595562896e-05, "loss": 0.8032, "num_input_tokens_seen": 31562920, "step": 54365 }, { "epoch": 8.09800417039023, "grad_norm": 1.1533350944519043, "learning_rate": 3.712842463994963e-05, "loss": 0.5353, "num_input_tokens_seen": 31566056, "step": 54370 }, { "epoch": 8.098748882931188, "grad_norm": 1.0375438928604126, "learning_rate": 3.7125583119372884e-05, "loss": 0.6205, "num_input_tokens_seen": 31568744, "step": 54375 }, { "epoch": 8.099493595472147, "grad_norm": 1.4285306930541992, "learning_rate": 3.712274139394674e-05, "loss": 0.6317, "num_input_tokens_seen": 31571688, "step": 54380 }, { "epoch": 8.100238308013108, "grad_norm": 0.8807997703552246, "learning_rate": 3.71198994637192e-05, "loss": 0.6531, "num_input_tokens_seen": 31574632, "step": 54385 }, { "epoch": 8.100983020554066, "grad_norm": 1.5302814245224, "learning_rate": 3.711705732873828e-05, "loss": 0.6687, "num_input_tokens_seen": 31577672, "step": 54390 }, { "epoch": 8.101727733095025, "grad_norm": 1.3351775407791138, "learning_rate": 3.711421498905198e-05, "loss": 0.8269, "num_input_tokens_seen": 31580616, "step": 54395 }, { "epoch": 8.102472445635984, "grad_norm": 0.5535401105880737, "learning_rate": 3.7111372444708345e-05, "loss": 0.664, "num_input_tokens_seen": 31583368, "step": 54400 }, { "epoch": 8.103217158176944, "grad_norm": 1.5162886381149292, "learning_rate": 3.7108529695755375e-05, "loss": 0.535, "num_input_tokens_seen": 31586280, "step": 54405 }, { "epoch": 8.103961870717903, "grad_norm": 0.9173539876937866, "learning_rate": 3.7105686742241095e-05, "loss": 0.654, "num_input_tokens_seen": 31589032, "step": 54410 }, { "epoch": 8.104706583258862, "grad_norm": 1.12931489944458, "learning_rate": 3.7102843584213556e-05, "loss": 0.5256, "num_input_tokens_seen": 31591848, "step": 54415 }, { "epoch": 8.10545129579982, "grad_norm": 1.1533491611480713, "learning_rate": 3.710000022172076e-05, "loss": 0.6864, "num_input_tokens_seen": 31594888, "step": 54420 }, { "epoch": 8.106196008340781, "grad_norm": 1.7011282444000244, "learning_rate": 3.7097156654810774e-05, "loss": 0.6907, "num_input_tokens_seen": 31598216, "step": 54425 }, { "epoch": 8.10694072088174, "grad_norm": 1.1192233562469482, "learning_rate": 3.709431288353161e-05, "loss": 0.815, "num_input_tokens_seen": 31601128, "step": 54430 }, { "epoch": 8.107685433422699, "grad_norm": 1.4431790113449097, "learning_rate": 3.7091468907931324e-05, "loss": 0.517, "num_input_tokens_seen": 31603912, "step": 54435 }, { "epoch": 8.108430145963657, "grad_norm": 1.498306155204773, "learning_rate": 3.708862472805796e-05, "loss": 0.7247, "num_input_tokens_seen": 31606856, "step": 54440 }, { "epoch": 8.109174858504618, "grad_norm": 1.215125560760498, "learning_rate": 3.708578034395957e-05, "loss": 0.5365, "num_input_tokens_seen": 31609544, "step": 54445 }, { "epoch": 8.109919571045577, "grad_norm": 1.6729414463043213, "learning_rate": 3.70829357556842e-05, "loss": 0.6608, "num_input_tokens_seen": 31612520, "step": 54450 }, { "epoch": 8.110664283586535, "grad_norm": 1.1695653200149536, "learning_rate": 3.7080090963279915e-05, "loss": 0.6066, "num_input_tokens_seen": 31615368, "step": 54455 }, { "epoch": 8.111408996127494, "grad_norm": 0.9653853178024292, "learning_rate": 3.7077245966794774e-05, "loss": 0.7007, "num_input_tokens_seen": 31618184, "step": 54460 }, { "epoch": 8.112153708668455, "grad_norm": 1.24351966381073, "learning_rate": 3.707440076627683e-05, "loss": 0.6488, "num_input_tokens_seen": 31620968, "step": 54465 }, { "epoch": 8.112898421209414, "grad_norm": 1.1989573240280151, "learning_rate": 3.7071555361774165e-05, "loss": 0.5947, "num_input_tokens_seen": 31623976, "step": 54470 }, { "epoch": 8.113643133750372, "grad_norm": 1.358808994293213, "learning_rate": 3.706870975333484e-05, "loss": 0.7548, "num_input_tokens_seen": 31626920, "step": 54475 }, { "epoch": 8.114387846291331, "grad_norm": 1.0586228370666504, "learning_rate": 3.706586394100692e-05, "loss": 0.6757, "num_input_tokens_seen": 31629832, "step": 54480 }, { "epoch": 8.115132558832292, "grad_norm": 1.1238329410552979, "learning_rate": 3.70630179248385e-05, "loss": 0.6541, "num_input_tokens_seen": 31632776, "step": 54485 }, { "epoch": 8.11587727137325, "grad_norm": 2.1567840576171875, "learning_rate": 3.706017170487765e-05, "loss": 0.6166, "num_input_tokens_seen": 31635656, "step": 54490 }, { "epoch": 8.116621983914209, "grad_norm": 1.5919562578201294, "learning_rate": 3.705732528117246e-05, "loss": 0.6628, "num_input_tokens_seen": 31638504, "step": 54495 }, { "epoch": 8.117366696455168, "grad_norm": 1.222153663635254, "learning_rate": 3.7054478653771005e-05, "loss": 0.7663, "num_input_tokens_seen": 31641544, "step": 54500 }, { "epoch": 8.118111408996128, "grad_norm": 1.5469865798950195, "learning_rate": 3.7051631822721395e-05, "loss": 0.6443, "num_input_tokens_seen": 31644360, "step": 54505 }, { "epoch": 8.118856121537087, "grad_norm": 0.7772789597511292, "learning_rate": 3.704878478807171e-05, "loss": 0.6631, "num_input_tokens_seen": 31647272, "step": 54510 }, { "epoch": 8.119600834078046, "grad_norm": 3.1116819381713867, "learning_rate": 3.704593754987005e-05, "loss": 0.6483, "num_input_tokens_seen": 31650440, "step": 54515 }, { "epoch": 8.120345546619005, "grad_norm": 1.2250800132751465, "learning_rate": 3.704309010816452e-05, "loss": 0.5677, "num_input_tokens_seen": 31653576, "step": 54520 }, { "epoch": 8.121090259159963, "grad_norm": 1.3868634700775146, "learning_rate": 3.7040242463003225e-05, "loss": 0.6736, "num_input_tokens_seen": 31656488, "step": 54525 }, { "epoch": 8.121834971700924, "grad_norm": 1.7815473079681396, "learning_rate": 3.703739461443427e-05, "loss": 0.658, "num_input_tokens_seen": 31659112, "step": 54530 }, { "epoch": 8.122579684241883, "grad_norm": 1.4459030628204346, "learning_rate": 3.703454656250576e-05, "loss": 0.5831, "num_input_tokens_seen": 31662056, "step": 54535 }, { "epoch": 8.123324396782841, "grad_norm": 1.607147216796875, "learning_rate": 3.7031698307265824e-05, "loss": 0.486, "num_input_tokens_seen": 31664936, "step": 54540 }, { "epoch": 8.1240691093238, "grad_norm": 0.9955068230628967, "learning_rate": 3.702884984876257e-05, "loss": 0.4888, "num_input_tokens_seen": 31668232, "step": 54545 }, { "epoch": 8.12481382186476, "grad_norm": 1.8188197612762451, "learning_rate": 3.702600118704412e-05, "loss": 0.8206, "num_input_tokens_seen": 31671144, "step": 54550 }, { "epoch": 8.12555853440572, "grad_norm": 1.109086275100708, "learning_rate": 3.702315232215862e-05, "loss": 0.5122, "num_input_tokens_seen": 31673992, "step": 54555 }, { "epoch": 8.126303246946678, "grad_norm": 2.4320788383483887, "learning_rate": 3.7020303254154164e-05, "loss": 0.6676, "num_input_tokens_seen": 31676936, "step": 54560 }, { "epoch": 8.127047959487637, "grad_norm": 3.6486918926239014, "learning_rate": 3.701745398307891e-05, "loss": 0.755, "num_input_tokens_seen": 31680008, "step": 54565 }, { "epoch": 8.127792672028598, "grad_norm": 1.1429681777954102, "learning_rate": 3.701460450898098e-05, "loss": 0.5638, "num_input_tokens_seen": 31682760, "step": 54570 }, { "epoch": 8.128537384569556, "grad_norm": 0.9274199604988098, "learning_rate": 3.701175483190852e-05, "loss": 0.582, "num_input_tokens_seen": 31685672, "step": 54575 }, { "epoch": 8.129282097110515, "grad_norm": 1.051041841506958, "learning_rate": 3.700890495190967e-05, "loss": 0.6859, "num_input_tokens_seen": 31688392, "step": 54580 }, { "epoch": 8.130026809651474, "grad_norm": 1.4510549306869507, "learning_rate": 3.7006054869032574e-05, "loss": 0.59, "num_input_tokens_seen": 31691528, "step": 54585 }, { "epoch": 8.130771522192434, "grad_norm": 0.8536702990531921, "learning_rate": 3.700320458332539e-05, "loss": 0.5283, "num_input_tokens_seen": 31694568, "step": 54590 }, { "epoch": 8.131516234733393, "grad_norm": 1.3661121129989624, "learning_rate": 3.700035409483626e-05, "loss": 0.6115, "num_input_tokens_seen": 31698152, "step": 54595 }, { "epoch": 8.132260947274352, "grad_norm": 1.500986099243164, "learning_rate": 3.699750340361334e-05, "loss": 0.7136, "num_input_tokens_seen": 31700968, "step": 54600 }, { "epoch": 8.13300565981531, "grad_norm": 1.6214810609817505, "learning_rate": 3.69946525097048e-05, "loss": 0.6361, "num_input_tokens_seen": 31704136, "step": 54605 }, { "epoch": 8.133750372356271, "grad_norm": 1.3572673797607422, "learning_rate": 3.6991801413158795e-05, "loss": 0.6896, "num_input_tokens_seen": 31706952, "step": 54610 }, { "epoch": 8.13449508489723, "grad_norm": 1.1309096813201904, "learning_rate": 3.6988950114023494e-05, "loss": 0.5523, "num_input_tokens_seen": 31709864, "step": 54615 }, { "epoch": 8.135239797438189, "grad_norm": 1.1576446294784546, "learning_rate": 3.6986098612347056e-05, "loss": 0.8062, "num_input_tokens_seen": 31713128, "step": 54620 }, { "epoch": 8.135984509979147, "grad_norm": 1.2717782258987427, "learning_rate": 3.6983246908177675e-05, "loss": 0.7327, "num_input_tokens_seen": 31716008, "step": 54625 }, { "epoch": 8.136729222520108, "grad_norm": 1.3545926809310913, "learning_rate": 3.698039500156352e-05, "loss": 0.7013, "num_input_tokens_seen": 31719048, "step": 54630 }, { "epoch": 8.137473935061067, "grad_norm": 0.9496854543685913, "learning_rate": 3.697754289255277e-05, "loss": 0.5269, "num_input_tokens_seen": 31721608, "step": 54635 }, { "epoch": 8.138218647602026, "grad_norm": 0.9213368892669678, "learning_rate": 3.697469058119359e-05, "loss": 0.6022, "num_input_tokens_seen": 31724520, "step": 54640 }, { "epoch": 8.138963360142984, "grad_norm": 1.603948712348938, "learning_rate": 3.697183806753419e-05, "loss": 0.5593, "num_input_tokens_seen": 31727240, "step": 54645 }, { "epoch": 8.139708072683945, "grad_norm": 1.0436103343963623, "learning_rate": 3.696898535162275e-05, "loss": 0.5836, "num_input_tokens_seen": 31730120, "step": 54650 }, { "epoch": 8.140452785224904, "grad_norm": 1.1013035774230957, "learning_rate": 3.696613243350747e-05, "loss": 0.5954, "num_input_tokens_seen": 31732872, "step": 54655 }, { "epoch": 8.141197497765862, "grad_norm": 3.477383613586426, "learning_rate": 3.696327931323655e-05, "loss": 0.7839, "num_input_tokens_seen": 31735976, "step": 54660 }, { "epoch": 8.141942210306821, "grad_norm": 1.8275682926177979, "learning_rate": 3.696042599085818e-05, "loss": 0.761, "num_input_tokens_seen": 31738984, "step": 54665 }, { "epoch": 8.142686922847782, "grad_norm": 1.1632041931152344, "learning_rate": 3.695757246642057e-05, "loss": 0.7357, "num_input_tokens_seen": 31741736, "step": 54670 }, { "epoch": 8.14343163538874, "grad_norm": 1.7758578062057495, "learning_rate": 3.695471873997193e-05, "loss": 0.6322, "num_input_tokens_seen": 31745064, "step": 54675 }, { "epoch": 8.1441763479297, "grad_norm": 0.8106017112731934, "learning_rate": 3.6951864811560464e-05, "loss": 0.498, "num_input_tokens_seen": 31748072, "step": 54680 }, { "epoch": 8.144921060470658, "grad_norm": 1.181039810180664, "learning_rate": 3.694901068123439e-05, "loss": 0.6992, "num_input_tokens_seen": 31750888, "step": 54685 }, { "epoch": 8.145665773011617, "grad_norm": 0.7400733232498169, "learning_rate": 3.694615634904192e-05, "loss": 0.6874, "num_input_tokens_seen": 31753896, "step": 54690 }, { "epoch": 8.146410485552577, "grad_norm": 2.1496293544769287, "learning_rate": 3.694330181503128e-05, "loss": 0.6981, "num_input_tokens_seen": 31756584, "step": 54695 }, { "epoch": 8.147155198093536, "grad_norm": 2.2850422859191895, "learning_rate": 3.69404470792507e-05, "loss": 0.6868, "num_input_tokens_seen": 31759336, "step": 54700 }, { "epoch": 8.147899910634495, "grad_norm": 1.2650707960128784, "learning_rate": 3.69375921417484e-05, "loss": 0.4224, "num_input_tokens_seen": 31762152, "step": 54705 }, { "epoch": 8.148644623175453, "grad_norm": 1.334841012954712, "learning_rate": 3.6934737002572614e-05, "loss": 0.6798, "num_input_tokens_seen": 31765160, "step": 54710 }, { "epoch": 8.149389335716414, "grad_norm": 1.4314757585525513, "learning_rate": 3.693188166177158e-05, "loss": 0.6703, "num_input_tokens_seen": 31767976, "step": 54715 }, { "epoch": 8.150134048257373, "grad_norm": 0.9288536906242371, "learning_rate": 3.6929026119393525e-05, "loss": 0.6436, "num_input_tokens_seen": 31770856, "step": 54720 }, { "epoch": 8.150878760798332, "grad_norm": 1.1459662914276123, "learning_rate": 3.69261703754867e-05, "loss": 0.8124, "num_input_tokens_seen": 31773864, "step": 54725 }, { "epoch": 8.15162347333929, "grad_norm": 1.2935246229171753, "learning_rate": 3.6923314430099354e-05, "loss": 0.4256, "num_input_tokens_seen": 31776776, "step": 54730 }, { "epoch": 8.15236818588025, "grad_norm": 0.8550208210945129, "learning_rate": 3.6920458283279725e-05, "loss": 0.6517, "num_input_tokens_seen": 31779496, "step": 54735 }, { "epoch": 8.15311289842121, "grad_norm": 1.0381884574890137, "learning_rate": 3.691760193507607e-05, "loss": 0.6121, "num_input_tokens_seen": 31782536, "step": 54740 }, { "epoch": 8.153857610962168, "grad_norm": 1.7203404903411865, "learning_rate": 3.691474538553664e-05, "loss": 0.4746, "num_input_tokens_seen": 31785448, "step": 54745 }, { "epoch": 8.154602323503127, "grad_norm": 1.6276942491531372, "learning_rate": 3.691188863470969e-05, "loss": 0.6088, "num_input_tokens_seen": 31788232, "step": 54750 }, { "epoch": 8.155347036044088, "grad_norm": 1.0366030931472778, "learning_rate": 3.69090316826435e-05, "loss": 0.4793, "num_input_tokens_seen": 31791240, "step": 54755 }, { "epoch": 8.156091748585046, "grad_norm": 0.8521009087562561, "learning_rate": 3.690617452938632e-05, "loss": 0.489, "num_input_tokens_seen": 31794088, "step": 54760 }, { "epoch": 8.156836461126005, "grad_norm": 0.9533536434173584, "learning_rate": 3.6903317174986425e-05, "loss": 0.574, "num_input_tokens_seen": 31797128, "step": 54765 }, { "epoch": 8.157581173666964, "grad_norm": 1.1834858655929565, "learning_rate": 3.690045961949208e-05, "loss": 0.7311, "num_input_tokens_seen": 31799912, "step": 54770 }, { "epoch": 8.158325886207924, "grad_norm": 1.4763071537017822, "learning_rate": 3.689760186295156e-05, "loss": 0.5333, "num_input_tokens_seen": 31803016, "step": 54775 }, { "epoch": 8.159070598748883, "grad_norm": 1.614507794380188, "learning_rate": 3.689474390541316e-05, "loss": 0.6436, "num_input_tokens_seen": 31805832, "step": 54780 }, { "epoch": 8.159815311289842, "grad_norm": 0.8869487643241882, "learning_rate": 3.6891885746925136e-05, "loss": 0.5456, "num_input_tokens_seen": 31808776, "step": 54785 }, { "epoch": 8.1605600238308, "grad_norm": 1.324110984802246, "learning_rate": 3.68890273875358e-05, "loss": 0.757, "num_input_tokens_seen": 31812200, "step": 54790 }, { "epoch": 8.161304736371761, "grad_norm": 1.7494734525680542, "learning_rate": 3.688616882729343e-05, "loss": 0.6784, "num_input_tokens_seen": 31814952, "step": 54795 }, { "epoch": 8.16204944891272, "grad_norm": 1.6649224758148193, "learning_rate": 3.688331006624632e-05, "loss": 0.2981, "num_input_tokens_seen": 31817864, "step": 54800 }, { "epoch": 8.162794161453679, "grad_norm": 2.4115729331970215, "learning_rate": 3.688045110444276e-05, "loss": 0.8181, "num_input_tokens_seen": 31820872, "step": 54805 }, { "epoch": 8.163538873994638, "grad_norm": 1.614373803138733, "learning_rate": 3.687759194193105e-05, "loss": 0.6141, "num_input_tokens_seen": 31823656, "step": 54810 }, { "epoch": 8.164283586535598, "grad_norm": 1.8155146837234497, "learning_rate": 3.6874732578759495e-05, "loss": 0.6811, "num_input_tokens_seen": 31826760, "step": 54815 }, { "epoch": 8.165028299076557, "grad_norm": 1.0012794733047485, "learning_rate": 3.687187301497641e-05, "loss": 0.5719, "num_input_tokens_seen": 31829768, "step": 54820 }, { "epoch": 8.165773011617516, "grad_norm": 1.2024908065795898, "learning_rate": 3.6869013250630094e-05, "loss": 0.5643, "num_input_tokens_seen": 31832776, "step": 54825 }, { "epoch": 8.166517724158474, "grad_norm": 1.7729392051696777, "learning_rate": 3.686615328576886e-05, "loss": 0.6173, "num_input_tokens_seen": 31835720, "step": 54830 }, { "epoch": 8.167262436699435, "grad_norm": 0.8819698095321655, "learning_rate": 3.686329312044102e-05, "loss": 0.5106, "num_input_tokens_seen": 31838408, "step": 54835 }, { "epoch": 8.168007149240394, "grad_norm": 1.092630386352539, "learning_rate": 3.6860432754694915e-05, "loss": 0.6712, "num_input_tokens_seen": 31841416, "step": 54840 }, { "epoch": 8.168751861781352, "grad_norm": 1.2224904298782349, "learning_rate": 3.685757218857885e-05, "loss": 0.5794, "num_input_tokens_seen": 31844232, "step": 54845 }, { "epoch": 8.169496574322311, "grad_norm": 1.2039793729782104, "learning_rate": 3.6854711422141144e-05, "loss": 0.8141, "num_input_tokens_seen": 31847208, "step": 54850 }, { "epoch": 8.17024128686327, "grad_norm": 1.8459447622299194, "learning_rate": 3.685185045543014e-05, "loss": 0.544, "num_input_tokens_seen": 31849832, "step": 54855 }, { "epoch": 8.17098599940423, "grad_norm": 0.8124223947525024, "learning_rate": 3.684898928849417e-05, "loss": 0.6337, "num_input_tokens_seen": 31852776, "step": 54860 }, { "epoch": 8.17173071194519, "grad_norm": 1.4326978921890259, "learning_rate": 3.6846127921381576e-05, "loss": 0.6691, "num_input_tokens_seen": 31856008, "step": 54865 }, { "epoch": 8.172475424486148, "grad_norm": 0.5829999446868896, "learning_rate": 3.684326635414068e-05, "loss": 0.5298, "num_input_tokens_seen": 31858888, "step": 54870 }, { "epoch": 8.173220137027107, "grad_norm": 1.1814113855361938, "learning_rate": 3.684040458681984e-05, "loss": 0.8388, "num_input_tokens_seen": 31861672, "step": 54875 }, { "epoch": 8.173964849568067, "grad_norm": 1.014957308769226, "learning_rate": 3.6837542619467404e-05, "loss": 0.5318, "num_input_tokens_seen": 31864744, "step": 54880 }, { "epoch": 8.174709562109026, "grad_norm": 0.8924382328987122, "learning_rate": 3.6834680452131707e-05, "loss": 0.5379, "num_input_tokens_seen": 31867912, "step": 54885 }, { "epoch": 8.175454274649985, "grad_norm": 1.5284953117370605, "learning_rate": 3.683181808486112e-05, "loss": 0.6335, "num_input_tokens_seen": 31870856, "step": 54890 }, { "epoch": 8.176198987190944, "grad_norm": 1.5369807481765747, "learning_rate": 3.682895551770399e-05, "loss": 0.6293, "num_input_tokens_seen": 31873832, "step": 54895 }, { "epoch": 8.176943699731904, "grad_norm": 1.0689311027526855, "learning_rate": 3.682609275070867e-05, "loss": 0.4711, "num_input_tokens_seen": 31876424, "step": 54900 }, { "epoch": 8.177688412272863, "grad_norm": 1.287293553352356, "learning_rate": 3.682322978392354e-05, "loss": 0.6782, "num_input_tokens_seen": 31879368, "step": 54905 }, { "epoch": 8.178433124813822, "grad_norm": 1.1248801946640015, "learning_rate": 3.682036661739696e-05, "loss": 0.7523, "num_input_tokens_seen": 31882312, "step": 54910 }, { "epoch": 8.17917783735478, "grad_norm": 0.9155014157295227, "learning_rate": 3.68175032511773e-05, "loss": 0.6123, "num_input_tokens_seen": 31885224, "step": 54915 }, { "epoch": 8.17992254989574, "grad_norm": 1.324212670326233, "learning_rate": 3.6814639685312936e-05, "loss": 0.7253, "num_input_tokens_seen": 31888040, "step": 54920 }, { "epoch": 8.1806672624367, "grad_norm": 0.6261774897575378, "learning_rate": 3.6811775919852245e-05, "loss": 0.411, "num_input_tokens_seen": 31891144, "step": 54925 }, { "epoch": 8.181411974977658, "grad_norm": 1.1604297161102295, "learning_rate": 3.6808911954843595e-05, "loss": 0.476, "num_input_tokens_seen": 31894216, "step": 54930 }, { "epoch": 8.182156687518617, "grad_norm": 0.6152943968772888, "learning_rate": 3.680604779033538e-05, "loss": 0.5059, "num_input_tokens_seen": 31897192, "step": 54935 }, { "epoch": 8.182901400059578, "grad_norm": 0.8295131921768188, "learning_rate": 3.680318342637599e-05, "loss": 0.6477, "num_input_tokens_seen": 31900072, "step": 54940 }, { "epoch": 8.183646112600536, "grad_norm": 1.6004542112350464, "learning_rate": 3.6800318863013806e-05, "loss": 0.6678, "num_input_tokens_seen": 31903112, "step": 54945 }, { "epoch": 8.184390825141495, "grad_norm": 0.962463915348053, "learning_rate": 3.6797454100297234e-05, "loss": 0.4605, "num_input_tokens_seen": 31905864, "step": 54950 }, { "epoch": 8.185135537682454, "grad_norm": 0.9358662962913513, "learning_rate": 3.679458913827467e-05, "loss": 0.6672, "num_input_tokens_seen": 31909096, "step": 54955 }, { "epoch": 8.185880250223414, "grad_norm": 1.1590044498443604, "learning_rate": 3.6791723976994505e-05, "loss": 0.5794, "num_input_tokens_seen": 31911848, "step": 54960 }, { "epoch": 8.186624962764373, "grad_norm": 1.1040812730789185, "learning_rate": 3.678885861650515e-05, "loss": 0.6246, "num_input_tokens_seen": 31914536, "step": 54965 }, { "epoch": 8.187369675305332, "grad_norm": 1.5590407848358154, "learning_rate": 3.6785993056855004e-05, "loss": 0.7234, "num_input_tokens_seen": 31917448, "step": 54970 }, { "epoch": 8.18811438784629, "grad_norm": 1.9824652671813965, "learning_rate": 3.678312729809249e-05, "loss": 0.7388, "num_input_tokens_seen": 31920456, "step": 54975 }, { "epoch": 8.188859100387251, "grad_norm": 1.6843794584274292, "learning_rate": 3.6780261340266014e-05, "loss": 0.6033, "num_input_tokens_seen": 31923368, "step": 54980 }, { "epoch": 8.18960381292821, "grad_norm": 2.0660319328308105, "learning_rate": 3.677739518342399e-05, "loss": 0.5667, "num_input_tokens_seen": 31926376, "step": 54985 }, { "epoch": 8.190348525469169, "grad_norm": 1.787373661994934, "learning_rate": 3.677452882761486e-05, "loss": 0.7094, "num_input_tokens_seen": 31929160, "step": 54990 }, { "epoch": 8.191093238010128, "grad_norm": 1.183275580406189, "learning_rate": 3.677166227288702e-05, "loss": 0.7546, "num_input_tokens_seen": 31932040, "step": 54995 }, { "epoch": 8.191837950551088, "grad_norm": 1.130277395248413, "learning_rate": 3.6768795519288916e-05, "loss": 0.7726, "num_input_tokens_seen": 31935432, "step": 55000 }, { "epoch": 8.192582663092047, "grad_norm": 1.652058482170105, "learning_rate": 3.6765928566868976e-05, "loss": 0.7998, "num_input_tokens_seen": 31938120, "step": 55005 }, { "epoch": 8.193327375633006, "grad_norm": 0.990929365158081, "learning_rate": 3.676306141567562e-05, "loss": 0.582, "num_input_tokens_seen": 31940968, "step": 55010 }, { "epoch": 8.194072088173964, "grad_norm": 1.4356977939605713, "learning_rate": 3.67601940657573e-05, "loss": 0.5402, "num_input_tokens_seen": 31943720, "step": 55015 }, { "epoch": 8.194816800714925, "grad_norm": 1.0388038158416748, "learning_rate": 3.675732651716246e-05, "loss": 0.6099, "num_input_tokens_seen": 31946664, "step": 55020 }, { "epoch": 8.195561513255884, "grad_norm": 0.8969061970710754, "learning_rate": 3.675445876993953e-05, "loss": 0.6452, "num_input_tokens_seen": 31949544, "step": 55025 }, { "epoch": 8.196306225796842, "grad_norm": 1.4711328744888306, "learning_rate": 3.675159082413697e-05, "loss": 0.6616, "num_input_tokens_seen": 31952392, "step": 55030 }, { "epoch": 8.197050938337801, "grad_norm": 1.5021474361419678, "learning_rate": 3.674872267980323e-05, "loss": 0.47, "num_input_tokens_seen": 31955304, "step": 55035 }, { "epoch": 8.19779565087876, "grad_norm": 1.0195919275283813, "learning_rate": 3.674585433698676e-05, "loss": 0.6591, "num_input_tokens_seen": 31958056, "step": 55040 }, { "epoch": 8.19854036341972, "grad_norm": 1.9781299829483032, "learning_rate": 3.674298579573602e-05, "loss": 0.5794, "num_input_tokens_seen": 31961096, "step": 55045 }, { "epoch": 8.19928507596068, "grad_norm": 1.1932344436645508, "learning_rate": 3.674011705609946e-05, "loss": 0.704, "num_input_tokens_seen": 31963976, "step": 55050 }, { "epoch": 8.200029788501638, "grad_norm": 1.0908385515213013, "learning_rate": 3.6737248118125564e-05, "loss": 0.5747, "num_input_tokens_seen": 31966504, "step": 55055 }, { "epoch": 8.200774501042597, "grad_norm": 1.4950588941574097, "learning_rate": 3.673437898186279e-05, "loss": 0.4987, "num_input_tokens_seen": 31969256, "step": 55060 }, { "epoch": 8.201519213583557, "grad_norm": 1.5122781991958618, "learning_rate": 3.6731509647359604e-05, "loss": 0.5679, "num_input_tokens_seen": 31972232, "step": 55065 }, { "epoch": 8.202263926124516, "grad_norm": 1.2617005109786987, "learning_rate": 3.6728640114664485e-05, "loss": 0.6925, "num_input_tokens_seen": 31975176, "step": 55070 }, { "epoch": 8.203008638665475, "grad_norm": 1.4700164794921875, "learning_rate": 3.672577038382592e-05, "loss": 0.5101, "num_input_tokens_seen": 31977992, "step": 55075 }, { "epoch": 8.203753351206434, "grad_norm": 2.148236036300659, "learning_rate": 3.672290045489238e-05, "loss": 0.5786, "num_input_tokens_seen": 31980680, "step": 55080 }, { "epoch": 8.204498063747394, "grad_norm": 1.4996819496154785, "learning_rate": 3.672003032791235e-05, "loss": 0.6321, "num_input_tokens_seen": 31983432, "step": 55085 }, { "epoch": 8.205242776288353, "grad_norm": 1.799023985862732, "learning_rate": 3.671716000293432e-05, "loss": 0.6512, "num_input_tokens_seen": 31986376, "step": 55090 }, { "epoch": 8.205987488829312, "grad_norm": 0.9046322703361511, "learning_rate": 3.671428948000677e-05, "loss": 0.6725, "num_input_tokens_seen": 31989032, "step": 55095 }, { "epoch": 8.20673220137027, "grad_norm": 1.5529935359954834, "learning_rate": 3.671141875917822e-05, "loss": 0.4629, "num_input_tokens_seen": 31991784, "step": 55100 }, { "epoch": 8.207476913911231, "grad_norm": 1.6363624334335327, "learning_rate": 3.6708547840497144e-05, "loss": 0.5424, "num_input_tokens_seen": 31994504, "step": 55105 }, { "epoch": 8.20822162645219, "grad_norm": 1.108066201210022, "learning_rate": 3.6705676724012055e-05, "loss": 0.5199, "num_input_tokens_seen": 31997576, "step": 55110 }, { "epoch": 8.208966338993148, "grad_norm": 0.9262566566467285, "learning_rate": 3.670280540977145e-05, "loss": 0.4437, "num_input_tokens_seen": 32000360, "step": 55115 }, { "epoch": 8.209711051534107, "grad_norm": 0.9175792932510376, "learning_rate": 3.669993389782385e-05, "loss": 0.4326, "num_input_tokens_seen": 32003080, "step": 55120 }, { "epoch": 8.210455764075068, "grad_norm": 0.5274667739868164, "learning_rate": 3.669706218821776e-05, "loss": 0.6409, "num_input_tokens_seen": 32006248, "step": 55125 }, { "epoch": 8.211200476616026, "grad_norm": 1.671579360961914, "learning_rate": 3.669419028100169e-05, "loss": 0.6207, "num_input_tokens_seen": 32009192, "step": 55130 }, { "epoch": 8.211945189156985, "grad_norm": 2.151970863342285, "learning_rate": 3.6691318176224156e-05, "loss": 0.7043, "num_input_tokens_seen": 32011848, "step": 55135 }, { "epoch": 8.212689901697944, "grad_norm": 1.295757532119751, "learning_rate": 3.6688445873933686e-05, "loss": 0.5062, "num_input_tokens_seen": 32015016, "step": 55140 }, { "epoch": 8.213434614238905, "grad_norm": 1.3291378021240234, "learning_rate": 3.66855733741788e-05, "loss": 0.7085, "num_input_tokens_seen": 32018344, "step": 55145 }, { "epoch": 8.214179326779863, "grad_norm": 1.278983235359192, "learning_rate": 3.668270067700803e-05, "loss": 0.518, "num_input_tokens_seen": 32021160, "step": 55150 }, { "epoch": 8.214924039320822, "grad_norm": 1.610304355621338, "learning_rate": 3.667982778246991e-05, "loss": 0.7147, "num_input_tokens_seen": 32024008, "step": 55155 }, { "epoch": 8.21566875186178, "grad_norm": 1.7885733842849731, "learning_rate": 3.6676954690612974e-05, "loss": 0.752, "num_input_tokens_seen": 32026760, "step": 55160 }, { "epoch": 8.216413464402741, "grad_norm": 0.7860882878303528, "learning_rate": 3.6674081401485746e-05, "loss": 0.4117, "num_input_tokens_seen": 32029928, "step": 55165 }, { "epoch": 8.2171581769437, "grad_norm": 0.587333083152771, "learning_rate": 3.667120791513678e-05, "loss": 0.5328, "num_input_tokens_seen": 32032680, "step": 55170 }, { "epoch": 8.217902889484659, "grad_norm": 1.2987565994262695, "learning_rate": 3.666833423161462e-05, "loss": 0.5896, "num_input_tokens_seen": 32035592, "step": 55175 }, { "epoch": 8.218647602025618, "grad_norm": 1.7607377767562866, "learning_rate": 3.666546035096781e-05, "loss": 0.7753, "num_input_tokens_seen": 32038696, "step": 55180 }, { "epoch": 8.219392314566578, "grad_norm": 1.3083417415618896, "learning_rate": 3.6662586273244906e-05, "loss": 0.6162, "num_input_tokens_seen": 32041608, "step": 55185 }, { "epoch": 8.220137027107537, "grad_norm": 0.7292110323905945, "learning_rate": 3.665971199849447e-05, "loss": 0.6596, "num_input_tokens_seen": 32044616, "step": 55190 }, { "epoch": 8.220881739648496, "grad_norm": 1.209324598312378, "learning_rate": 3.665683752676503e-05, "loss": 0.5723, "num_input_tokens_seen": 32047240, "step": 55195 }, { "epoch": 8.221626452189454, "grad_norm": 0.9189968109130859, "learning_rate": 3.665396285810519e-05, "loss": 0.5641, "num_input_tokens_seen": 32050184, "step": 55200 }, { "epoch": 8.222371164730415, "grad_norm": 1.0374596118927002, "learning_rate": 3.665108799256348e-05, "loss": 0.7076, "num_input_tokens_seen": 32052776, "step": 55205 }, { "epoch": 8.223115877271374, "grad_norm": 0.8603909015655518, "learning_rate": 3.6648212930188474e-05, "loss": 0.4028, "num_input_tokens_seen": 32055496, "step": 55210 }, { "epoch": 8.223860589812332, "grad_norm": 1.8373745679855347, "learning_rate": 3.664533767102876e-05, "loss": 0.6577, "num_input_tokens_seen": 32058472, "step": 55215 }, { "epoch": 8.224605302353291, "grad_norm": 1.2390859127044678, "learning_rate": 3.664246221513289e-05, "loss": 0.5924, "num_input_tokens_seen": 32061256, "step": 55220 }, { "epoch": 8.22535001489425, "grad_norm": 1.6311759948730469, "learning_rate": 3.663958656254947e-05, "loss": 0.6611, "num_input_tokens_seen": 32064040, "step": 55225 }, { "epoch": 8.22609472743521, "grad_norm": 1.2452261447906494, "learning_rate": 3.663671071332705e-05, "loss": 0.6336, "num_input_tokens_seen": 32066792, "step": 55230 }, { "epoch": 8.22683943997617, "grad_norm": 1.6706839799880981, "learning_rate": 3.6633834667514236e-05, "loss": 0.6369, "num_input_tokens_seen": 32069832, "step": 55235 }, { "epoch": 8.227584152517128, "grad_norm": 1.2652297019958496, "learning_rate": 3.663095842515961e-05, "loss": 0.5334, "num_input_tokens_seen": 32072552, "step": 55240 }, { "epoch": 8.228328865058087, "grad_norm": 1.0291732549667358, "learning_rate": 3.662808198631176e-05, "loss": 0.6821, "num_input_tokens_seen": 32075304, "step": 55245 }, { "epoch": 8.229073577599047, "grad_norm": 1.7153383493423462, "learning_rate": 3.662520535101928e-05, "loss": 0.7073, "num_input_tokens_seen": 32078216, "step": 55250 }, { "epoch": 8.229818290140006, "grad_norm": 1.8511459827423096, "learning_rate": 3.662232851933079e-05, "loss": 0.5497, "num_input_tokens_seen": 32081256, "step": 55255 }, { "epoch": 8.230563002680965, "grad_norm": 1.426323413848877, "learning_rate": 3.661945149129485e-05, "loss": 0.5794, "num_input_tokens_seen": 32084488, "step": 55260 }, { "epoch": 8.231307715221924, "grad_norm": 2.4898102283477783, "learning_rate": 3.661657426696009e-05, "loss": 0.7149, "num_input_tokens_seen": 32087368, "step": 55265 }, { "epoch": 8.232052427762884, "grad_norm": 3.047057867050171, "learning_rate": 3.6613696846375115e-05, "loss": 0.8114, "num_input_tokens_seen": 32090440, "step": 55270 }, { "epoch": 8.232797140303843, "grad_norm": 1.4191588163375854, "learning_rate": 3.661081922958854e-05, "loss": 0.7689, "num_input_tokens_seen": 32093160, "step": 55275 }, { "epoch": 8.233541852844802, "grad_norm": 1.7477915287017822, "learning_rate": 3.660794141664898e-05, "loss": 0.7073, "num_input_tokens_seen": 32095752, "step": 55280 }, { "epoch": 8.23428656538576, "grad_norm": 0.9619432687759399, "learning_rate": 3.660506340760504e-05, "loss": 0.4707, "num_input_tokens_seen": 32098728, "step": 55285 }, { "epoch": 8.235031277926721, "grad_norm": 0.8203403949737549, "learning_rate": 3.660218520250535e-05, "loss": 0.5868, "num_input_tokens_seen": 32101352, "step": 55290 }, { "epoch": 8.23577599046768, "grad_norm": 0.8105365633964539, "learning_rate": 3.659930680139853e-05, "loss": 0.6076, "num_input_tokens_seen": 32104200, "step": 55295 }, { "epoch": 8.236520703008638, "grad_norm": 1.117788553237915, "learning_rate": 3.659642820433322e-05, "loss": 0.4324, "num_input_tokens_seen": 32107048, "step": 55300 }, { "epoch": 8.237265415549597, "grad_norm": 1.3125171661376953, "learning_rate": 3.659354941135803e-05, "loss": 0.615, "num_input_tokens_seen": 32109928, "step": 55305 }, { "epoch": 8.238010128090558, "grad_norm": 1.1320092678070068, "learning_rate": 3.65906704225216e-05, "loss": 0.537, "num_input_tokens_seen": 32112808, "step": 55310 }, { "epoch": 8.238754840631517, "grad_norm": 0.8580145835876465, "learning_rate": 3.658779123787259e-05, "loss": 0.4971, "num_input_tokens_seen": 32115432, "step": 55315 }, { "epoch": 8.239499553172475, "grad_norm": 2.3096892833709717, "learning_rate": 3.6584911857459624e-05, "loss": 0.8082, "num_input_tokens_seen": 32118376, "step": 55320 }, { "epoch": 8.240244265713434, "grad_norm": 1.020818829536438, "learning_rate": 3.6582032281331345e-05, "loss": 0.6009, "num_input_tokens_seen": 32121032, "step": 55325 }, { "epoch": 8.240988978254395, "grad_norm": 2.070063591003418, "learning_rate": 3.6579152509536395e-05, "loss": 0.5274, "num_input_tokens_seen": 32123880, "step": 55330 }, { "epoch": 8.241733690795353, "grad_norm": 1.083892822265625, "learning_rate": 3.6576272542123435e-05, "loss": 0.594, "num_input_tokens_seen": 32126536, "step": 55335 }, { "epoch": 8.242478403336312, "grad_norm": 1.2324329614639282, "learning_rate": 3.657339237914111e-05, "loss": 0.6897, "num_input_tokens_seen": 32129576, "step": 55340 }, { "epoch": 8.24322311587727, "grad_norm": 2.0092763900756836, "learning_rate": 3.657051202063809e-05, "loss": 0.6945, "num_input_tokens_seen": 32132296, "step": 55345 }, { "epoch": 8.243967828418231, "grad_norm": 1.46113920211792, "learning_rate": 3.656763146666303e-05, "loss": 0.6084, "num_input_tokens_seen": 32135144, "step": 55350 }, { "epoch": 8.24471254095919, "grad_norm": 1.4556963443756104, "learning_rate": 3.6564750717264595e-05, "loss": 0.6004, "num_input_tokens_seen": 32138280, "step": 55355 }, { "epoch": 8.245457253500149, "grad_norm": 0.8162611126899719, "learning_rate": 3.656186977249145e-05, "loss": 0.4555, "num_input_tokens_seen": 32141160, "step": 55360 }, { "epoch": 8.246201966041108, "grad_norm": 1.4200701713562012, "learning_rate": 3.655898863239226e-05, "loss": 0.6701, "num_input_tokens_seen": 32144232, "step": 55365 }, { "epoch": 8.246946678582066, "grad_norm": 1.5558385848999023, "learning_rate": 3.655610729701571e-05, "loss": 0.7087, "num_input_tokens_seen": 32147272, "step": 55370 }, { "epoch": 8.247691391123027, "grad_norm": 1.0093121528625488, "learning_rate": 3.655322576641047e-05, "loss": 0.5957, "num_input_tokens_seen": 32149992, "step": 55375 }, { "epoch": 8.248436103663986, "grad_norm": 1.4790817499160767, "learning_rate": 3.655034404062522e-05, "loss": 0.6716, "num_input_tokens_seen": 32153032, "step": 55380 }, { "epoch": 8.249180816204944, "grad_norm": 1.545509696006775, "learning_rate": 3.654746211970865e-05, "loss": 0.729, "num_input_tokens_seen": 32155688, "step": 55385 }, { "epoch": 8.249925528745903, "grad_norm": 2.1936094760894775, "learning_rate": 3.654458000370945e-05, "loss": 0.6758, "num_input_tokens_seen": 32158504, "step": 55390 }, { "epoch": 8.250670241286864, "grad_norm": 1.3195137977600098, "learning_rate": 3.65416976926763e-05, "loss": 0.4888, "num_input_tokens_seen": 32161320, "step": 55395 }, { "epoch": 8.251414953827823, "grad_norm": 1.3246349096298218, "learning_rate": 3.653881518665789e-05, "loss": 0.752, "num_input_tokens_seen": 32164200, "step": 55400 }, { "epoch": 8.252159666368781, "grad_norm": 1.0685964822769165, "learning_rate": 3.6535932485702933e-05, "loss": 0.585, "num_input_tokens_seen": 32167176, "step": 55405 }, { "epoch": 8.25290437890974, "grad_norm": 1.687681794166565, "learning_rate": 3.6533049589860115e-05, "loss": 0.4785, "num_input_tokens_seen": 32169896, "step": 55410 }, { "epoch": 8.2536490914507, "grad_norm": 1.3299875259399414, "learning_rate": 3.653016649917815e-05, "loss": 0.8198, "num_input_tokens_seen": 32172680, "step": 55415 }, { "epoch": 8.25439380399166, "grad_norm": 1.3324673175811768, "learning_rate": 3.652728321370575e-05, "loss": 0.5438, "num_input_tokens_seen": 32175816, "step": 55420 }, { "epoch": 8.255138516532618, "grad_norm": 1.6165330410003662, "learning_rate": 3.6524399733491605e-05, "loss": 0.627, "num_input_tokens_seen": 32178600, "step": 55425 }, { "epoch": 8.255883229073577, "grad_norm": 1.6440527439117432, "learning_rate": 3.652151605858444e-05, "loss": 0.6065, "num_input_tokens_seen": 32181448, "step": 55430 }, { "epoch": 8.256627941614537, "grad_norm": 0.8320053815841675, "learning_rate": 3.651863218903297e-05, "loss": 0.4945, "num_input_tokens_seen": 32184104, "step": 55435 }, { "epoch": 8.257372654155496, "grad_norm": 2.352593183517456, "learning_rate": 3.651574812488592e-05, "loss": 0.6532, "num_input_tokens_seen": 32187112, "step": 55440 }, { "epoch": 8.258117366696455, "grad_norm": 0.9384815096855164, "learning_rate": 3.651286386619201e-05, "loss": 0.5121, "num_input_tokens_seen": 32190184, "step": 55445 }, { "epoch": 8.258862079237414, "grad_norm": 1.2126009464263916, "learning_rate": 3.650997941299996e-05, "loss": 0.6397, "num_input_tokens_seen": 32193128, "step": 55450 }, { "epoch": 8.259606791778374, "grad_norm": 1.5840556621551514, "learning_rate": 3.650709476535852e-05, "loss": 0.6205, "num_input_tokens_seen": 32196104, "step": 55455 }, { "epoch": 8.260351504319333, "grad_norm": 1.600479006767273, "learning_rate": 3.6504209923316396e-05, "loss": 0.7136, "num_input_tokens_seen": 32198888, "step": 55460 }, { "epoch": 8.261096216860292, "grad_norm": 0.588032066822052, "learning_rate": 3.650132488692234e-05, "loss": 0.7104, "num_input_tokens_seen": 32201544, "step": 55465 }, { "epoch": 8.26184092940125, "grad_norm": 0.8833235502243042, "learning_rate": 3.649843965622509e-05, "loss": 0.801, "num_input_tokens_seen": 32204552, "step": 55470 }, { "epoch": 8.262585641942211, "grad_norm": 1.0550408363342285, "learning_rate": 3.6495554231273386e-05, "loss": 0.592, "num_input_tokens_seen": 32207336, "step": 55475 }, { "epoch": 8.26333035448317, "grad_norm": 2.5808260440826416, "learning_rate": 3.6492668612115986e-05, "loss": 0.7168, "num_input_tokens_seen": 32210184, "step": 55480 }, { "epoch": 8.264075067024129, "grad_norm": 2.9107978343963623, "learning_rate": 3.648978279880162e-05, "loss": 0.6164, "num_input_tokens_seen": 32213064, "step": 55485 }, { "epoch": 8.264819779565087, "grad_norm": 1.3157747983932495, "learning_rate": 3.648689679137906e-05, "loss": 0.5826, "num_input_tokens_seen": 32216008, "step": 55490 }, { "epoch": 8.265564492106048, "grad_norm": 1.2503395080566406, "learning_rate": 3.648401058989705e-05, "loss": 0.5773, "num_input_tokens_seen": 32219176, "step": 55495 }, { "epoch": 8.266309204647007, "grad_norm": 2.492570400238037, "learning_rate": 3.6481124194404345e-05, "loss": 0.7144, "num_input_tokens_seen": 32222664, "step": 55500 }, { "epoch": 8.267053917187965, "grad_norm": 0.9492571949958801, "learning_rate": 3.6478237604949725e-05, "loss": 0.5606, "num_input_tokens_seen": 32225416, "step": 55505 }, { "epoch": 8.267798629728924, "grad_norm": 0.9074742794036865, "learning_rate": 3.647535082158194e-05, "loss": 0.6528, "num_input_tokens_seen": 32228584, "step": 55510 }, { "epoch": 8.268543342269885, "grad_norm": 1.246949553489685, "learning_rate": 3.6472463844349776e-05, "loss": 0.6972, "num_input_tokens_seen": 32231368, "step": 55515 }, { "epoch": 8.269288054810843, "grad_norm": 1.311848759651184, "learning_rate": 3.646957667330199e-05, "loss": 0.6164, "num_input_tokens_seen": 32234344, "step": 55520 }, { "epoch": 8.270032767351802, "grad_norm": 1.2423880100250244, "learning_rate": 3.6466689308487364e-05, "loss": 0.7221, "num_input_tokens_seen": 32237416, "step": 55525 }, { "epoch": 8.270777479892761, "grad_norm": 1.180202603340149, "learning_rate": 3.646380174995468e-05, "loss": 0.7209, "num_input_tokens_seen": 32240584, "step": 55530 }, { "epoch": 8.271522192433721, "grad_norm": 0.9626858830451965, "learning_rate": 3.646091399775271e-05, "loss": 0.5194, "num_input_tokens_seen": 32243528, "step": 55535 }, { "epoch": 8.27226690497468, "grad_norm": 0.9527597427368164, "learning_rate": 3.645802605193025e-05, "loss": 0.5509, "num_input_tokens_seen": 32246472, "step": 55540 }, { "epoch": 8.273011617515639, "grad_norm": 1.4683040380477905, "learning_rate": 3.645513791253608e-05, "loss": 0.6554, "num_input_tokens_seen": 32249576, "step": 55545 }, { "epoch": 8.273756330056598, "grad_norm": 1.8298540115356445, "learning_rate": 3.6452249579619005e-05, "loss": 0.6826, "num_input_tokens_seen": 32252104, "step": 55550 }, { "epoch": 8.274501042597556, "grad_norm": 1.3602612018585205, "learning_rate": 3.644936105322781e-05, "loss": 0.6163, "num_input_tokens_seen": 32255144, "step": 55555 }, { "epoch": 8.275245755138517, "grad_norm": 1.110654354095459, "learning_rate": 3.644647233341129e-05, "loss": 0.6278, "num_input_tokens_seen": 32257832, "step": 55560 }, { "epoch": 8.275990467679476, "grad_norm": 1.4562078714370728, "learning_rate": 3.644358342021826e-05, "loss": 0.838, "num_input_tokens_seen": 32260616, "step": 55565 }, { "epoch": 8.276735180220435, "grad_norm": 1.9863505363464355, "learning_rate": 3.644069431369752e-05, "loss": 0.7999, "num_input_tokens_seen": 32263240, "step": 55570 }, { "epoch": 8.277479892761393, "grad_norm": 1.399807333946228, "learning_rate": 3.643780501389787e-05, "loss": 0.5993, "num_input_tokens_seen": 32266408, "step": 55575 }, { "epoch": 8.278224605302354, "grad_norm": 0.9617574214935303, "learning_rate": 3.643491552086814e-05, "loss": 0.5558, "num_input_tokens_seen": 32269384, "step": 55580 }, { "epoch": 8.278969317843313, "grad_norm": 2.0962936878204346, "learning_rate": 3.643202583465713e-05, "loss": 0.9024, "num_input_tokens_seen": 32272392, "step": 55585 }, { "epoch": 8.279714030384271, "grad_norm": 1.3139477968215942, "learning_rate": 3.6429135955313664e-05, "loss": 0.648, "num_input_tokens_seen": 32275176, "step": 55590 }, { "epoch": 8.28045874292523, "grad_norm": 1.2230552434921265, "learning_rate": 3.6426245882886554e-05, "loss": 0.5322, "num_input_tokens_seen": 32278376, "step": 55595 }, { "epoch": 8.28120345546619, "grad_norm": 1.245516300201416, "learning_rate": 3.6423355617424634e-05, "loss": 0.5102, "num_input_tokens_seen": 32281096, "step": 55600 }, { "epoch": 8.28194816800715, "grad_norm": 1.3233033418655396, "learning_rate": 3.6420465158976746e-05, "loss": 0.4136, "num_input_tokens_seen": 32283848, "step": 55605 }, { "epoch": 8.282692880548108, "grad_norm": 1.086153507232666, "learning_rate": 3.6417574507591694e-05, "loss": 0.6367, "num_input_tokens_seen": 32286856, "step": 55610 }, { "epoch": 8.283437593089067, "grad_norm": 2.451261043548584, "learning_rate": 3.641468366331833e-05, "loss": 0.7153, "num_input_tokens_seen": 32289832, "step": 55615 }, { "epoch": 8.284182305630027, "grad_norm": 1.0759657621383667, "learning_rate": 3.641179262620548e-05, "loss": 0.7308, "num_input_tokens_seen": 32292616, "step": 55620 }, { "epoch": 8.284927018170986, "grad_norm": 0.9600307941436768, "learning_rate": 3.6408901396301995e-05, "loss": 0.6433, "num_input_tokens_seen": 32295752, "step": 55625 }, { "epoch": 8.285671730711945, "grad_norm": 1.2242408990859985, "learning_rate": 3.640600997365672e-05, "loss": 0.6365, "num_input_tokens_seen": 32298664, "step": 55630 }, { "epoch": 8.286416443252904, "grad_norm": 1.0836313962936401, "learning_rate": 3.64031183583185e-05, "loss": 0.608, "num_input_tokens_seen": 32301480, "step": 55635 }, { "epoch": 8.287161155793864, "grad_norm": 1.3927576541900635, "learning_rate": 3.6400226550336166e-05, "loss": 0.5335, "num_input_tokens_seen": 32304296, "step": 55640 }, { "epoch": 8.287905868334823, "grad_norm": 1.937649130821228, "learning_rate": 3.6397334549758614e-05, "loss": 0.6423, "num_input_tokens_seen": 32307336, "step": 55645 }, { "epoch": 8.288650580875782, "grad_norm": 0.6369113326072693, "learning_rate": 3.639444235663467e-05, "loss": 0.6131, "num_input_tokens_seen": 32310408, "step": 55650 }, { "epoch": 8.28939529341674, "grad_norm": 1.1684973239898682, "learning_rate": 3.6391549971013213e-05, "loss": 0.5774, "num_input_tokens_seen": 32313288, "step": 55655 }, { "epoch": 8.290140005957701, "grad_norm": 0.9750974774360657, "learning_rate": 3.638865739294308e-05, "loss": 0.428, "num_input_tokens_seen": 32316648, "step": 55660 }, { "epoch": 8.29088471849866, "grad_norm": 1.1528805494308472, "learning_rate": 3.6385764622473164e-05, "loss": 0.6492, "num_input_tokens_seen": 32319752, "step": 55665 }, { "epoch": 8.291629431039619, "grad_norm": 1.1720712184906006, "learning_rate": 3.6382871659652326e-05, "loss": 0.7137, "num_input_tokens_seen": 32322568, "step": 55670 }, { "epoch": 8.292374143580577, "grad_norm": 0.783004641532898, "learning_rate": 3.6379978504529436e-05, "loss": 0.6698, "num_input_tokens_seen": 32325544, "step": 55675 }, { "epoch": 8.293118856121538, "grad_norm": 0.8458743691444397, "learning_rate": 3.637708515715338e-05, "loss": 0.4688, "num_input_tokens_seen": 32328840, "step": 55680 }, { "epoch": 8.293863568662497, "grad_norm": 1.423401951789856, "learning_rate": 3.637419161757304e-05, "loss": 0.6217, "num_input_tokens_seen": 32331688, "step": 55685 }, { "epoch": 8.294608281203455, "grad_norm": 1.4304099082946777, "learning_rate": 3.637129788583729e-05, "loss": 0.5333, "num_input_tokens_seen": 32334504, "step": 55690 }, { "epoch": 8.295352993744414, "grad_norm": 3.047764778137207, "learning_rate": 3.6368403961995014e-05, "loss": 0.6872, "num_input_tokens_seen": 32337672, "step": 55695 }, { "epoch": 8.296097706285375, "grad_norm": 1.3290554285049438, "learning_rate": 3.63655098460951e-05, "loss": 0.5074, "num_input_tokens_seen": 32340360, "step": 55700 }, { "epoch": 8.296842418826333, "grad_norm": 3.329023599624634, "learning_rate": 3.636261553818646e-05, "loss": 0.9005, "num_input_tokens_seen": 32343400, "step": 55705 }, { "epoch": 8.297587131367292, "grad_norm": 1.6090950965881348, "learning_rate": 3.6359721038317976e-05, "loss": 0.5836, "num_input_tokens_seen": 32346184, "step": 55710 }, { "epoch": 8.298331843908251, "grad_norm": 0.9657542705535889, "learning_rate": 3.6356826346538555e-05, "loss": 0.6238, "num_input_tokens_seen": 32349032, "step": 55715 }, { "epoch": 8.299076556449211, "grad_norm": 1.1776455640792847, "learning_rate": 3.635393146289709e-05, "loss": 0.6056, "num_input_tokens_seen": 32352264, "step": 55720 }, { "epoch": 8.29982126899017, "grad_norm": 1.7549415826797485, "learning_rate": 3.6351036387442496e-05, "loss": 0.7533, "num_input_tokens_seen": 32355272, "step": 55725 }, { "epoch": 8.300565981531129, "grad_norm": 1.5733563899993896, "learning_rate": 3.634814112022368e-05, "loss": 0.6431, "num_input_tokens_seen": 32357992, "step": 55730 }, { "epoch": 8.301310694072088, "grad_norm": 1.4353063106536865, "learning_rate": 3.634524566128955e-05, "loss": 0.6822, "num_input_tokens_seen": 32360744, "step": 55735 }, { "epoch": 8.302055406613047, "grad_norm": 1.600716233253479, "learning_rate": 3.6342350010689017e-05, "loss": 0.5444, "num_input_tokens_seen": 32363528, "step": 55740 }, { "epoch": 8.302800119154007, "grad_norm": 1.3581945896148682, "learning_rate": 3.633945416847102e-05, "loss": 0.6545, "num_input_tokens_seen": 32366344, "step": 55745 }, { "epoch": 8.303544831694966, "grad_norm": 1.3033357858657837, "learning_rate": 3.6336558134684465e-05, "loss": 0.6466, "num_input_tokens_seen": 32369160, "step": 55750 }, { "epoch": 8.304289544235925, "grad_norm": 1.095936894416809, "learning_rate": 3.6333661909378286e-05, "loss": 0.4409, "num_input_tokens_seen": 32371976, "step": 55755 }, { "epoch": 8.305034256776883, "grad_norm": 3.079974412918091, "learning_rate": 3.63307654926014e-05, "loss": 0.7181, "num_input_tokens_seen": 32374696, "step": 55760 }, { "epoch": 8.305778969317844, "grad_norm": 1.0968812704086304, "learning_rate": 3.632786888440276e-05, "loss": 0.6753, "num_input_tokens_seen": 32377448, "step": 55765 }, { "epoch": 8.306523681858803, "grad_norm": 1.2544876337051392, "learning_rate": 3.6324972084831284e-05, "loss": 0.7231, "num_input_tokens_seen": 32380264, "step": 55770 }, { "epoch": 8.307268394399761, "grad_norm": 1.0595886707305908, "learning_rate": 3.632207509393591e-05, "loss": 0.5066, "num_input_tokens_seen": 32383208, "step": 55775 }, { "epoch": 8.30801310694072, "grad_norm": 1.523174524307251, "learning_rate": 3.6319177911765583e-05, "loss": 0.6654, "num_input_tokens_seen": 32385960, "step": 55780 }, { "epoch": 8.30875781948168, "grad_norm": 1.1675854921340942, "learning_rate": 3.631628053836926e-05, "loss": 0.7628, "num_input_tokens_seen": 32388680, "step": 55785 }, { "epoch": 8.30950253202264, "grad_norm": 1.8658872842788696, "learning_rate": 3.631338297379587e-05, "loss": 0.7858, "num_input_tokens_seen": 32391336, "step": 55790 }, { "epoch": 8.310247244563598, "grad_norm": 1.5632402896881104, "learning_rate": 3.631048521809437e-05, "loss": 0.6765, "num_input_tokens_seen": 32394344, "step": 55795 }, { "epoch": 8.310991957104557, "grad_norm": 0.9238269925117493, "learning_rate": 3.630758727131373e-05, "loss": 0.58, "num_input_tokens_seen": 32397288, "step": 55800 }, { "epoch": 8.311736669645517, "grad_norm": 1.1474677324295044, "learning_rate": 3.6304689133502884e-05, "loss": 0.6135, "num_input_tokens_seen": 32400040, "step": 55805 }, { "epoch": 8.312481382186476, "grad_norm": 1.3834781646728516, "learning_rate": 3.630179080471081e-05, "loss": 0.5012, "num_input_tokens_seen": 32403016, "step": 55810 }, { "epoch": 8.313226094727435, "grad_norm": 0.9489099979400635, "learning_rate": 3.629889228498646e-05, "loss": 0.5076, "num_input_tokens_seen": 32405992, "step": 55815 }, { "epoch": 8.313970807268394, "grad_norm": 1.070137858390808, "learning_rate": 3.629599357437882e-05, "loss": 0.8023, "num_input_tokens_seen": 32408904, "step": 55820 }, { "epoch": 8.314715519809354, "grad_norm": 0.9723570942878723, "learning_rate": 3.629309467293685e-05, "loss": 0.4091, "num_input_tokens_seen": 32411720, "step": 55825 }, { "epoch": 8.315460232350313, "grad_norm": 1.075986385345459, "learning_rate": 3.6290195580709505e-05, "loss": 0.6154, "num_input_tokens_seen": 32415016, "step": 55830 }, { "epoch": 8.316204944891272, "grad_norm": 1.3169870376586914, "learning_rate": 3.62872962977458e-05, "loss": 0.577, "num_input_tokens_seen": 32417896, "step": 55835 }, { "epoch": 8.31694965743223, "grad_norm": 1.6327790021896362, "learning_rate": 3.628439682409468e-05, "loss": 0.5223, "num_input_tokens_seen": 32420936, "step": 55840 }, { "epoch": 8.317694369973191, "grad_norm": 1.1649965047836304, "learning_rate": 3.628149715980516e-05, "loss": 0.6002, "num_input_tokens_seen": 32423784, "step": 55845 }, { "epoch": 8.31843908251415, "grad_norm": 1.497002363204956, "learning_rate": 3.62785973049262e-05, "loss": 0.7081, "num_input_tokens_seen": 32426536, "step": 55850 }, { "epoch": 8.319183795055109, "grad_norm": 1.1213077306747437, "learning_rate": 3.627569725950681e-05, "loss": 0.611, "num_input_tokens_seen": 32429512, "step": 55855 }, { "epoch": 8.319928507596067, "grad_norm": 0.948422908782959, "learning_rate": 3.6272797023595974e-05, "loss": 0.7379, "num_input_tokens_seen": 32432456, "step": 55860 }, { "epoch": 8.320673220137028, "grad_norm": 1.3242584466934204, "learning_rate": 3.626989659724268e-05, "loss": 0.5034, "num_input_tokens_seen": 32435656, "step": 55865 }, { "epoch": 8.321417932677987, "grad_norm": 1.1286888122558594, "learning_rate": 3.626699598049594e-05, "loss": 0.6459, "num_input_tokens_seen": 32438472, "step": 55870 }, { "epoch": 8.322162645218945, "grad_norm": 0.8971600532531738, "learning_rate": 3.626409517340476e-05, "loss": 0.5254, "num_input_tokens_seen": 32441416, "step": 55875 }, { "epoch": 8.322907357759904, "grad_norm": 1.4452956914901733, "learning_rate": 3.626119417601814e-05, "loss": 0.5974, "num_input_tokens_seen": 32443976, "step": 55880 }, { "epoch": 8.323652070300863, "grad_norm": 1.9748072624206543, "learning_rate": 3.625829298838509e-05, "loss": 0.7967, "num_input_tokens_seen": 32446760, "step": 55885 }, { "epoch": 8.324396782841823, "grad_norm": 1.5756092071533203, "learning_rate": 3.6255391610554624e-05, "loss": 0.6163, "num_input_tokens_seen": 32449640, "step": 55890 }, { "epoch": 8.325141495382782, "grad_norm": 2.116405963897705, "learning_rate": 3.625249004257575e-05, "loss": 0.5633, "num_input_tokens_seen": 32452488, "step": 55895 }, { "epoch": 8.325886207923741, "grad_norm": 1.186643362045288, "learning_rate": 3.6249588284497496e-05, "loss": 0.5461, "num_input_tokens_seen": 32455336, "step": 55900 }, { "epoch": 8.3266309204647, "grad_norm": 1.0517957210540771, "learning_rate": 3.624668633636888e-05, "loss": 0.4695, "num_input_tokens_seen": 32457864, "step": 55905 }, { "epoch": 8.32737563300566, "grad_norm": 0.9801339507102966, "learning_rate": 3.624378419823893e-05, "loss": 0.565, "num_input_tokens_seen": 32460808, "step": 55910 }, { "epoch": 8.328120345546619, "grad_norm": 0.9950340986251831, "learning_rate": 3.624088187015668e-05, "loss": 0.7032, "num_input_tokens_seen": 32463912, "step": 55915 }, { "epoch": 8.328865058087578, "grad_norm": 1.7668439149856567, "learning_rate": 3.623797935217115e-05, "loss": 0.6576, "num_input_tokens_seen": 32467080, "step": 55920 }, { "epoch": 8.329609770628537, "grad_norm": 1.2710645198822021, "learning_rate": 3.623507664433138e-05, "loss": 0.5664, "num_input_tokens_seen": 32470152, "step": 55925 }, { "epoch": 8.330354483169497, "grad_norm": 1.2538671493530273, "learning_rate": 3.6232173746686405e-05, "loss": 0.7294, "num_input_tokens_seen": 32473000, "step": 55930 }, { "epoch": 8.331099195710456, "grad_norm": 1.996545433998108, "learning_rate": 3.6229270659285276e-05, "loss": 0.79, "num_input_tokens_seen": 32475688, "step": 55935 }, { "epoch": 8.331843908251415, "grad_norm": 1.4215850830078125, "learning_rate": 3.622636738217703e-05, "loss": 0.75, "num_input_tokens_seen": 32478600, "step": 55940 }, { "epoch": 8.332588620792373, "grad_norm": 0.7312946915626526, "learning_rate": 3.6223463915410714e-05, "loss": 0.538, "num_input_tokens_seen": 32481736, "step": 55945 }, { "epoch": 8.333333333333334, "grad_norm": 1.278405785560608, "learning_rate": 3.622056025903539e-05, "loss": 0.6368, "num_input_tokens_seen": 32484744, "step": 55950 }, { "epoch": 8.334078045874293, "grad_norm": 1.4106537103652954, "learning_rate": 3.62176564131001e-05, "loss": 0.6803, "num_input_tokens_seen": 32487688, "step": 55955 }, { "epoch": 8.334822758415251, "grad_norm": 1.6864619255065918, "learning_rate": 3.62147523776539e-05, "loss": 0.6798, "num_input_tokens_seen": 32490280, "step": 55960 }, { "epoch": 8.33556747095621, "grad_norm": 1.154311180114746, "learning_rate": 3.621184815274587e-05, "loss": 0.7033, "num_input_tokens_seen": 32493192, "step": 55965 }, { "epoch": 8.33631218349717, "grad_norm": 1.3444151878356934, "learning_rate": 3.620894373842505e-05, "loss": 0.68, "num_input_tokens_seen": 32496040, "step": 55970 }, { "epoch": 8.33705689603813, "grad_norm": 0.8760769367218018, "learning_rate": 3.6206039134740525e-05, "loss": 0.5522, "num_input_tokens_seen": 32499080, "step": 55975 }, { "epoch": 8.337801608579088, "grad_norm": 1.4164613485336304, "learning_rate": 3.620313434174135e-05, "loss": 0.6223, "num_input_tokens_seen": 32501768, "step": 55980 }, { "epoch": 8.338546321120047, "grad_norm": 1.9133405685424805, "learning_rate": 3.6200229359476614e-05, "loss": 0.7254, "num_input_tokens_seen": 32504456, "step": 55985 }, { "epoch": 8.339291033661008, "grad_norm": 0.7381181716918945, "learning_rate": 3.6197324187995384e-05, "loss": 0.53, "num_input_tokens_seen": 32507496, "step": 55990 }, { "epoch": 8.340035746201966, "grad_norm": 1.5220866203308105, "learning_rate": 3.6194418827346746e-05, "loss": 0.719, "num_input_tokens_seen": 32510408, "step": 55995 }, { "epoch": 8.340780458742925, "grad_norm": 1.2427823543548584, "learning_rate": 3.619151327757977e-05, "loss": 0.8312, "num_input_tokens_seen": 32513416, "step": 56000 }, { "epoch": 8.341525171283884, "grad_norm": 1.507125735282898, "learning_rate": 3.6188607538743556e-05, "loss": 0.6498, "num_input_tokens_seen": 32516232, "step": 56005 }, { "epoch": 8.342269883824844, "grad_norm": 1.4027631282806396, "learning_rate": 3.618570161088719e-05, "loss": 0.5889, "num_input_tokens_seen": 32519240, "step": 56010 }, { "epoch": 8.343014596365803, "grad_norm": 0.8172562718391418, "learning_rate": 3.6182795494059764e-05, "loss": 0.6312, "num_input_tokens_seen": 32521896, "step": 56015 }, { "epoch": 8.343759308906762, "grad_norm": 0.8233378529548645, "learning_rate": 3.617988918831038e-05, "loss": 0.5049, "num_input_tokens_seen": 32524968, "step": 56020 }, { "epoch": 8.34450402144772, "grad_norm": 1.7576960325241089, "learning_rate": 3.617698269368812e-05, "loss": 0.6421, "num_input_tokens_seen": 32527848, "step": 56025 }, { "epoch": 8.345248733988681, "grad_norm": 0.9051418900489807, "learning_rate": 3.61740760102421e-05, "loss": 0.6948, "num_input_tokens_seen": 32530760, "step": 56030 }, { "epoch": 8.34599344652964, "grad_norm": 3.1373631954193115, "learning_rate": 3.617116913802143e-05, "loss": 0.6141, "num_input_tokens_seen": 32533512, "step": 56035 }, { "epoch": 8.346738159070599, "grad_norm": 2.1262738704681396, "learning_rate": 3.61682620770752e-05, "loss": 0.6308, "num_input_tokens_seen": 32536168, "step": 56040 }, { "epoch": 8.347482871611557, "grad_norm": 1.144734501838684, "learning_rate": 3.616535482745254e-05, "loss": 0.5887, "num_input_tokens_seen": 32539304, "step": 56045 }, { "epoch": 8.348227584152518, "grad_norm": 1.611254334449768, "learning_rate": 3.616244738920256e-05, "loss": 0.6803, "num_input_tokens_seen": 32542408, "step": 56050 }, { "epoch": 8.348972296693477, "grad_norm": 1.38922917842865, "learning_rate": 3.615953976237438e-05, "loss": 0.6574, "num_input_tokens_seen": 32545448, "step": 56055 }, { "epoch": 8.349717009234435, "grad_norm": 2.024326801300049, "learning_rate": 3.6156631947017106e-05, "loss": 0.5393, "num_input_tokens_seen": 32548072, "step": 56060 }, { "epoch": 8.350461721775394, "grad_norm": 1.0096436738967896, "learning_rate": 3.6153723943179876e-05, "loss": 0.7034, "num_input_tokens_seen": 32550728, "step": 56065 }, { "epoch": 8.351206434316353, "grad_norm": 0.9927430152893066, "learning_rate": 3.6150815750911825e-05, "loss": 0.5608, "num_input_tokens_seen": 32553896, "step": 56070 }, { "epoch": 8.351951146857314, "grad_norm": 3.121839761734009, "learning_rate": 3.614790737026207e-05, "loss": 0.6016, "num_input_tokens_seen": 32556520, "step": 56075 }, { "epoch": 8.352695859398272, "grad_norm": 1.6156659126281738, "learning_rate": 3.614499880127975e-05, "loss": 0.7636, "num_input_tokens_seen": 32559368, "step": 56080 }, { "epoch": 8.353440571939231, "grad_norm": 0.9521852731704712, "learning_rate": 3.6142090044014e-05, "loss": 0.5548, "num_input_tokens_seen": 32561992, "step": 56085 }, { "epoch": 8.35418528448019, "grad_norm": 1.6266738176345825, "learning_rate": 3.613918109851397e-05, "loss": 0.6751, "num_input_tokens_seen": 32564680, "step": 56090 }, { "epoch": 8.35492999702115, "grad_norm": 0.9981505274772644, "learning_rate": 3.613627196482879e-05, "loss": 0.7409, "num_input_tokens_seen": 32567528, "step": 56095 }, { "epoch": 8.35567470956211, "grad_norm": 1.9213591814041138, "learning_rate": 3.613336264300762e-05, "loss": 0.72, "num_input_tokens_seen": 32570184, "step": 56100 }, { "epoch": 8.356419422103068, "grad_norm": 1.422752022743225, "learning_rate": 3.613045313309959e-05, "loss": 0.4793, "num_input_tokens_seen": 32573000, "step": 56105 }, { "epoch": 8.357164134644027, "grad_norm": 1.7591824531555176, "learning_rate": 3.612754343515388e-05, "loss": 0.7101, "num_input_tokens_seen": 32575976, "step": 56110 }, { "epoch": 8.357908847184987, "grad_norm": 0.8574032187461853, "learning_rate": 3.612463354921963e-05, "loss": 0.6854, "num_input_tokens_seen": 32578856, "step": 56115 }, { "epoch": 8.358653559725946, "grad_norm": 1.035593032836914, "learning_rate": 3.6121723475346006e-05, "loss": 0.7707, "num_input_tokens_seen": 32581608, "step": 56120 }, { "epoch": 8.359398272266905, "grad_norm": 2.210437536239624, "learning_rate": 3.6118813213582156e-05, "loss": 0.6455, "num_input_tokens_seen": 32584456, "step": 56125 }, { "epoch": 8.360142984807863, "grad_norm": 0.9375503659248352, "learning_rate": 3.611590276397727e-05, "loss": 0.5177, "num_input_tokens_seen": 32587336, "step": 56130 }, { "epoch": 8.360887697348824, "grad_norm": 1.5457035303115845, "learning_rate": 3.6112992126580505e-05, "loss": 0.7829, "num_input_tokens_seen": 32590664, "step": 56135 }, { "epoch": 8.361632409889783, "grad_norm": 0.7626392245292664, "learning_rate": 3.611008130144102e-05, "loss": 0.6568, "num_input_tokens_seen": 32593416, "step": 56140 }, { "epoch": 8.362377122430741, "grad_norm": 0.7696576714515686, "learning_rate": 3.610717028860801e-05, "loss": 0.5802, "num_input_tokens_seen": 32596296, "step": 56145 }, { "epoch": 8.3631218349717, "grad_norm": 0.8605245351791382, "learning_rate": 3.6104259088130655e-05, "loss": 0.5956, "num_input_tokens_seen": 32599592, "step": 56150 }, { "epoch": 8.36386654751266, "grad_norm": 0.8694520592689514, "learning_rate": 3.6101347700058116e-05, "loss": 0.6723, "num_input_tokens_seen": 32602408, "step": 56155 }, { "epoch": 8.36461126005362, "grad_norm": 1.5172514915466309, "learning_rate": 3.6098436124439594e-05, "loss": 0.5914, "num_input_tokens_seen": 32605128, "step": 56160 }, { "epoch": 8.365355972594578, "grad_norm": 1.477138876914978, "learning_rate": 3.609552436132427e-05, "loss": 0.7484, "num_input_tokens_seen": 32608040, "step": 56165 }, { "epoch": 8.366100685135537, "grad_norm": 1.2569738626480103, "learning_rate": 3.609261241076136e-05, "loss": 0.5184, "num_input_tokens_seen": 32610984, "step": 56170 }, { "epoch": 8.366845397676498, "grad_norm": 1.4967550039291382, "learning_rate": 3.608970027280001e-05, "loss": 0.3711, "num_input_tokens_seen": 32613992, "step": 56175 }, { "epoch": 8.367590110217456, "grad_norm": 1.7167755365371704, "learning_rate": 3.608678794748946e-05, "loss": 0.638, "num_input_tokens_seen": 32616872, "step": 56180 }, { "epoch": 8.368334822758415, "grad_norm": 1.3450247049331665, "learning_rate": 3.608387543487889e-05, "loss": 0.6473, "num_input_tokens_seen": 32619464, "step": 56185 }, { "epoch": 8.369079535299374, "grad_norm": 1.0195581912994385, "learning_rate": 3.6080962735017514e-05, "loss": 0.668, "num_input_tokens_seen": 32622600, "step": 56190 }, { "epoch": 8.369824247840334, "grad_norm": 1.307836890220642, "learning_rate": 3.607804984795453e-05, "loss": 0.6239, "num_input_tokens_seen": 32625320, "step": 56195 }, { "epoch": 8.370568960381293, "grad_norm": 1.3775520324707031, "learning_rate": 3.607513677373916e-05, "loss": 0.7557, "num_input_tokens_seen": 32628168, "step": 56200 }, { "epoch": 8.371313672922252, "grad_norm": 1.8121578693389893, "learning_rate": 3.60722235124206e-05, "loss": 0.6653, "num_input_tokens_seen": 32630984, "step": 56205 }, { "epoch": 8.37205838546321, "grad_norm": 1.010982632637024, "learning_rate": 3.606931006404809e-05, "loss": 0.5461, "num_input_tokens_seen": 32633800, "step": 56210 }, { "epoch": 8.372803098004171, "grad_norm": 1.4779008626937866, "learning_rate": 3.606639642867083e-05, "loss": 0.5704, "num_input_tokens_seen": 32637000, "step": 56215 }, { "epoch": 8.37354781054513, "grad_norm": 1.0919708013534546, "learning_rate": 3.606348260633805e-05, "loss": 0.5324, "num_input_tokens_seen": 32639624, "step": 56220 }, { "epoch": 8.374292523086089, "grad_norm": 0.9570426344871521, "learning_rate": 3.6060568597098974e-05, "loss": 0.5435, "num_input_tokens_seen": 32642632, "step": 56225 }, { "epoch": 8.375037235627047, "grad_norm": 0.9025507569313049, "learning_rate": 3.605765440100283e-05, "loss": 0.574, "num_input_tokens_seen": 32645640, "step": 56230 }, { "epoch": 8.375781948168008, "grad_norm": 0.8921139240264893, "learning_rate": 3.605474001809886e-05, "loss": 0.6869, "num_input_tokens_seen": 32648520, "step": 56235 }, { "epoch": 8.376526660708967, "grad_norm": 1.5167661905288696, "learning_rate": 3.6051825448436286e-05, "loss": 0.7244, "num_input_tokens_seen": 32651688, "step": 56240 }, { "epoch": 8.377271373249926, "grad_norm": 1.761437177658081, "learning_rate": 3.604891069206437e-05, "loss": 0.6346, "num_input_tokens_seen": 32654504, "step": 56245 }, { "epoch": 8.378016085790884, "grad_norm": 1.2642900943756104, "learning_rate": 3.6045995749032326e-05, "loss": 0.6081, "num_input_tokens_seen": 32657416, "step": 56250 }, { "epoch": 8.378760798331843, "grad_norm": 0.8946952223777771, "learning_rate": 3.6043080619389406e-05, "loss": 0.5853, "num_input_tokens_seen": 32660072, "step": 56255 }, { "epoch": 8.379505510872804, "grad_norm": 1.5060338973999023, "learning_rate": 3.604016530318487e-05, "loss": 0.7624, "num_input_tokens_seen": 32663240, "step": 56260 }, { "epoch": 8.380250223413762, "grad_norm": 1.6374616622924805, "learning_rate": 3.6037249800467957e-05, "loss": 0.6322, "num_input_tokens_seen": 32666024, "step": 56265 }, { "epoch": 8.380994935954721, "grad_norm": 1.268784761428833, "learning_rate": 3.6034334111287926e-05, "loss": 0.8496, "num_input_tokens_seen": 32668808, "step": 56270 }, { "epoch": 8.38173964849568, "grad_norm": 1.0437142848968506, "learning_rate": 3.603141823569404e-05, "loss": 0.6496, "num_input_tokens_seen": 32671816, "step": 56275 }, { "epoch": 8.38248436103664, "grad_norm": 1.0299533605575562, "learning_rate": 3.602850217373555e-05, "loss": 0.5526, "num_input_tokens_seen": 32674376, "step": 56280 }, { "epoch": 8.3832290735776, "grad_norm": 1.1455639600753784, "learning_rate": 3.602558592546172e-05, "loss": 0.5844, "num_input_tokens_seen": 32677192, "step": 56285 }, { "epoch": 8.383973786118558, "grad_norm": 1.2694002389907837, "learning_rate": 3.602266949092184e-05, "loss": 0.5627, "num_input_tokens_seen": 32680104, "step": 56290 }, { "epoch": 8.384718498659517, "grad_norm": 1.003443956375122, "learning_rate": 3.6019752870165145e-05, "loss": 0.5136, "num_input_tokens_seen": 32683016, "step": 56295 }, { "epoch": 8.385463211200477, "grad_norm": 1.2491322755813599, "learning_rate": 3.601683606324093e-05, "loss": 0.4261, "num_input_tokens_seen": 32685800, "step": 56300 }, { "epoch": 8.386207923741436, "grad_norm": 1.7200005054473877, "learning_rate": 3.601391907019847e-05, "loss": 0.6066, "num_input_tokens_seen": 32688808, "step": 56305 }, { "epoch": 8.386952636282395, "grad_norm": 2.0768346786499023, "learning_rate": 3.601100189108704e-05, "loss": 0.6025, "num_input_tokens_seen": 32691752, "step": 56310 }, { "epoch": 8.387697348823353, "grad_norm": 1.2336664199829102, "learning_rate": 3.600808452595592e-05, "loss": 0.7749, "num_input_tokens_seen": 32695272, "step": 56315 }, { "epoch": 8.388442061364314, "grad_norm": 1.4694024324417114, "learning_rate": 3.6005166974854406e-05, "loss": 0.6556, "num_input_tokens_seen": 32698312, "step": 56320 }, { "epoch": 8.389186773905273, "grad_norm": 1.642503261566162, "learning_rate": 3.6002249237831774e-05, "loss": 0.5807, "num_input_tokens_seen": 32701224, "step": 56325 }, { "epoch": 8.389931486446232, "grad_norm": 1.7630541324615479, "learning_rate": 3.599933131493733e-05, "loss": 0.5761, "num_input_tokens_seen": 32704072, "step": 56330 }, { "epoch": 8.39067619898719, "grad_norm": 0.8317503333091736, "learning_rate": 3.599641320622036e-05, "loss": 0.8661, "num_input_tokens_seen": 32706856, "step": 56335 }, { "epoch": 8.39142091152815, "grad_norm": 1.1425610780715942, "learning_rate": 3.599349491173016e-05, "loss": 0.6642, "num_input_tokens_seen": 32709640, "step": 56340 }, { "epoch": 8.39216562406911, "grad_norm": 1.5756652355194092, "learning_rate": 3.5990576431516044e-05, "loss": 0.8389, "num_input_tokens_seen": 32712392, "step": 56345 }, { "epoch": 8.392910336610068, "grad_norm": 0.9626196622848511, "learning_rate": 3.598765776562731e-05, "loss": 0.6324, "num_input_tokens_seen": 32715176, "step": 56350 }, { "epoch": 8.393655049151027, "grad_norm": 1.2024762630462646, "learning_rate": 3.598473891411326e-05, "loss": 0.5914, "num_input_tokens_seen": 32717896, "step": 56355 }, { "epoch": 8.394399761691988, "grad_norm": 1.5955921411514282, "learning_rate": 3.598181987702321e-05, "loss": 0.502, "num_input_tokens_seen": 32720424, "step": 56360 }, { "epoch": 8.395144474232946, "grad_norm": 0.844751238822937, "learning_rate": 3.5978900654406476e-05, "loss": 0.5275, "num_input_tokens_seen": 32723400, "step": 56365 }, { "epoch": 8.395889186773905, "grad_norm": 0.8220338821411133, "learning_rate": 3.597598124631239e-05, "loss": 0.5776, "num_input_tokens_seen": 32726536, "step": 56370 }, { "epoch": 8.396633899314864, "grad_norm": 1.1738052368164062, "learning_rate": 3.5973061652790237e-05, "loss": 0.6912, "num_input_tokens_seen": 32729352, "step": 56375 }, { "epoch": 8.397378611855824, "grad_norm": 1.3650257587432861, "learning_rate": 3.597014187388936e-05, "loss": 0.6611, "num_input_tokens_seen": 32732264, "step": 56380 }, { "epoch": 8.398123324396783, "grad_norm": 0.9346168041229248, "learning_rate": 3.5967221909659095e-05, "loss": 0.6182, "num_input_tokens_seen": 32735272, "step": 56385 }, { "epoch": 8.398868036937742, "grad_norm": 1.3671396970748901, "learning_rate": 3.596430176014875e-05, "loss": 0.5861, "num_input_tokens_seen": 32737992, "step": 56390 }, { "epoch": 8.3996127494787, "grad_norm": 1.417059302330017, "learning_rate": 3.596138142540768e-05, "loss": 0.868, "num_input_tokens_seen": 32740808, "step": 56395 }, { "epoch": 8.400357462019661, "grad_norm": 1.7871335744857788, "learning_rate": 3.5958460905485216e-05, "loss": 0.5903, "num_input_tokens_seen": 32743848, "step": 56400 }, { "epoch": 8.40110217456062, "grad_norm": 1.6356537342071533, "learning_rate": 3.595554020043068e-05, "loss": 0.5582, "num_input_tokens_seen": 32746952, "step": 56405 }, { "epoch": 8.401846887101579, "grad_norm": 1.8517224788665771, "learning_rate": 3.5952619310293435e-05, "loss": 0.6124, "num_input_tokens_seen": 32750216, "step": 56410 }, { "epoch": 8.402591599642538, "grad_norm": 1.0945814847946167, "learning_rate": 3.594969823512282e-05, "loss": 0.6539, "num_input_tokens_seen": 32753480, "step": 56415 }, { "epoch": 8.403336312183498, "grad_norm": 1.0757455825805664, "learning_rate": 3.5946776974968174e-05, "loss": 0.7045, "num_input_tokens_seen": 32756456, "step": 56420 }, { "epoch": 8.404081024724457, "grad_norm": 1.0961806774139404, "learning_rate": 3.5943855529878865e-05, "loss": 0.5925, "num_input_tokens_seen": 32759080, "step": 56425 }, { "epoch": 8.404825737265416, "grad_norm": 1.3097409009933472, "learning_rate": 3.594093389990424e-05, "loss": 0.6904, "num_input_tokens_seen": 32762184, "step": 56430 }, { "epoch": 8.405570449806374, "grad_norm": 0.7567923069000244, "learning_rate": 3.593801208509365e-05, "loss": 0.537, "num_input_tokens_seen": 32765096, "step": 56435 }, { "epoch": 8.406315162347333, "grad_norm": 1.09162175655365, "learning_rate": 3.593509008549646e-05, "loss": 0.4868, "num_input_tokens_seen": 32767912, "step": 56440 }, { "epoch": 8.407059874888294, "grad_norm": 1.4748462438583374, "learning_rate": 3.593216790116205e-05, "loss": 0.5372, "num_input_tokens_seen": 32770696, "step": 56445 }, { "epoch": 8.407804587429252, "grad_norm": 0.9369639158248901, "learning_rate": 3.5929245532139773e-05, "loss": 0.5909, "num_input_tokens_seen": 32773768, "step": 56450 }, { "epoch": 8.408549299970211, "grad_norm": 1.5869210958480835, "learning_rate": 3.5926322978478985e-05, "loss": 0.5413, "num_input_tokens_seen": 32776840, "step": 56455 }, { "epoch": 8.40929401251117, "grad_norm": 1.8429983854293823, "learning_rate": 3.592340024022909e-05, "loss": 0.7495, "num_input_tokens_seen": 32779752, "step": 56460 }, { "epoch": 8.41003872505213, "grad_norm": 0.8654800653457642, "learning_rate": 3.592047731743944e-05, "loss": 0.6998, "num_input_tokens_seen": 32782568, "step": 56465 }, { "epoch": 8.41078343759309, "grad_norm": 1.3054969310760498, "learning_rate": 3.591755421015943e-05, "loss": 0.5323, "num_input_tokens_seen": 32785736, "step": 56470 }, { "epoch": 8.411528150134048, "grad_norm": 0.9175500869750977, "learning_rate": 3.591463091843844e-05, "loss": 0.7246, "num_input_tokens_seen": 32788744, "step": 56475 }, { "epoch": 8.412272862675007, "grad_norm": 0.8794083595275879, "learning_rate": 3.591170744232585e-05, "loss": 0.5929, "num_input_tokens_seen": 32791464, "step": 56480 }, { "epoch": 8.413017575215967, "grad_norm": 1.3001724481582642, "learning_rate": 3.590878378187106e-05, "loss": 0.5105, "num_input_tokens_seen": 32794440, "step": 56485 }, { "epoch": 8.413762287756926, "grad_norm": 1.4931108951568604, "learning_rate": 3.5905859937123445e-05, "loss": 0.6015, "num_input_tokens_seen": 32797288, "step": 56490 }, { "epoch": 8.414507000297885, "grad_norm": 2.1612184047698975, "learning_rate": 3.5902935908132416e-05, "loss": 0.6706, "num_input_tokens_seen": 32800296, "step": 56495 }, { "epoch": 8.415251712838844, "grad_norm": 0.9034389853477478, "learning_rate": 3.5900011694947364e-05, "loss": 0.7194, "num_input_tokens_seen": 32802984, "step": 56500 }, { "epoch": 8.415996425379804, "grad_norm": 1.9276822805404663, "learning_rate": 3.5897087297617694e-05, "loss": 0.8193, "num_input_tokens_seen": 32805960, "step": 56505 }, { "epoch": 8.416741137920763, "grad_norm": 1.2464630603790283, "learning_rate": 3.589416271619281e-05, "loss": 0.7195, "num_input_tokens_seen": 32808808, "step": 56510 }, { "epoch": 8.417485850461722, "grad_norm": 2.233705759048462, "learning_rate": 3.589123795072212e-05, "loss": 0.5142, "num_input_tokens_seen": 32811560, "step": 56515 }, { "epoch": 8.41823056300268, "grad_norm": 1.0058232545852661, "learning_rate": 3.5888313001255034e-05, "loss": 0.691, "num_input_tokens_seen": 32814632, "step": 56520 }, { "epoch": 8.418975275543641, "grad_norm": 1.4561883211135864, "learning_rate": 3.588538786784096e-05, "loss": 0.573, "num_input_tokens_seen": 32817608, "step": 56525 }, { "epoch": 8.4197199880846, "grad_norm": 0.805886447429657, "learning_rate": 3.5882462550529325e-05, "loss": 0.7286, "num_input_tokens_seen": 32820488, "step": 56530 }, { "epoch": 8.420464700625558, "grad_norm": 0.8333202600479126, "learning_rate": 3.587953704936955e-05, "loss": 0.691, "num_input_tokens_seen": 32823432, "step": 56535 }, { "epoch": 8.421209413166517, "grad_norm": 1.2847607135772705, "learning_rate": 3.587661136441105e-05, "loss": 0.6827, "num_input_tokens_seen": 32826344, "step": 56540 }, { "epoch": 8.421954125707478, "grad_norm": 1.4241306781768799, "learning_rate": 3.587368549570326e-05, "loss": 0.5582, "num_input_tokens_seen": 32829192, "step": 56545 }, { "epoch": 8.422698838248436, "grad_norm": 1.3316782712936401, "learning_rate": 3.58707594432956e-05, "loss": 0.5424, "num_input_tokens_seen": 32832040, "step": 56550 }, { "epoch": 8.423443550789395, "grad_norm": 1.1812061071395874, "learning_rate": 3.586783320723751e-05, "loss": 0.4838, "num_input_tokens_seen": 32835368, "step": 56555 }, { "epoch": 8.424188263330354, "grad_norm": 1.9069287776947021, "learning_rate": 3.586490678757842e-05, "loss": 0.5754, "num_input_tokens_seen": 32838312, "step": 56560 }, { "epoch": 8.424932975871315, "grad_norm": 1.990736722946167, "learning_rate": 3.5861980184367775e-05, "loss": 0.9509, "num_input_tokens_seen": 32841096, "step": 56565 }, { "epoch": 8.425677688412273, "grad_norm": 0.7950765490531921, "learning_rate": 3.5859053397655014e-05, "loss": 0.5061, "num_input_tokens_seen": 32844040, "step": 56570 }, { "epoch": 8.426422400953232, "grad_norm": 1.2215536832809448, "learning_rate": 3.585612642748958e-05, "loss": 0.676, "num_input_tokens_seen": 32846696, "step": 56575 }, { "epoch": 8.42716711349419, "grad_norm": 0.9694135785102844, "learning_rate": 3.585319927392093e-05, "loss": 0.6457, "num_input_tokens_seen": 32849544, "step": 56580 }, { "epoch": 8.42791182603515, "grad_norm": 1.4890810251235962, "learning_rate": 3.585027193699851e-05, "loss": 0.6596, "num_input_tokens_seen": 32852648, "step": 56585 }, { "epoch": 8.42865653857611, "grad_norm": 0.8982388377189636, "learning_rate": 3.5847344416771766e-05, "loss": 0.71, "num_input_tokens_seen": 32856072, "step": 56590 }, { "epoch": 8.429401251117069, "grad_norm": 1.180058240890503, "learning_rate": 3.584441671329016e-05, "loss": 0.5291, "num_input_tokens_seen": 32858920, "step": 56595 }, { "epoch": 8.430145963658028, "grad_norm": 3.3982186317443848, "learning_rate": 3.584148882660316e-05, "loss": 0.7336, "num_input_tokens_seen": 32861576, "step": 56600 }, { "epoch": 8.430890676198986, "grad_norm": 1.0017727613449097, "learning_rate": 3.583856075676023e-05, "loss": 0.5612, "num_input_tokens_seen": 32864584, "step": 56605 }, { "epoch": 8.431635388739947, "grad_norm": 1.0953478813171387, "learning_rate": 3.5835632503810834e-05, "loss": 0.5028, "num_input_tokens_seen": 32867400, "step": 56610 }, { "epoch": 8.432380101280906, "grad_norm": 1.1623904705047607, "learning_rate": 3.5832704067804436e-05, "loss": 0.4747, "num_input_tokens_seen": 32870344, "step": 56615 }, { "epoch": 8.433124813821864, "grad_norm": 1.0827006101608276, "learning_rate": 3.582977544879051e-05, "loss": 0.601, "num_input_tokens_seen": 32873320, "step": 56620 }, { "epoch": 8.433869526362823, "grad_norm": 1.8187695741653442, "learning_rate": 3.5826846646818536e-05, "loss": 0.724, "num_input_tokens_seen": 32875976, "step": 56625 }, { "epoch": 8.434614238903784, "grad_norm": 2.113283395767212, "learning_rate": 3.582391766193799e-05, "loss": 0.6261, "num_input_tokens_seen": 32878824, "step": 56630 }, { "epoch": 8.435358951444742, "grad_norm": 1.5607020854949951, "learning_rate": 3.582098849419835e-05, "loss": 0.6461, "num_input_tokens_seen": 32881608, "step": 56635 }, { "epoch": 8.436103663985701, "grad_norm": 1.266697645187378, "learning_rate": 3.581805914364912e-05, "loss": 0.6957, "num_input_tokens_seen": 32884456, "step": 56640 }, { "epoch": 8.43684837652666, "grad_norm": 1.3661695718765259, "learning_rate": 3.581512961033977e-05, "loss": 0.5384, "num_input_tokens_seen": 32887336, "step": 56645 }, { "epoch": 8.43759308906762, "grad_norm": 1.5764716863632202, "learning_rate": 3.5812199894319795e-05, "loss": 0.5755, "num_input_tokens_seen": 32890184, "step": 56650 }, { "epoch": 8.43833780160858, "grad_norm": 1.7257795333862305, "learning_rate": 3.58092699956387e-05, "loss": 0.6765, "num_input_tokens_seen": 32893128, "step": 56655 }, { "epoch": 8.439082514149538, "grad_norm": 0.9671082496643066, "learning_rate": 3.580633991434597e-05, "loss": 0.5819, "num_input_tokens_seen": 32896136, "step": 56660 }, { "epoch": 8.439827226690497, "grad_norm": 1.2307699918746948, "learning_rate": 3.58034096504911e-05, "loss": 0.4992, "num_input_tokens_seen": 32898856, "step": 56665 }, { "epoch": 8.440571939231457, "grad_norm": 1.3026165962219238, "learning_rate": 3.580047920412362e-05, "loss": 0.5979, "num_input_tokens_seen": 32901672, "step": 56670 }, { "epoch": 8.441316651772416, "grad_norm": 1.4045416116714478, "learning_rate": 3.579754857529301e-05, "loss": 0.6735, "num_input_tokens_seen": 32904680, "step": 56675 }, { "epoch": 8.442061364313375, "grad_norm": 1.1497033834457397, "learning_rate": 3.57946177640488e-05, "loss": 0.6787, "num_input_tokens_seen": 32907624, "step": 56680 }, { "epoch": 8.442806076854334, "grad_norm": 1.268049955368042, "learning_rate": 3.579168677044049e-05, "loss": 0.5541, "num_input_tokens_seen": 32910632, "step": 56685 }, { "epoch": 8.443550789395294, "grad_norm": 1.3840081691741943, "learning_rate": 3.5788755594517595e-05, "loss": 0.7533, "num_input_tokens_seen": 32913608, "step": 56690 }, { "epoch": 8.444295501936253, "grad_norm": 1.8324235677719116, "learning_rate": 3.5785824236329644e-05, "loss": 0.585, "num_input_tokens_seen": 32916392, "step": 56695 }, { "epoch": 8.445040214477212, "grad_norm": 2.051867961883545, "learning_rate": 3.578289269592615e-05, "loss": 0.6209, "num_input_tokens_seen": 32919528, "step": 56700 }, { "epoch": 8.44578492701817, "grad_norm": 1.2539008855819702, "learning_rate": 3.577996097335665e-05, "loss": 0.7445, "num_input_tokens_seen": 32922120, "step": 56705 }, { "epoch": 8.446529639559131, "grad_norm": 1.577566385269165, "learning_rate": 3.577702906867066e-05, "loss": 0.6673, "num_input_tokens_seen": 32925000, "step": 56710 }, { "epoch": 8.44727435210009, "grad_norm": 1.9199472665786743, "learning_rate": 3.577409698191773e-05, "loss": 0.6761, "num_input_tokens_seen": 32928104, "step": 56715 }, { "epoch": 8.448019064641048, "grad_norm": 0.8657833337783813, "learning_rate": 3.5771164713147364e-05, "loss": 0.6717, "num_input_tokens_seen": 32931016, "step": 56720 }, { "epoch": 8.448763777182007, "grad_norm": 1.7398627996444702, "learning_rate": 3.576823226240913e-05, "loss": 0.6474, "num_input_tokens_seen": 32934344, "step": 56725 }, { "epoch": 8.449508489722968, "grad_norm": 0.9655729532241821, "learning_rate": 3.576529962975255e-05, "loss": 0.8567, "num_input_tokens_seen": 32936904, "step": 56730 }, { "epoch": 8.450253202263927, "grad_norm": 1.2020925283432007, "learning_rate": 3.576236681522718e-05, "loss": 0.7542, "num_input_tokens_seen": 32939720, "step": 56735 }, { "epoch": 8.450997914804885, "grad_norm": 1.3793278932571411, "learning_rate": 3.575943381888255e-05, "loss": 0.6395, "num_input_tokens_seen": 32942760, "step": 56740 }, { "epoch": 8.451742627345844, "grad_norm": 0.7593240737915039, "learning_rate": 3.575650064076823e-05, "loss": 0.5877, "num_input_tokens_seen": 32945608, "step": 56745 }, { "epoch": 8.452487339886805, "grad_norm": 0.9974817633628845, "learning_rate": 3.575356728093376e-05, "loss": 0.8467, "num_input_tokens_seen": 32948776, "step": 56750 }, { "epoch": 8.453232052427763, "grad_norm": 0.9294259548187256, "learning_rate": 3.575063373942871e-05, "loss": 0.649, "num_input_tokens_seen": 32951720, "step": 56755 }, { "epoch": 8.453976764968722, "grad_norm": 1.0145471096038818, "learning_rate": 3.5747700016302616e-05, "loss": 0.5651, "num_input_tokens_seen": 32954664, "step": 56760 }, { "epoch": 8.45472147750968, "grad_norm": 0.8035833239555359, "learning_rate": 3.574476611160506e-05, "loss": 0.5586, "num_input_tokens_seen": 32957576, "step": 56765 }, { "epoch": 8.45546619005064, "grad_norm": 1.4848946332931519, "learning_rate": 3.5741832025385596e-05, "loss": 0.6253, "num_input_tokens_seen": 32960680, "step": 56770 }, { "epoch": 8.4562109025916, "grad_norm": 1.243897795677185, "learning_rate": 3.57388977576938e-05, "loss": 0.6233, "num_input_tokens_seen": 32963528, "step": 56775 }, { "epoch": 8.456955615132559, "grad_norm": 3.4145009517669678, "learning_rate": 3.5735963308579256e-05, "loss": 0.6002, "num_input_tokens_seen": 32966344, "step": 56780 }, { "epoch": 8.457700327673518, "grad_norm": 1.2129260301589966, "learning_rate": 3.573302867809151e-05, "loss": 0.582, "num_input_tokens_seen": 32969512, "step": 56785 }, { "epoch": 8.458445040214476, "grad_norm": 1.1672972440719604, "learning_rate": 3.573009386628015e-05, "loss": 0.6238, "num_input_tokens_seen": 32972168, "step": 56790 }, { "epoch": 8.459189752755437, "grad_norm": 2.431281089782715, "learning_rate": 3.5727158873194763e-05, "loss": 0.7529, "num_input_tokens_seen": 32975048, "step": 56795 }, { "epoch": 8.459934465296396, "grad_norm": 0.959644079208374, "learning_rate": 3.572422369888493e-05, "loss": 0.5938, "num_input_tokens_seen": 32977864, "step": 56800 }, { "epoch": 8.460679177837354, "grad_norm": 1.686815857887268, "learning_rate": 3.5721288343400235e-05, "loss": 0.5839, "num_input_tokens_seen": 32980712, "step": 56805 }, { "epoch": 8.461423890378313, "grad_norm": 0.9567371010780334, "learning_rate": 3.571835280679027e-05, "loss": 0.5694, "num_input_tokens_seen": 32983432, "step": 56810 }, { "epoch": 8.462168602919274, "grad_norm": 0.5322571992874146, "learning_rate": 3.5715417089104634e-05, "loss": 0.5894, "num_input_tokens_seen": 32986472, "step": 56815 }, { "epoch": 8.462913315460233, "grad_norm": 1.069158673286438, "learning_rate": 3.571248119039291e-05, "loss": 0.6022, "num_input_tokens_seen": 32989064, "step": 56820 }, { "epoch": 8.463658028001191, "grad_norm": 0.7412814497947693, "learning_rate": 3.570954511070471e-05, "loss": 0.6265, "num_input_tokens_seen": 32992168, "step": 56825 }, { "epoch": 8.46440274054215, "grad_norm": 1.2327768802642822, "learning_rate": 3.570660885008962e-05, "loss": 0.565, "num_input_tokens_seen": 32994856, "step": 56830 }, { "epoch": 8.46514745308311, "grad_norm": 0.8094858527183533, "learning_rate": 3.570367240859727e-05, "loss": 0.5206, "num_input_tokens_seen": 32997992, "step": 56835 }, { "epoch": 8.46589216562407, "grad_norm": 1.8820194005966187, "learning_rate": 3.570073578627724e-05, "loss": 0.8258, "num_input_tokens_seen": 33000584, "step": 56840 }, { "epoch": 8.466636878165028, "grad_norm": 0.9005323052406311, "learning_rate": 3.5697798983179165e-05, "loss": 0.6904, "num_input_tokens_seen": 33003304, "step": 56845 }, { "epoch": 8.467381590705987, "grad_norm": 1.5706534385681152, "learning_rate": 3.569486199935264e-05, "loss": 0.6796, "num_input_tokens_seen": 33006504, "step": 56850 }, { "epoch": 8.468126303246947, "grad_norm": 1.1481090784072876, "learning_rate": 3.56919248348473e-05, "loss": 0.4996, "num_input_tokens_seen": 33009128, "step": 56855 }, { "epoch": 8.468871015787906, "grad_norm": 2.6000051498413086, "learning_rate": 3.568898748971275e-05, "loss": 0.6832, "num_input_tokens_seen": 33011848, "step": 56860 }, { "epoch": 8.469615728328865, "grad_norm": 0.9587209820747375, "learning_rate": 3.568604996399862e-05, "loss": 0.7419, "num_input_tokens_seen": 33014536, "step": 56865 }, { "epoch": 8.470360440869824, "grad_norm": 0.748996376991272, "learning_rate": 3.5683112257754535e-05, "loss": 0.5808, "num_input_tokens_seen": 33017672, "step": 56870 }, { "epoch": 8.471105153410784, "grad_norm": 1.2770249843597412, "learning_rate": 3.568017437103013e-05, "loss": 0.6039, "num_input_tokens_seen": 33020232, "step": 56875 }, { "epoch": 8.471849865951743, "grad_norm": 1.2575322389602661, "learning_rate": 3.567723630387504e-05, "loss": 0.5991, "num_input_tokens_seen": 33022888, "step": 56880 }, { "epoch": 8.472594578492702, "grad_norm": 1.3742241859436035, "learning_rate": 3.5674298056338885e-05, "loss": 0.6133, "num_input_tokens_seen": 33025768, "step": 56885 }, { "epoch": 8.47333929103366, "grad_norm": 1.2833032608032227, "learning_rate": 3.5671359628471315e-05, "loss": 0.8074, "num_input_tokens_seen": 33028680, "step": 56890 }, { "epoch": 8.474084003574621, "grad_norm": 1.3008449077606201, "learning_rate": 3.566842102032198e-05, "loss": 0.6505, "num_input_tokens_seen": 33031880, "step": 56895 }, { "epoch": 8.47482871611558, "grad_norm": 1.2749824523925781, "learning_rate": 3.56654822319405e-05, "loss": 0.7011, "num_input_tokens_seen": 33034760, "step": 56900 }, { "epoch": 8.475573428656539, "grad_norm": 0.8796647191047668, "learning_rate": 3.5662543263376544e-05, "loss": 0.5555, "num_input_tokens_seen": 33037576, "step": 56905 }, { "epoch": 8.476318141197497, "grad_norm": 1.166744589805603, "learning_rate": 3.5659604114679754e-05, "loss": 0.6561, "num_input_tokens_seen": 33040520, "step": 56910 }, { "epoch": 8.477062853738458, "grad_norm": 0.6164748072624207, "learning_rate": 3.565666478589979e-05, "loss": 0.4319, "num_input_tokens_seen": 33043496, "step": 56915 }, { "epoch": 8.477807566279417, "grad_norm": 1.0999600887298584, "learning_rate": 3.5653725277086306e-05, "loss": 0.6365, "num_input_tokens_seen": 33046376, "step": 56920 }, { "epoch": 8.478552278820375, "grad_norm": 0.8353235125541687, "learning_rate": 3.565078558828896e-05, "loss": 0.5801, "num_input_tokens_seen": 33049224, "step": 56925 }, { "epoch": 8.479296991361334, "grad_norm": 1.0824241638183594, "learning_rate": 3.564784571955741e-05, "loss": 0.513, "num_input_tokens_seen": 33052136, "step": 56930 }, { "epoch": 8.480041703902295, "grad_norm": 1.1525607109069824, "learning_rate": 3.5644905670941345e-05, "loss": 0.5309, "num_input_tokens_seen": 33054792, "step": 56935 }, { "epoch": 8.480786416443253, "grad_norm": 4.207602024078369, "learning_rate": 3.56419654424904e-05, "loss": 0.7106, "num_input_tokens_seen": 33057704, "step": 56940 }, { "epoch": 8.481531128984212, "grad_norm": 1.4341282844543457, "learning_rate": 3.5639025034254274e-05, "loss": 0.6988, "num_input_tokens_seen": 33060680, "step": 56945 }, { "epoch": 8.482275841525171, "grad_norm": 1.7570117712020874, "learning_rate": 3.563608444628264e-05, "loss": 0.7058, "num_input_tokens_seen": 33063592, "step": 56950 }, { "epoch": 8.48302055406613, "grad_norm": 0.8517626523971558, "learning_rate": 3.563314367862515e-05, "loss": 0.7854, "num_input_tokens_seen": 33066472, "step": 56955 }, { "epoch": 8.48376526660709, "grad_norm": 1.1890214681625366, "learning_rate": 3.5630202731331515e-05, "loss": 0.6058, "num_input_tokens_seen": 33069352, "step": 56960 }, { "epoch": 8.484509979148049, "grad_norm": 0.9111369252204895, "learning_rate": 3.562726160445141e-05, "loss": 0.5723, "num_input_tokens_seen": 33072200, "step": 56965 }, { "epoch": 8.485254691689008, "grad_norm": 0.4593088626861572, "learning_rate": 3.562432029803452e-05, "loss": 0.6312, "num_input_tokens_seen": 33075304, "step": 56970 }, { "epoch": 8.485999404229966, "grad_norm": 1.1709437370300293, "learning_rate": 3.562137881213053e-05, "loss": 0.7854, "num_input_tokens_seen": 33078472, "step": 56975 }, { "epoch": 8.486744116770927, "grad_norm": 1.2613099813461304, "learning_rate": 3.5618437146789155e-05, "loss": 0.5551, "num_input_tokens_seen": 33081512, "step": 56980 }, { "epoch": 8.487488829311886, "grad_norm": 1.4918274879455566, "learning_rate": 3.561549530206007e-05, "loss": 0.6019, "num_input_tokens_seen": 33084232, "step": 56985 }, { "epoch": 8.488233541852845, "grad_norm": 2.7107977867126465, "learning_rate": 3.561255327799298e-05, "loss": 0.6107, "num_input_tokens_seen": 33087176, "step": 56990 }, { "epoch": 8.488978254393803, "grad_norm": 1.779849886894226, "learning_rate": 3.5609611074637584e-05, "loss": 0.5078, "num_input_tokens_seen": 33090024, "step": 56995 }, { "epoch": 8.489722966934764, "grad_norm": 1.255979061126709, "learning_rate": 3.5606668692043595e-05, "loss": 0.5375, "num_input_tokens_seen": 33093064, "step": 57000 }, { "epoch": 8.490467679475723, "grad_norm": 2.677889585494995, "learning_rate": 3.5603726130260715e-05, "loss": 0.6665, "num_input_tokens_seen": 33095752, "step": 57005 }, { "epoch": 8.491212392016681, "grad_norm": 1.4105587005615234, "learning_rate": 3.5600783389338674e-05, "loss": 0.5815, "num_input_tokens_seen": 33098472, "step": 57010 }, { "epoch": 8.49195710455764, "grad_norm": 2.135960578918457, "learning_rate": 3.559784046932716e-05, "loss": 0.838, "num_input_tokens_seen": 33101096, "step": 57015 }, { "epoch": 8.4927018170986, "grad_norm": 1.352732539176941, "learning_rate": 3.5594897370275905e-05, "loss": 0.7285, "num_input_tokens_seen": 33104040, "step": 57020 }, { "epoch": 8.49344652963956, "grad_norm": 1.7692030668258667, "learning_rate": 3.5591954092234625e-05, "loss": 0.6986, "num_input_tokens_seen": 33106920, "step": 57025 }, { "epoch": 8.494191242180518, "grad_norm": 1.7772547006607056, "learning_rate": 3.558901063525305e-05, "loss": 0.6841, "num_input_tokens_seen": 33109992, "step": 57030 }, { "epoch": 8.494935954721477, "grad_norm": 1.0006498098373413, "learning_rate": 3.55860669993809e-05, "loss": 0.5846, "num_input_tokens_seen": 33114216, "step": 57035 }, { "epoch": 8.495680667262437, "grad_norm": 1.1245626211166382, "learning_rate": 3.55831231846679e-05, "loss": 0.4377, "num_input_tokens_seen": 33117128, "step": 57040 }, { "epoch": 8.496425379803396, "grad_norm": 1.2050912380218506, "learning_rate": 3.55801791911638e-05, "loss": 0.6208, "num_input_tokens_seen": 33119656, "step": 57045 }, { "epoch": 8.497170092344355, "grad_norm": 1.4610041379928589, "learning_rate": 3.557723501891832e-05, "loss": 0.7066, "num_input_tokens_seen": 33122408, "step": 57050 }, { "epoch": 8.497914804885314, "grad_norm": 1.5972436666488647, "learning_rate": 3.557429066798121e-05, "loss": 0.5358, "num_input_tokens_seen": 33125128, "step": 57055 }, { "epoch": 8.498659517426274, "grad_norm": 1.46946120262146, "learning_rate": 3.5571346138402204e-05, "loss": 0.8142, "num_input_tokens_seen": 33127912, "step": 57060 }, { "epoch": 8.499404229967233, "grad_norm": 1.4727543592453003, "learning_rate": 3.5568401430231045e-05, "loss": 0.5982, "num_input_tokens_seen": 33130952, "step": 57065 }, { "epoch": 8.500148942508192, "grad_norm": 2.042976140975952, "learning_rate": 3.556545654351749e-05, "loss": 0.5711, "num_input_tokens_seen": 33133896, "step": 57070 }, { "epoch": 8.50089365504915, "grad_norm": 2.851379871368408, "learning_rate": 3.556251147831128e-05, "loss": 0.6813, "num_input_tokens_seen": 33136968, "step": 57075 }, { "epoch": 8.501638367590111, "grad_norm": 2.0529708862304688, "learning_rate": 3.5559566234662175e-05, "loss": 0.7398, "num_input_tokens_seen": 33139976, "step": 57080 }, { "epoch": 8.50238308013107, "grad_norm": 0.9964903593063354, "learning_rate": 3.555662081261994e-05, "loss": 0.6245, "num_input_tokens_seen": 33142856, "step": 57085 }, { "epoch": 8.503127792672029, "grad_norm": 1.0128973722457886, "learning_rate": 3.555367521223431e-05, "loss": 0.6065, "num_input_tokens_seen": 33145704, "step": 57090 }, { "epoch": 8.503872505212987, "grad_norm": 1.7911752462387085, "learning_rate": 3.555072943355508e-05, "loss": 0.6477, "num_input_tokens_seen": 33148552, "step": 57095 }, { "epoch": 8.504617217753946, "grad_norm": 0.8856768608093262, "learning_rate": 3.554778347663199e-05, "loss": 0.5215, "num_input_tokens_seen": 33151112, "step": 57100 }, { "epoch": 8.505361930294907, "grad_norm": 1.2019548416137695, "learning_rate": 3.554483734151482e-05, "loss": 0.5896, "num_input_tokens_seen": 33153832, "step": 57105 }, { "epoch": 8.506106642835865, "grad_norm": 1.639249324798584, "learning_rate": 3.554189102825334e-05, "loss": 0.5644, "num_input_tokens_seen": 33156744, "step": 57110 }, { "epoch": 8.506851355376824, "grad_norm": 1.3761833906173706, "learning_rate": 3.553894453689733e-05, "loss": 0.6129, "num_input_tokens_seen": 33159528, "step": 57115 }, { "epoch": 8.507596067917785, "grad_norm": 1.6669654846191406, "learning_rate": 3.553599786749656e-05, "loss": 0.4886, "num_input_tokens_seen": 33162408, "step": 57120 }, { "epoch": 8.508340780458743, "grad_norm": 1.9148969650268555, "learning_rate": 3.553305102010081e-05, "loss": 0.6273, "num_input_tokens_seen": 33165352, "step": 57125 }, { "epoch": 8.509085492999702, "grad_norm": 1.7569541931152344, "learning_rate": 3.553010399475987e-05, "loss": 0.6532, "num_input_tokens_seen": 33168488, "step": 57130 }, { "epoch": 8.509830205540661, "grad_norm": 1.5071409940719604, "learning_rate": 3.552715679152353e-05, "loss": 0.685, "num_input_tokens_seen": 33171336, "step": 57135 }, { "epoch": 8.51057491808162, "grad_norm": 1.095429539680481, "learning_rate": 3.552420941044157e-05, "loss": 0.4931, "num_input_tokens_seen": 33174216, "step": 57140 }, { "epoch": 8.51131963062258, "grad_norm": 1.3051795959472656, "learning_rate": 3.5521261851563796e-05, "loss": 0.7536, "num_input_tokens_seen": 33177224, "step": 57145 }, { "epoch": 8.512064343163539, "grad_norm": 1.3699051141738892, "learning_rate": 3.551831411493999e-05, "loss": 0.6401, "num_input_tokens_seen": 33179976, "step": 57150 }, { "epoch": 8.512809055704498, "grad_norm": 0.8920290470123291, "learning_rate": 3.551536620061996e-05, "loss": 0.7196, "num_input_tokens_seen": 33182824, "step": 57155 }, { "epoch": 8.513553768245457, "grad_norm": 0.7725732922554016, "learning_rate": 3.55124181086535e-05, "loss": 0.633, "num_input_tokens_seen": 33185704, "step": 57160 }, { "epoch": 8.514298480786417, "grad_norm": 1.639268159866333, "learning_rate": 3.5509469839090426e-05, "loss": 0.4424, "num_input_tokens_seen": 33188328, "step": 57165 }, { "epoch": 8.515043193327376, "grad_norm": 1.3552610874176025, "learning_rate": 3.550652139198054e-05, "loss": 0.5341, "num_input_tokens_seen": 33191176, "step": 57170 }, { "epoch": 8.515787905868335, "grad_norm": 1.762559413909912, "learning_rate": 3.550357276737365e-05, "loss": 0.6994, "num_input_tokens_seen": 33194056, "step": 57175 }, { "epoch": 8.516532618409293, "grad_norm": 1.042004942893982, "learning_rate": 3.550062396531959e-05, "loss": 0.5482, "num_input_tokens_seen": 33197064, "step": 57180 }, { "epoch": 8.517277330950254, "grad_norm": 1.3126436471939087, "learning_rate": 3.549767498586814e-05, "loss": 0.6904, "num_input_tokens_seen": 33199816, "step": 57185 }, { "epoch": 8.518022043491213, "grad_norm": 1.0843263864517212, "learning_rate": 3.549472582906914e-05, "loss": 0.7118, "num_input_tokens_seen": 33202728, "step": 57190 }, { "epoch": 8.518766756032171, "grad_norm": 0.6857524514198303, "learning_rate": 3.549177649497242e-05, "loss": 0.8152, "num_input_tokens_seen": 33205416, "step": 57195 }, { "epoch": 8.51951146857313, "grad_norm": 0.8240059614181519, "learning_rate": 3.54888269836278e-05, "loss": 0.5464, "num_input_tokens_seen": 33208584, "step": 57200 }, { "epoch": 8.52025618111409, "grad_norm": 0.9423016309738159, "learning_rate": 3.5485877295085105e-05, "loss": 0.6308, "num_input_tokens_seen": 33211720, "step": 57205 }, { "epoch": 8.52100089365505, "grad_norm": 1.325103521347046, "learning_rate": 3.5482927429394184e-05, "loss": 0.6713, "num_input_tokens_seen": 33214888, "step": 57210 }, { "epoch": 8.521745606196008, "grad_norm": 1.2201485633850098, "learning_rate": 3.547997738660485e-05, "loss": 0.4863, "num_input_tokens_seen": 33218024, "step": 57215 }, { "epoch": 8.522490318736967, "grad_norm": 1.14839768409729, "learning_rate": 3.547702716676694e-05, "loss": 0.607, "num_input_tokens_seen": 33221096, "step": 57220 }, { "epoch": 8.523235031277927, "grad_norm": 1.128947377204895, "learning_rate": 3.547407676993032e-05, "loss": 0.5638, "num_input_tokens_seen": 33224424, "step": 57225 }, { "epoch": 8.523979743818886, "grad_norm": 1.3472273349761963, "learning_rate": 3.54711261961448e-05, "loss": 0.6937, "num_input_tokens_seen": 33227048, "step": 57230 }, { "epoch": 8.524724456359845, "grad_norm": 1.1867653131484985, "learning_rate": 3.5468175445460263e-05, "loss": 0.5694, "num_input_tokens_seen": 33230184, "step": 57235 }, { "epoch": 8.525469168900804, "grad_norm": 1.2390727996826172, "learning_rate": 3.546522451792653e-05, "loss": 0.7191, "num_input_tokens_seen": 33233256, "step": 57240 }, { "epoch": 8.526213881441764, "grad_norm": 1.5427016019821167, "learning_rate": 3.546227341359347e-05, "loss": 0.7612, "num_input_tokens_seen": 33236264, "step": 57245 }, { "epoch": 8.526958593982723, "grad_norm": 0.9679030179977417, "learning_rate": 3.545932213251093e-05, "loss": 0.6272, "num_input_tokens_seen": 33239144, "step": 57250 }, { "epoch": 8.527703306523682, "grad_norm": 0.9809937477111816, "learning_rate": 3.545637067472878e-05, "loss": 0.5856, "num_input_tokens_seen": 33241736, "step": 57255 }, { "epoch": 8.52844801906464, "grad_norm": 1.4521369934082031, "learning_rate": 3.545341904029687e-05, "loss": 0.7338, "num_input_tokens_seen": 33244520, "step": 57260 }, { "epoch": 8.529192731605601, "grad_norm": 0.6757146716117859, "learning_rate": 3.545046722926507e-05, "loss": 0.5368, "num_input_tokens_seen": 33247528, "step": 57265 }, { "epoch": 8.52993744414656, "grad_norm": 1.799727201461792, "learning_rate": 3.544751524168325e-05, "loss": 0.6521, "num_input_tokens_seen": 33250248, "step": 57270 }, { "epoch": 8.530682156687519, "grad_norm": 1.2617563009262085, "learning_rate": 3.544456307760128e-05, "loss": 0.6613, "num_input_tokens_seen": 33252808, "step": 57275 }, { "epoch": 8.531426869228477, "grad_norm": 0.962131679058075, "learning_rate": 3.5441610737069026e-05, "loss": 0.6168, "num_input_tokens_seen": 33255496, "step": 57280 }, { "epoch": 8.532171581769436, "grad_norm": 2.971583366394043, "learning_rate": 3.543865822013637e-05, "loss": 0.697, "num_input_tokens_seen": 33258376, "step": 57285 }, { "epoch": 8.532916294310397, "grad_norm": 0.9057456254959106, "learning_rate": 3.5435705526853196e-05, "loss": 0.5082, "num_input_tokens_seen": 33260872, "step": 57290 }, { "epoch": 8.533661006851355, "grad_norm": 0.9246532917022705, "learning_rate": 3.5432752657269384e-05, "loss": 0.5115, "num_input_tokens_seen": 33263752, "step": 57295 }, { "epoch": 8.534405719392314, "grad_norm": 1.5611357688903809, "learning_rate": 3.542979961143482e-05, "loss": 0.5844, "num_input_tokens_seen": 33266248, "step": 57300 }, { "epoch": 8.535150431933273, "grad_norm": 1.3955414295196533, "learning_rate": 3.542684638939939e-05, "loss": 0.5832, "num_input_tokens_seen": 33269000, "step": 57305 }, { "epoch": 8.535895144474233, "grad_norm": 1.1824368238449097, "learning_rate": 3.5423892991212994e-05, "loss": 0.6127, "num_input_tokens_seen": 33271688, "step": 57310 }, { "epoch": 8.536639857015192, "grad_norm": 2.345665693283081, "learning_rate": 3.542093941692551e-05, "loss": 0.7347, "num_input_tokens_seen": 33274472, "step": 57315 }, { "epoch": 8.537384569556151, "grad_norm": 1.0732505321502686, "learning_rate": 3.541798566658685e-05, "loss": 0.7511, "num_input_tokens_seen": 33277704, "step": 57320 }, { "epoch": 8.53812928209711, "grad_norm": 1.2590845823287964, "learning_rate": 3.541503174024691e-05, "loss": 0.7679, "num_input_tokens_seen": 33280552, "step": 57325 }, { "epoch": 8.53887399463807, "grad_norm": 1.818098545074463, "learning_rate": 3.54120776379556e-05, "loss": 0.5777, "num_input_tokens_seen": 33283496, "step": 57330 }, { "epoch": 8.539618707179029, "grad_norm": 1.6482818126678467, "learning_rate": 3.540912335976281e-05, "loss": 0.5607, "num_input_tokens_seen": 33286696, "step": 57335 }, { "epoch": 8.540363419719988, "grad_norm": 1.6543210744857788, "learning_rate": 3.540616890571847e-05, "loss": 0.519, "num_input_tokens_seen": 33289448, "step": 57340 }, { "epoch": 8.541108132260947, "grad_norm": 1.1820656061172485, "learning_rate": 3.540321427587249e-05, "loss": 0.7045, "num_input_tokens_seen": 33292232, "step": 57345 }, { "epoch": 8.541852844801907, "grad_norm": 1.1329617500305176, "learning_rate": 3.540025947027476e-05, "loss": 0.6045, "num_input_tokens_seen": 33294760, "step": 57350 }, { "epoch": 8.542597557342866, "grad_norm": 2.093946933746338, "learning_rate": 3.5397304488975226e-05, "loss": 0.6899, "num_input_tokens_seen": 33297544, "step": 57355 }, { "epoch": 8.543342269883825, "grad_norm": 1.0533229112625122, "learning_rate": 3.53943493320238e-05, "loss": 0.5239, "num_input_tokens_seen": 33300296, "step": 57360 }, { "epoch": 8.544086982424783, "grad_norm": 1.666975498199463, "learning_rate": 3.539139399947039e-05, "loss": 0.6922, "num_input_tokens_seen": 33302984, "step": 57365 }, { "epoch": 8.544831694965744, "grad_norm": 1.090495228767395, "learning_rate": 3.5388438491364963e-05, "loss": 0.6402, "num_input_tokens_seen": 33305928, "step": 57370 }, { "epoch": 8.545576407506703, "grad_norm": 1.3136593103408813, "learning_rate": 3.538548280775742e-05, "loss": 0.5955, "num_input_tokens_seen": 33308936, "step": 57375 }, { "epoch": 8.546321120047661, "grad_norm": 1.0082834959030151, "learning_rate": 3.53825269486977e-05, "loss": 0.5834, "num_input_tokens_seen": 33311880, "step": 57380 }, { "epoch": 8.54706583258862, "grad_norm": 1.430250883102417, "learning_rate": 3.5379570914235735e-05, "loss": 0.6704, "num_input_tokens_seen": 33315048, "step": 57385 }, { "epoch": 8.54781054512958, "grad_norm": 1.1159437894821167, "learning_rate": 3.537661470442147e-05, "loss": 0.462, "num_input_tokens_seen": 33317960, "step": 57390 }, { "epoch": 8.54855525767054, "grad_norm": 1.2155978679656982, "learning_rate": 3.537365831930484e-05, "loss": 0.6525, "num_input_tokens_seen": 33320808, "step": 57395 }, { "epoch": 8.549299970211498, "grad_norm": 1.7013325691223145, "learning_rate": 3.53707017589358e-05, "loss": 0.8175, "num_input_tokens_seen": 33324872, "step": 57400 }, { "epoch": 8.550044682752457, "grad_norm": 1.0482410192489624, "learning_rate": 3.53677450233643e-05, "loss": 0.5116, "num_input_tokens_seen": 33327272, "step": 57405 }, { "epoch": 8.550789395293418, "grad_norm": 1.5910547971725464, "learning_rate": 3.536478811264028e-05, "loss": 0.7363, "num_input_tokens_seen": 33329864, "step": 57410 }, { "epoch": 8.551534107834376, "grad_norm": 2.621598243713379, "learning_rate": 3.5361831026813704e-05, "loss": 0.7303, "num_input_tokens_seen": 33332488, "step": 57415 }, { "epoch": 8.552278820375335, "grad_norm": 1.7133395671844482, "learning_rate": 3.535887376593453e-05, "loss": 0.6386, "num_input_tokens_seen": 33335720, "step": 57420 }, { "epoch": 8.553023532916294, "grad_norm": 1.1098401546478271, "learning_rate": 3.53559163300527e-05, "loss": 0.7547, "num_input_tokens_seen": 33338632, "step": 57425 }, { "epoch": 8.553768245457253, "grad_norm": 1.2199335098266602, "learning_rate": 3.5352958719218186e-05, "loss": 0.5819, "num_input_tokens_seen": 33341576, "step": 57430 }, { "epoch": 8.554512957998213, "grad_norm": 1.4227521419525146, "learning_rate": 3.5350000933480966e-05, "loss": 0.6809, "num_input_tokens_seen": 33344456, "step": 57435 }, { "epoch": 8.555257670539172, "grad_norm": 1.0345710515975952, "learning_rate": 3.5347042972891e-05, "loss": 0.7949, "num_input_tokens_seen": 33347336, "step": 57440 }, { "epoch": 8.55600238308013, "grad_norm": 0.913844108581543, "learning_rate": 3.5344084837498245e-05, "loss": 0.5819, "num_input_tokens_seen": 33350472, "step": 57445 }, { "epoch": 8.556747095621091, "grad_norm": 1.2882260084152222, "learning_rate": 3.53411265273527e-05, "loss": 0.7391, "num_input_tokens_seen": 33353256, "step": 57450 }, { "epoch": 8.55749180816205, "grad_norm": 0.9485790729522705, "learning_rate": 3.5338168042504336e-05, "loss": 0.5417, "num_input_tokens_seen": 33355848, "step": 57455 }, { "epoch": 8.558236520703009, "grad_norm": 1.2750611305236816, "learning_rate": 3.533520938300313e-05, "loss": 0.4879, "num_input_tokens_seen": 33358600, "step": 57460 }, { "epoch": 8.558981233243967, "grad_norm": 1.019112467765808, "learning_rate": 3.533225054889906e-05, "loss": 0.5779, "num_input_tokens_seen": 33361480, "step": 57465 }, { "epoch": 8.559725945784926, "grad_norm": 1.474048137664795, "learning_rate": 3.532929154024212e-05, "loss": 0.4994, "num_input_tokens_seen": 33364232, "step": 57470 }, { "epoch": 8.560470658325887, "grad_norm": 1.2817955017089844, "learning_rate": 3.5326332357082306e-05, "loss": 0.5991, "num_input_tokens_seen": 33367176, "step": 57475 }, { "epoch": 8.561215370866845, "grad_norm": 1.019171953201294, "learning_rate": 3.532337299946959e-05, "loss": 0.5723, "num_input_tokens_seen": 33370216, "step": 57480 }, { "epoch": 8.561960083407804, "grad_norm": 1.530275583267212, "learning_rate": 3.532041346745398e-05, "loss": 0.6172, "num_input_tokens_seen": 33373000, "step": 57485 }, { "epoch": 8.562704795948763, "grad_norm": 1.1273449659347534, "learning_rate": 3.5317453761085476e-05, "loss": 0.6676, "num_input_tokens_seen": 33376136, "step": 57490 }, { "epoch": 8.563449508489724, "grad_norm": 2.0357589721679688, "learning_rate": 3.531449388041408e-05, "loss": 0.5124, "num_input_tokens_seen": 33378920, "step": 57495 }, { "epoch": 8.564194221030682, "grad_norm": 1.58505380153656, "learning_rate": 3.5311533825489795e-05, "loss": 0.5924, "num_input_tokens_seen": 33381928, "step": 57500 }, { "epoch": 8.564938933571641, "grad_norm": 1.6259139776229858, "learning_rate": 3.530857359636262e-05, "loss": 0.3894, "num_input_tokens_seen": 33384840, "step": 57505 }, { "epoch": 8.5656836461126, "grad_norm": 0.9798988103866577, "learning_rate": 3.5305613193082575e-05, "loss": 0.668, "num_input_tokens_seen": 33387656, "step": 57510 }, { "epoch": 8.56642835865356, "grad_norm": 0.9324967265129089, "learning_rate": 3.530265261569967e-05, "loss": 0.6388, "num_input_tokens_seen": 33390856, "step": 57515 }, { "epoch": 8.567173071194519, "grad_norm": 1.0798066854476929, "learning_rate": 3.529969186426392e-05, "loss": 0.5855, "num_input_tokens_seen": 33393448, "step": 57520 }, { "epoch": 8.567917783735478, "grad_norm": 1.1789348125457764, "learning_rate": 3.529673093882534e-05, "loss": 0.4992, "num_input_tokens_seen": 33396392, "step": 57525 }, { "epoch": 8.568662496276437, "grad_norm": 1.8044661283493042, "learning_rate": 3.5293769839433956e-05, "loss": 0.7444, "num_input_tokens_seen": 33399304, "step": 57530 }, { "epoch": 8.569407208817397, "grad_norm": 0.7751557230949402, "learning_rate": 3.529080856613979e-05, "loss": 0.6005, "num_input_tokens_seen": 33402088, "step": 57535 }, { "epoch": 8.570151921358356, "grad_norm": 1.521706223487854, "learning_rate": 3.528784711899288e-05, "loss": 0.6002, "num_input_tokens_seen": 33404808, "step": 57540 }, { "epoch": 8.570896633899315, "grad_norm": 1.0115382671356201, "learning_rate": 3.5284885498043254e-05, "loss": 0.5851, "num_input_tokens_seen": 33407720, "step": 57545 }, { "epoch": 8.571641346440273, "grad_norm": 1.052617073059082, "learning_rate": 3.528192370334094e-05, "loss": 0.656, "num_input_tokens_seen": 33410792, "step": 57550 }, { "epoch": 8.572386058981234, "grad_norm": 1.3743135929107666, "learning_rate": 3.527896173493596e-05, "loss": 0.6387, "num_input_tokens_seen": 33413544, "step": 57555 }, { "epoch": 8.573130771522193, "grad_norm": 2.457598924636841, "learning_rate": 3.527599959287838e-05, "loss": 0.551, "num_input_tokens_seen": 33416520, "step": 57560 }, { "epoch": 8.573875484063151, "grad_norm": 2.1894869804382324, "learning_rate": 3.5273037277218224e-05, "loss": 0.789, "num_input_tokens_seen": 33419336, "step": 57565 }, { "epoch": 8.57462019660411, "grad_norm": 1.0710487365722656, "learning_rate": 3.527007478800555e-05, "loss": 0.6963, "num_input_tokens_seen": 33421896, "step": 57570 }, { "epoch": 8.57536490914507, "grad_norm": 1.7158012390136719, "learning_rate": 3.5267112125290396e-05, "loss": 0.6212, "num_input_tokens_seen": 33424840, "step": 57575 }, { "epoch": 8.57610962168603, "grad_norm": 1.724949598312378, "learning_rate": 3.5264149289122825e-05, "loss": 0.7246, "num_input_tokens_seen": 33427784, "step": 57580 }, { "epoch": 8.576854334226988, "grad_norm": 1.3221789598464966, "learning_rate": 3.526118627955288e-05, "loss": 0.5597, "num_input_tokens_seen": 33430536, "step": 57585 }, { "epoch": 8.577599046767947, "grad_norm": 1.0056071281433105, "learning_rate": 3.525822309663061e-05, "loss": 0.6981, "num_input_tokens_seen": 33433704, "step": 57590 }, { "epoch": 8.578343759308908, "grad_norm": 1.0249886512756348, "learning_rate": 3.5255259740406104e-05, "loss": 0.6809, "num_input_tokens_seen": 33436584, "step": 57595 }, { "epoch": 8.579088471849866, "grad_norm": 1.8641772270202637, "learning_rate": 3.52522962109294e-05, "loss": 0.6518, "num_input_tokens_seen": 33439432, "step": 57600 }, { "epoch": 8.579833184390825, "grad_norm": 0.9850128889083862, "learning_rate": 3.5249332508250576e-05, "loss": 0.5692, "num_input_tokens_seen": 33442504, "step": 57605 }, { "epoch": 8.580577896931784, "grad_norm": 1.1855589151382446, "learning_rate": 3.524636863241969e-05, "loss": 0.5718, "num_input_tokens_seen": 33445480, "step": 57610 }, { "epoch": 8.581322609472743, "grad_norm": 1.6544140577316284, "learning_rate": 3.5243404583486824e-05, "loss": 0.6795, "num_input_tokens_seen": 33448392, "step": 57615 }, { "epoch": 8.582067322013703, "grad_norm": 1.3119163513183594, "learning_rate": 3.5240440361502046e-05, "loss": 0.5538, "num_input_tokens_seen": 33451208, "step": 57620 }, { "epoch": 8.582812034554662, "grad_norm": 1.1882009506225586, "learning_rate": 3.523747596651544e-05, "loss": 0.4583, "num_input_tokens_seen": 33454088, "step": 57625 }, { "epoch": 8.58355674709562, "grad_norm": 2.5797626972198486, "learning_rate": 3.523451139857708e-05, "loss": 0.7383, "num_input_tokens_seen": 33456968, "step": 57630 }, { "epoch": 8.584301459636581, "grad_norm": 1.399208664894104, "learning_rate": 3.5231546657737044e-05, "loss": 0.5001, "num_input_tokens_seen": 33459912, "step": 57635 }, { "epoch": 8.58504617217754, "grad_norm": 1.3801963329315186, "learning_rate": 3.522858174404544e-05, "loss": 0.657, "num_input_tokens_seen": 33463048, "step": 57640 }, { "epoch": 8.585790884718499, "grad_norm": 1.5715657472610474, "learning_rate": 3.522561665755234e-05, "loss": 0.5209, "num_input_tokens_seen": 33465736, "step": 57645 }, { "epoch": 8.586535597259457, "grad_norm": 0.8379990458488464, "learning_rate": 3.5222651398307835e-05, "loss": 0.7266, "num_input_tokens_seen": 33468712, "step": 57650 }, { "epoch": 8.587280309800416, "grad_norm": 0.49814456701278687, "learning_rate": 3.5219685966362024e-05, "loss": 0.4833, "num_input_tokens_seen": 33471592, "step": 57655 }, { "epoch": 8.588025022341377, "grad_norm": 1.1740097999572754, "learning_rate": 3.521672036176501e-05, "loss": 0.5829, "num_input_tokens_seen": 33474760, "step": 57660 }, { "epoch": 8.588769734882336, "grad_norm": 1.2237074375152588, "learning_rate": 3.5213754584566886e-05, "loss": 0.6526, "num_input_tokens_seen": 33477864, "step": 57665 }, { "epoch": 8.589514447423294, "grad_norm": 0.8540336489677429, "learning_rate": 3.521078863481776e-05, "loss": 0.5892, "num_input_tokens_seen": 33481096, "step": 57670 }, { "epoch": 8.590259159964253, "grad_norm": 1.769115924835205, "learning_rate": 3.5207822512567736e-05, "loss": 0.6141, "num_input_tokens_seen": 33483624, "step": 57675 }, { "epoch": 8.591003872505214, "grad_norm": 0.6706845164299011, "learning_rate": 3.520485621786693e-05, "loss": 0.5516, "num_input_tokens_seen": 33486536, "step": 57680 }, { "epoch": 8.591748585046172, "grad_norm": 1.4122713804244995, "learning_rate": 3.5201889750765446e-05, "loss": 0.576, "num_input_tokens_seen": 33489256, "step": 57685 }, { "epoch": 8.592493297587131, "grad_norm": 1.8493843078613281, "learning_rate": 3.51989231113134e-05, "loss": 0.6559, "num_input_tokens_seen": 33492040, "step": 57690 }, { "epoch": 8.59323801012809, "grad_norm": 0.8607209920883179, "learning_rate": 3.519595629956092e-05, "loss": 0.5549, "num_input_tokens_seen": 33495080, "step": 57695 }, { "epoch": 8.59398272266905, "grad_norm": 1.1638902425765991, "learning_rate": 3.519298931555812e-05, "loss": 0.5463, "num_input_tokens_seen": 33498056, "step": 57700 }, { "epoch": 8.59472743521001, "grad_norm": 1.538914442062378, "learning_rate": 3.519002215935512e-05, "loss": 0.6273, "num_input_tokens_seen": 33500872, "step": 57705 }, { "epoch": 8.595472147750968, "grad_norm": 2.424034357070923, "learning_rate": 3.5187054831002064e-05, "loss": 0.6631, "num_input_tokens_seen": 33504008, "step": 57710 }, { "epoch": 8.596216860291927, "grad_norm": 1.0682553052902222, "learning_rate": 3.5184087330549056e-05, "loss": 0.5906, "num_input_tokens_seen": 33506760, "step": 57715 }, { "epoch": 8.596961572832887, "grad_norm": 1.2346254587173462, "learning_rate": 3.518111965804625e-05, "loss": 0.6209, "num_input_tokens_seen": 33509608, "step": 57720 }, { "epoch": 8.597706285373846, "grad_norm": 1.363232135772705, "learning_rate": 3.517815181354378e-05, "loss": 0.7468, "num_input_tokens_seen": 33512648, "step": 57725 }, { "epoch": 8.598450997914805, "grad_norm": 1.0257396697998047, "learning_rate": 3.517518379709177e-05, "loss": 0.6224, "num_input_tokens_seen": 33515528, "step": 57730 }, { "epoch": 8.599195710455763, "grad_norm": 1.7323070764541626, "learning_rate": 3.5172215608740376e-05, "loss": 0.4797, "num_input_tokens_seen": 33518632, "step": 57735 }, { "epoch": 8.599940422996724, "grad_norm": 1.4700242280960083, "learning_rate": 3.516924724853974e-05, "loss": 0.6337, "num_input_tokens_seen": 33521352, "step": 57740 }, { "epoch": 8.600685135537683, "grad_norm": 1.6229690313339233, "learning_rate": 3.5166278716540016e-05, "loss": 0.6466, "num_input_tokens_seen": 33524008, "step": 57745 }, { "epoch": 8.601429848078642, "grad_norm": 1.669950246810913, "learning_rate": 3.5163310012791326e-05, "loss": 0.5607, "num_input_tokens_seen": 33527048, "step": 57750 }, { "epoch": 8.6021745606196, "grad_norm": 1.417556881904602, "learning_rate": 3.516034113734385e-05, "loss": 0.688, "num_input_tokens_seen": 33530024, "step": 57755 }, { "epoch": 8.60291927316056, "grad_norm": 1.4248615503311157, "learning_rate": 3.515737209024774e-05, "loss": 0.506, "num_input_tokens_seen": 33532872, "step": 57760 }, { "epoch": 8.60366398570152, "grad_norm": 1.0530833005905151, "learning_rate": 3.515440287155315e-05, "loss": 0.4835, "num_input_tokens_seen": 33535752, "step": 57765 }, { "epoch": 8.604408698242478, "grad_norm": 1.6802434921264648, "learning_rate": 3.515143348131025e-05, "loss": 0.6388, "num_input_tokens_seen": 33538696, "step": 57770 }, { "epoch": 8.605153410783437, "grad_norm": 2.8268895149230957, "learning_rate": 3.514846391956919e-05, "loss": 0.6168, "num_input_tokens_seen": 33541448, "step": 57775 }, { "epoch": 8.605898123324398, "grad_norm": 2.08713960647583, "learning_rate": 3.514549418638015e-05, "loss": 0.6788, "num_input_tokens_seen": 33544648, "step": 57780 }, { "epoch": 8.606642835865356, "grad_norm": 1.6845622062683105, "learning_rate": 3.5142524281793296e-05, "loss": 0.5447, "num_input_tokens_seen": 33547560, "step": 57785 }, { "epoch": 8.607387548406315, "grad_norm": 1.3242449760437012, "learning_rate": 3.513955420585881e-05, "loss": 0.5145, "num_input_tokens_seen": 33550312, "step": 57790 }, { "epoch": 8.608132260947274, "grad_norm": 0.655806303024292, "learning_rate": 3.513658395862685e-05, "loss": 0.5929, "num_input_tokens_seen": 33553032, "step": 57795 }, { "epoch": 8.608876973488233, "grad_norm": 1.8754348754882812, "learning_rate": 3.5133613540147605e-05, "loss": 0.4912, "num_input_tokens_seen": 33555880, "step": 57800 }, { "epoch": 8.609621686029193, "grad_norm": 1.278377652168274, "learning_rate": 3.513064295047127e-05, "loss": 0.5408, "num_input_tokens_seen": 33558792, "step": 57805 }, { "epoch": 8.610366398570152, "grad_norm": 3.714792490005493, "learning_rate": 3.5127672189648016e-05, "loss": 0.7012, "num_input_tokens_seen": 33561864, "step": 57810 }, { "epoch": 8.61111111111111, "grad_norm": 1.2041836977005005, "learning_rate": 3.512470125772803e-05, "loss": 0.7597, "num_input_tokens_seen": 33564680, "step": 57815 }, { "epoch": 8.61185582365207, "grad_norm": 0.8499500751495361, "learning_rate": 3.5121730154761507e-05, "loss": 0.6858, "num_input_tokens_seen": 33567848, "step": 57820 }, { "epoch": 8.61260053619303, "grad_norm": 1.1949102878570557, "learning_rate": 3.511875888079864e-05, "loss": 0.4215, "num_input_tokens_seen": 33570696, "step": 57825 }, { "epoch": 8.613345248733989, "grad_norm": 1.3685191869735718, "learning_rate": 3.511578743588963e-05, "loss": 0.6524, "num_input_tokens_seen": 33573512, "step": 57830 }, { "epoch": 8.614089961274948, "grad_norm": 2.7730937004089355, "learning_rate": 3.511281582008466e-05, "loss": 0.6859, "num_input_tokens_seen": 33576296, "step": 57835 }, { "epoch": 8.614834673815906, "grad_norm": 1.16280996799469, "learning_rate": 3.5109844033433963e-05, "loss": 0.4989, "num_input_tokens_seen": 33579304, "step": 57840 }, { "epoch": 8.615579386356867, "grad_norm": 2.0063254833221436, "learning_rate": 3.5106872075987716e-05, "loss": 0.9111, "num_input_tokens_seen": 33582152, "step": 57845 }, { "epoch": 8.616324098897826, "grad_norm": 1.6760730743408203, "learning_rate": 3.5103899947796134e-05, "loss": 0.6438, "num_input_tokens_seen": 33584968, "step": 57850 }, { "epoch": 8.617068811438784, "grad_norm": 1.254428505897522, "learning_rate": 3.510092764890944e-05, "loss": 0.5941, "num_input_tokens_seen": 33587784, "step": 57855 }, { "epoch": 8.617813523979743, "grad_norm": 0.932914137840271, "learning_rate": 3.509795517937784e-05, "loss": 0.5512, "num_input_tokens_seen": 33590760, "step": 57860 }, { "epoch": 8.618558236520704, "grad_norm": 1.929567575454712, "learning_rate": 3.5094982539251545e-05, "loss": 0.7241, "num_input_tokens_seen": 33593640, "step": 57865 }, { "epoch": 8.619302949061662, "grad_norm": 1.1100316047668457, "learning_rate": 3.5092009728580784e-05, "loss": 0.543, "num_input_tokens_seen": 33596648, "step": 57870 }, { "epoch": 8.620047661602621, "grad_norm": 1.2423608303070068, "learning_rate": 3.5089036747415775e-05, "loss": 0.5971, "num_input_tokens_seen": 33599592, "step": 57875 }, { "epoch": 8.62079237414358, "grad_norm": 2.3328537940979004, "learning_rate": 3.508606359580674e-05, "loss": 0.6265, "num_input_tokens_seen": 33602472, "step": 57880 }, { "epoch": 8.62153708668454, "grad_norm": 2.09273099899292, "learning_rate": 3.508309027380392e-05, "loss": 0.4887, "num_input_tokens_seen": 33605480, "step": 57885 }, { "epoch": 8.6222817992255, "grad_norm": 1.1104995012283325, "learning_rate": 3.508011678145752e-05, "loss": 0.7239, "num_input_tokens_seen": 33608392, "step": 57890 }, { "epoch": 8.623026511766458, "grad_norm": 2.259404182434082, "learning_rate": 3.5077143118817805e-05, "loss": 0.8072, "num_input_tokens_seen": 33611336, "step": 57895 }, { "epoch": 8.623771224307417, "grad_norm": 1.5864673852920532, "learning_rate": 3.5074169285935e-05, "loss": 0.6437, "num_input_tokens_seen": 33614376, "step": 57900 }, { "epoch": 8.624515936848377, "grad_norm": 1.2930655479431152, "learning_rate": 3.5071195282859345e-05, "loss": 0.5221, "num_input_tokens_seen": 33617288, "step": 57905 }, { "epoch": 8.625260649389336, "grad_norm": 1.1086695194244385, "learning_rate": 3.506822110964108e-05, "loss": 0.6538, "num_input_tokens_seen": 33620232, "step": 57910 }, { "epoch": 8.626005361930295, "grad_norm": 1.654126763343811, "learning_rate": 3.506524676633045e-05, "loss": 0.6273, "num_input_tokens_seen": 33623304, "step": 57915 }, { "epoch": 8.626750074471254, "grad_norm": 1.4766885042190552, "learning_rate": 3.50622722529777e-05, "loss": 0.6907, "num_input_tokens_seen": 33626120, "step": 57920 }, { "epoch": 8.627494787012214, "grad_norm": 1.3691297769546509, "learning_rate": 3.5059297569633096e-05, "loss": 0.4937, "num_input_tokens_seen": 33628776, "step": 57925 }, { "epoch": 8.628239499553173, "grad_norm": 2.716435670852661, "learning_rate": 3.505632271634688e-05, "loss": 0.8475, "num_input_tokens_seen": 33631720, "step": 57930 }, { "epoch": 8.628984212094132, "grad_norm": 1.4385325908660889, "learning_rate": 3.505334769316931e-05, "loss": 0.4988, "num_input_tokens_seen": 33634696, "step": 57935 }, { "epoch": 8.62972892463509, "grad_norm": 1.306729793548584, "learning_rate": 3.505037250015066e-05, "loss": 0.5191, "num_input_tokens_seen": 33637480, "step": 57940 }, { "epoch": 8.63047363717605, "grad_norm": 1.450360655784607, "learning_rate": 3.504739713734118e-05, "loss": 0.6357, "num_input_tokens_seen": 33640968, "step": 57945 }, { "epoch": 8.63121834971701, "grad_norm": 1.3952562808990479, "learning_rate": 3.504442160479112e-05, "loss": 0.6397, "num_input_tokens_seen": 33644104, "step": 57950 }, { "epoch": 8.631963062257968, "grad_norm": 2.1492838859558105, "learning_rate": 3.5041445902550776e-05, "loss": 0.585, "num_input_tokens_seen": 33646856, "step": 57955 }, { "epoch": 8.632707774798927, "grad_norm": 1.1157481670379639, "learning_rate": 3.503847003067041e-05, "loss": 0.5157, "num_input_tokens_seen": 33649384, "step": 57960 }, { "epoch": 8.633452487339888, "grad_norm": 1.3477516174316406, "learning_rate": 3.503549398920029e-05, "loss": 0.6403, "num_input_tokens_seen": 33652328, "step": 57965 }, { "epoch": 8.634197199880846, "grad_norm": 0.592011034488678, "learning_rate": 3.503251777819071e-05, "loss": 0.6591, "num_input_tokens_seen": 33655208, "step": 57970 }, { "epoch": 8.634941912421805, "grad_norm": 1.5007706880569458, "learning_rate": 3.502954139769193e-05, "loss": 0.4797, "num_input_tokens_seen": 33657928, "step": 57975 }, { "epoch": 8.635686624962764, "grad_norm": 1.15709388256073, "learning_rate": 3.502656484775424e-05, "loss": 0.5019, "num_input_tokens_seen": 33660904, "step": 57980 }, { "epoch": 8.636431337503723, "grad_norm": 1.650414228439331, "learning_rate": 3.502358812842794e-05, "loss": 0.7831, "num_input_tokens_seen": 33663720, "step": 57985 }, { "epoch": 8.637176050044683, "grad_norm": 1.6793252229690552, "learning_rate": 3.502061123976329e-05, "loss": 0.6723, "num_input_tokens_seen": 33666440, "step": 57990 }, { "epoch": 8.637920762585642, "grad_norm": 1.4771746397018433, "learning_rate": 3.50176341818106e-05, "loss": 0.6309, "num_input_tokens_seen": 33669128, "step": 57995 }, { "epoch": 8.6386654751266, "grad_norm": 1.632407546043396, "learning_rate": 3.5014656954620174e-05, "loss": 0.5644, "num_input_tokens_seen": 33671816, "step": 58000 }, { "epoch": 8.63941018766756, "grad_norm": 1.4501237869262695, "learning_rate": 3.5011679558242286e-05, "loss": 0.6669, "num_input_tokens_seen": 33674696, "step": 58005 }, { "epoch": 8.64015490020852, "grad_norm": 3.8632664680480957, "learning_rate": 3.5008701992727254e-05, "loss": 0.7763, "num_input_tokens_seen": 33677640, "step": 58010 }, { "epoch": 8.640899612749479, "grad_norm": 2.758453607559204, "learning_rate": 3.500572425812537e-05, "loss": 0.7822, "num_input_tokens_seen": 33680648, "step": 58015 }, { "epoch": 8.641644325290438, "grad_norm": 1.3384876251220703, "learning_rate": 3.500274635448694e-05, "loss": 0.6903, "num_input_tokens_seen": 33683944, "step": 58020 }, { "epoch": 8.642389037831396, "grad_norm": 1.7240586280822754, "learning_rate": 3.499976828186229e-05, "loss": 0.6669, "num_input_tokens_seen": 33686760, "step": 58025 }, { "epoch": 8.643133750372357, "grad_norm": 0.8660336136817932, "learning_rate": 3.499679004030171e-05, "loss": 0.5058, "num_input_tokens_seen": 33689608, "step": 58030 }, { "epoch": 8.643878462913316, "grad_norm": 1.0485031604766846, "learning_rate": 3.499381162985552e-05, "loss": 0.5783, "num_input_tokens_seen": 33692904, "step": 58035 }, { "epoch": 8.644623175454274, "grad_norm": 1.4222098588943481, "learning_rate": 3.499083305057405e-05, "loss": 0.5163, "num_input_tokens_seen": 33695784, "step": 58040 }, { "epoch": 8.645367887995233, "grad_norm": 1.6919043064117432, "learning_rate": 3.49878543025076e-05, "loss": 0.7743, "num_input_tokens_seen": 33698760, "step": 58045 }, { "epoch": 8.646112600536194, "grad_norm": 1.085366129875183, "learning_rate": 3.49848753857065e-05, "loss": 0.5433, "num_input_tokens_seen": 33701768, "step": 58050 }, { "epoch": 8.646857313077152, "grad_norm": 2.462702512741089, "learning_rate": 3.4981896300221084e-05, "loss": 0.7293, "num_input_tokens_seen": 33704424, "step": 58055 }, { "epoch": 8.647602025618111, "grad_norm": 1.960023283958435, "learning_rate": 3.497891704610167e-05, "loss": 0.4506, "num_input_tokens_seen": 33707240, "step": 58060 }, { "epoch": 8.64834673815907, "grad_norm": 0.9373502731323242, "learning_rate": 3.49759376233986e-05, "loss": 0.5232, "num_input_tokens_seen": 33710088, "step": 58065 }, { "epoch": 8.64909145070003, "grad_norm": 1.1677616834640503, "learning_rate": 3.4972958032162204e-05, "loss": 0.5967, "num_input_tokens_seen": 33713256, "step": 58070 }, { "epoch": 8.64983616324099, "grad_norm": 1.311565637588501, "learning_rate": 3.496997827244282e-05, "loss": 0.6788, "num_input_tokens_seen": 33716008, "step": 58075 }, { "epoch": 8.650580875781948, "grad_norm": 1.6666102409362793, "learning_rate": 3.496699834429078e-05, "loss": 0.6574, "num_input_tokens_seen": 33719176, "step": 58080 }, { "epoch": 8.651325588322907, "grad_norm": 1.0021765232086182, "learning_rate": 3.4964018247756434e-05, "loss": 0.6616, "num_input_tokens_seen": 33721992, "step": 58085 }, { "epoch": 8.652070300863867, "grad_norm": 1.4647977352142334, "learning_rate": 3.4961037982890135e-05, "loss": 0.615, "num_input_tokens_seen": 33724648, "step": 58090 }, { "epoch": 8.652815013404826, "grad_norm": 1.3804303407669067, "learning_rate": 3.495805754974221e-05, "loss": 0.6899, "num_input_tokens_seen": 33727624, "step": 58095 }, { "epoch": 8.653559725945785, "grad_norm": 0.8062412738800049, "learning_rate": 3.495507694836304e-05, "loss": 0.6622, "num_input_tokens_seen": 33730248, "step": 58100 }, { "epoch": 8.654304438486744, "grad_norm": 1.573313593864441, "learning_rate": 3.4952096178802946e-05, "loss": 0.6373, "num_input_tokens_seen": 33733000, "step": 58105 }, { "epoch": 8.655049151027704, "grad_norm": 1.1716594696044922, "learning_rate": 3.4949115241112314e-05, "loss": 0.4634, "num_input_tokens_seen": 33735880, "step": 58110 }, { "epoch": 8.655793863568663, "grad_norm": 1.1967628002166748, "learning_rate": 3.4946134135341486e-05, "loss": 0.6703, "num_input_tokens_seen": 33738536, "step": 58115 }, { "epoch": 8.656538576109622, "grad_norm": 0.9990114569664001, "learning_rate": 3.494315286154083e-05, "loss": 0.6704, "num_input_tokens_seen": 33741256, "step": 58120 }, { "epoch": 8.65728328865058, "grad_norm": 1.1409295797348022, "learning_rate": 3.494017141976071e-05, "loss": 0.5636, "num_input_tokens_seen": 33744168, "step": 58125 }, { "epoch": 8.65802800119154, "grad_norm": 1.9106062650680542, "learning_rate": 3.4937189810051494e-05, "loss": 0.6848, "num_input_tokens_seen": 33746888, "step": 58130 }, { "epoch": 8.6587727137325, "grad_norm": 1.292336106300354, "learning_rate": 3.4934208032463565e-05, "loss": 0.7686, "num_input_tokens_seen": 33749832, "step": 58135 }, { "epoch": 8.659517426273458, "grad_norm": 1.332136631011963, "learning_rate": 3.4931226087047285e-05, "loss": 0.5712, "num_input_tokens_seen": 33752872, "step": 58140 }, { "epoch": 8.660262138814417, "grad_norm": 2.4458391666412354, "learning_rate": 3.4928243973853044e-05, "loss": 0.7473, "num_input_tokens_seen": 33755848, "step": 58145 }, { "epoch": 8.661006851355378, "grad_norm": 2.1168980598449707, "learning_rate": 3.49252616929312e-05, "loss": 0.8431, "num_input_tokens_seen": 33758824, "step": 58150 }, { "epoch": 8.661751563896336, "grad_norm": 0.6536105275154114, "learning_rate": 3.492227924433215e-05, "loss": 0.5572, "num_input_tokens_seen": 33761480, "step": 58155 }, { "epoch": 8.662496276437295, "grad_norm": 0.9902926683425903, "learning_rate": 3.491929662810627e-05, "loss": 0.461, "num_input_tokens_seen": 33764360, "step": 58160 }, { "epoch": 8.663240988978254, "grad_norm": 1.5069726705551147, "learning_rate": 3.491631384430396e-05, "loss": 0.6593, "num_input_tokens_seen": 33767336, "step": 58165 }, { "epoch": 8.663985701519213, "grad_norm": 0.913888156414032, "learning_rate": 3.4913330892975606e-05, "loss": 0.6584, "num_input_tokens_seen": 33770376, "step": 58170 }, { "epoch": 8.664730414060173, "grad_norm": 1.0882219076156616, "learning_rate": 3.4910347774171606e-05, "loss": 0.6094, "num_input_tokens_seen": 33773384, "step": 58175 }, { "epoch": 8.665475126601132, "grad_norm": 0.9408580660820007, "learning_rate": 3.490736448794235e-05, "loss": 0.4315, "num_input_tokens_seen": 33776296, "step": 58180 }, { "epoch": 8.66621983914209, "grad_norm": 0.8989934325218201, "learning_rate": 3.490438103433824e-05, "loss": 0.6258, "num_input_tokens_seen": 33779208, "step": 58185 }, { "epoch": 8.66696455168305, "grad_norm": 0.9321579933166504, "learning_rate": 3.490139741340967e-05, "loss": 0.5587, "num_input_tokens_seen": 33782280, "step": 58190 }, { "epoch": 8.66770926422401, "grad_norm": 1.6003831624984741, "learning_rate": 3.4898413625207067e-05, "loss": 0.7233, "num_input_tokens_seen": 33785192, "step": 58195 }, { "epoch": 8.668453976764969, "grad_norm": 1.2514972686767578, "learning_rate": 3.4895429669780824e-05, "loss": 0.463, "num_input_tokens_seen": 33788040, "step": 58200 }, { "epoch": 8.669198689305928, "grad_norm": 0.6745264530181885, "learning_rate": 3.4892445547181354e-05, "loss": 0.6768, "num_input_tokens_seen": 33791080, "step": 58205 }, { "epoch": 8.669943401846886, "grad_norm": 1.0433887243270874, "learning_rate": 3.4889461257459065e-05, "loss": 0.6278, "num_input_tokens_seen": 33793896, "step": 58210 }, { "epoch": 8.670688114387847, "grad_norm": 1.9895212650299072, "learning_rate": 3.488647680066438e-05, "loss": 0.6481, "num_input_tokens_seen": 33796808, "step": 58215 }, { "epoch": 8.671432826928806, "grad_norm": 1.0098116397857666, "learning_rate": 3.4883492176847724e-05, "loss": 0.6197, "num_input_tokens_seen": 33799912, "step": 58220 }, { "epoch": 8.672177539469764, "grad_norm": 1.3298604488372803, "learning_rate": 3.488050738605951e-05, "loss": 0.5075, "num_input_tokens_seen": 33803048, "step": 58225 }, { "epoch": 8.672922252010723, "grad_norm": 1.4077911376953125, "learning_rate": 3.4877522428350165e-05, "loss": 0.5654, "num_input_tokens_seen": 33806056, "step": 58230 }, { "epoch": 8.673666964551684, "grad_norm": 1.0700764656066895, "learning_rate": 3.487453730377011e-05, "loss": 0.4986, "num_input_tokens_seen": 33809192, "step": 58235 }, { "epoch": 8.674411677092642, "grad_norm": 1.013858437538147, "learning_rate": 3.4871552012369793e-05, "loss": 0.4586, "num_input_tokens_seen": 33811848, "step": 58240 }, { "epoch": 8.675156389633601, "grad_norm": 1.0615808963775635, "learning_rate": 3.486856655419964e-05, "loss": 0.538, "num_input_tokens_seen": 33814920, "step": 58245 }, { "epoch": 8.67590110217456, "grad_norm": 1.0248730182647705, "learning_rate": 3.4865580929310074e-05, "loss": 0.486, "num_input_tokens_seen": 33817704, "step": 58250 }, { "epoch": 8.67664581471552, "grad_norm": 1.601428508758545, "learning_rate": 3.486259513775155e-05, "loss": 0.7244, "num_input_tokens_seen": 33820488, "step": 58255 }, { "epoch": 8.67739052725648, "grad_norm": 1.1789331436157227, "learning_rate": 3.485960917957451e-05, "loss": 0.5909, "num_input_tokens_seen": 33823400, "step": 58260 }, { "epoch": 8.678135239797438, "grad_norm": 1.2602168321609497, "learning_rate": 3.4856623054829395e-05, "loss": 0.627, "num_input_tokens_seen": 33826216, "step": 58265 }, { "epoch": 8.678879952338397, "grad_norm": 1.0237478017807007, "learning_rate": 3.4853636763566646e-05, "loss": 0.5981, "num_input_tokens_seen": 33829064, "step": 58270 }, { "epoch": 8.679624664879357, "grad_norm": 0.9644801020622253, "learning_rate": 3.485065030583672e-05, "loss": 0.5156, "num_input_tokens_seen": 33832104, "step": 58275 }, { "epoch": 8.680369377420316, "grad_norm": 1.0556882619857788, "learning_rate": 3.484766368169007e-05, "loss": 0.6044, "num_input_tokens_seen": 33835016, "step": 58280 }, { "epoch": 8.681114089961275, "grad_norm": 1.1002912521362305, "learning_rate": 3.484467689117715e-05, "loss": 0.5048, "num_input_tokens_seen": 33838248, "step": 58285 }, { "epoch": 8.681858802502234, "grad_norm": 1.2225255966186523, "learning_rate": 3.4841689934348416e-05, "loss": 0.5266, "num_input_tokens_seen": 33841160, "step": 58290 }, { "epoch": 8.682603515043194, "grad_norm": 1.3847628831863403, "learning_rate": 3.483870281125433e-05, "loss": 0.6275, "num_input_tokens_seen": 33844008, "step": 58295 }, { "epoch": 8.683348227584153, "grad_norm": 1.371181607246399, "learning_rate": 3.483571552194537e-05, "loss": 0.7061, "num_input_tokens_seen": 33846696, "step": 58300 }, { "epoch": 8.684092940125112, "grad_norm": 1.3657504320144653, "learning_rate": 3.4832728066471994e-05, "loss": 0.5733, "num_input_tokens_seen": 33849736, "step": 58305 }, { "epoch": 8.68483765266607, "grad_norm": 1.48322594165802, "learning_rate": 3.482974044488466e-05, "loss": 0.5755, "num_input_tokens_seen": 33852552, "step": 58310 }, { "epoch": 8.68558236520703, "grad_norm": 1.8802964687347412, "learning_rate": 3.4826752657233855e-05, "loss": 0.4673, "num_input_tokens_seen": 33855400, "step": 58315 }, { "epoch": 8.68632707774799, "grad_norm": 1.3753011226654053, "learning_rate": 3.4823764703570054e-05, "loss": 0.5244, "num_input_tokens_seen": 33858248, "step": 58320 }, { "epoch": 8.687071790288948, "grad_norm": 0.8947886228561401, "learning_rate": 3.482077658394373e-05, "loss": 0.6008, "num_input_tokens_seen": 33860904, "step": 58325 }, { "epoch": 8.687816502829907, "grad_norm": 0.9657694101333618, "learning_rate": 3.481778829840537e-05, "loss": 0.6026, "num_input_tokens_seen": 33863816, "step": 58330 }, { "epoch": 8.688561215370868, "grad_norm": 0.7588838934898376, "learning_rate": 3.481479984700546e-05, "loss": 0.497, "num_input_tokens_seen": 33867048, "step": 58335 }, { "epoch": 8.689305927911827, "grad_norm": 0.8444802761077881, "learning_rate": 3.481181122979447e-05, "loss": 0.4378, "num_input_tokens_seen": 33870088, "step": 58340 }, { "epoch": 8.690050640452785, "grad_norm": 1.5408293008804321, "learning_rate": 3.480882244682291e-05, "loss": 0.6414, "num_input_tokens_seen": 33873576, "step": 58345 }, { "epoch": 8.690795352993744, "grad_norm": 1.2876332998275757, "learning_rate": 3.480583349814126e-05, "loss": 0.8014, "num_input_tokens_seen": 33876520, "step": 58350 }, { "epoch": 8.691540065534703, "grad_norm": 1.3976949453353882, "learning_rate": 3.480284438380002e-05, "loss": 0.5865, "num_input_tokens_seen": 33879368, "step": 58355 }, { "epoch": 8.692284778075663, "grad_norm": 2.4456522464752197, "learning_rate": 3.479985510384969e-05, "loss": 0.7416, "num_input_tokens_seen": 33882536, "step": 58360 }, { "epoch": 8.693029490616622, "grad_norm": 1.0571917295455933, "learning_rate": 3.479686565834077e-05, "loss": 0.4589, "num_input_tokens_seen": 33885576, "step": 58365 }, { "epoch": 8.69377420315758, "grad_norm": 2.713843584060669, "learning_rate": 3.479387604732376e-05, "loss": 0.7293, "num_input_tokens_seen": 33888456, "step": 58370 }, { "epoch": 8.69451891569854, "grad_norm": 1.263419270515442, "learning_rate": 3.479088627084916e-05, "loss": 0.5138, "num_input_tokens_seen": 33891304, "step": 58375 }, { "epoch": 8.6952636282395, "grad_norm": 1.446589469909668, "learning_rate": 3.4787896328967493e-05, "loss": 0.5905, "num_input_tokens_seen": 33894024, "step": 58380 }, { "epoch": 8.696008340780459, "grad_norm": 1.7440613508224487, "learning_rate": 3.478490622172926e-05, "loss": 0.5963, "num_input_tokens_seen": 33896744, "step": 58385 }, { "epoch": 8.696753053321418, "grad_norm": 1.4783351421356201, "learning_rate": 3.478191594918499e-05, "loss": 0.6957, "num_input_tokens_seen": 33899880, "step": 58390 }, { "epoch": 8.697497765862376, "grad_norm": 0.899151086807251, "learning_rate": 3.477892551138519e-05, "loss": 0.5249, "num_input_tokens_seen": 33903080, "step": 58395 }, { "epoch": 8.698242478403337, "grad_norm": 1.2517472505569458, "learning_rate": 3.4775934908380386e-05, "loss": 0.5289, "num_input_tokens_seen": 33906024, "step": 58400 }, { "epoch": 8.698987190944296, "grad_norm": 1.4943071603775024, "learning_rate": 3.4772944140221094e-05, "loss": 0.5698, "num_input_tokens_seen": 33908904, "step": 58405 }, { "epoch": 8.699731903485254, "grad_norm": 1.7046250104904175, "learning_rate": 3.476995320695784e-05, "loss": 0.6309, "num_input_tokens_seen": 33911944, "step": 58410 }, { "epoch": 8.700476616026213, "grad_norm": 1.543886423110962, "learning_rate": 3.476696210864116e-05, "loss": 0.7554, "num_input_tokens_seen": 33914824, "step": 58415 }, { "epoch": 8.701221328567174, "grad_norm": 1.6332577466964722, "learning_rate": 3.476397084532158e-05, "loss": 0.8063, "num_input_tokens_seen": 33917544, "step": 58420 }, { "epoch": 8.701966041108133, "grad_norm": 0.9399327039718628, "learning_rate": 3.476097941704964e-05, "loss": 0.4989, "num_input_tokens_seen": 33920168, "step": 58425 }, { "epoch": 8.702710753649091, "grad_norm": 1.2037206888198853, "learning_rate": 3.475798782387587e-05, "loss": 0.6405, "num_input_tokens_seen": 33923176, "step": 58430 }, { "epoch": 8.70345546619005, "grad_norm": 1.1648942232131958, "learning_rate": 3.475499606585081e-05, "loss": 0.5315, "num_input_tokens_seen": 33926120, "step": 58435 }, { "epoch": 8.70420017873101, "grad_norm": 1.2531567811965942, "learning_rate": 3.4752004143025016e-05, "loss": 0.7259, "num_input_tokens_seen": 33928744, "step": 58440 }, { "epoch": 8.70494489127197, "grad_norm": 1.0158789157867432, "learning_rate": 3.4749012055449015e-05, "loss": 0.488, "num_input_tokens_seen": 33931464, "step": 58445 }, { "epoch": 8.705689603812928, "grad_norm": 2.4295969009399414, "learning_rate": 3.4746019803173365e-05, "loss": 0.7544, "num_input_tokens_seen": 33934024, "step": 58450 }, { "epoch": 8.706434316353887, "grad_norm": 1.2621526718139648, "learning_rate": 3.474302738624862e-05, "loss": 0.6263, "num_input_tokens_seen": 33936840, "step": 58455 }, { "epoch": 8.707179028894847, "grad_norm": 1.3453054428100586, "learning_rate": 3.474003480472532e-05, "loss": 0.6016, "num_input_tokens_seen": 33939688, "step": 58460 }, { "epoch": 8.707923741435806, "grad_norm": 1.356523871421814, "learning_rate": 3.473704205865405e-05, "loss": 0.8837, "num_input_tokens_seen": 33942408, "step": 58465 }, { "epoch": 8.708668453976765, "grad_norm": 1.4910820722579956, "learning_rate": 3.473404914808534e-05, "loss": 0.6192, "num_input_tokens_seen": 33945608, "step": 58470 }, { "epoch": 8.709413166517724, "grad_norm": 1.1531109809875488, "learning_rate": 3.4731056073069754e-05, "loss": 0.6654, "num_input_tokens_seen": 33948584, "step": 58475 }, { "epoch": 8.710157879058684, "grad_norm": 1.0447545051574707, "learning_rate": 3.472806283365788e-05, "loss": 0.5024, "num_input_tokens_seen": 33951368, "step": 58480 }, { "epoch": 8.710902591599643, "grad_norm": 2.2537548542022705, "learning_rate": 3.472506942990026e-05, "loss": 0.6103, "num_input_tokens_seen": 33954344, "step": 58485 }, { "epoch": 8.711647304140602, "grad_norm": 1.5203354358673096, "learning_rate": 3.472207586184748e-05, "loss": 0.4855, "num_input_tokens_seen": 33957192, "step": 58490 }, { "epoch": 8.71239201668156, "grad_norm": 1.219425916671753, "learning_rate": 3.4719082129550106e-05, "loss": 0.7238, "num_input_tokens_seen": 33959912, "step": 58495 }, { "epoch": 8.71313672922252, "grad_norm": 1.2175812721252441, "learning_rate": 3.471608823305873e-05, "loss": 0.7596, "num_input_tokens_seen": 33963016, "step": 58500 }, { "epoch": 8.71388144176348, "grad_norm": 1.2824954986572266, "learning_rate": 3.471309417242391e-05, "loss": 0.5322, "num_input_tokens_seen": 33966184, "step": 58505 }, { "epoch": 8.714626154304439, "grad_norm": 1.080430030822754, "learning_rate": 3.471009994769624e-05, "loss": 0.42, "num_input_tokens_seen": 33969768, "step": 58510 }, { "epoch": 8.715370866845397, "grad_norm": 1.4994103908538818, "learning_rate": 3.470710555892629e-05, "loss": 0.52, "num_input_tokens_seen": 33972520, "step": 58515 }, { "epoch": 8.716115579386356, "grad_norm": 1.3470878601074219, "learning_rate": 3.470411100616466e-05, "loss": 0.6689, "num_input_tokens_seen": 33975592, "step": 58520 }, { "epoch": 8.716860291927317, "grad_norm": 1.1286381483078003, "learning_rate": 3.4701116289461945e-05, "loss": 0.5688, "num_input_tokens_seen": 33978600, "step": 58525 }, { "epoch": 8.717605004468275, "grad_norm": 2.3407721519470215, "learning_rate": 3.469812140886872e-05, "loss": 0.7039, "num_input_tokens_seen": 33981640, "step": 58530 }, { "epoch": 8.718349717009234, "grad_norm": 0.7501698136329651, "learning_rate": 3.4695126364435604e-05, "loss": 0.4715, "num_input_tokens_seen": 33984648, "step": 58535 }, { "epoch": 8.719094429550193, "grad_norm": 0.9251700639724731, "learning_rate": 3.4692131156213175e-05, "loss": 0.6072, "num_input_tokens_seen": 33987336, "step": 58540 }, { "epoch": 8.719839142091153, "grad_norm": 1.3007733821868896, "learning_rate": 3.468913578425203e-05, "loss": 0.5356, "num_input_tokens_seen": 33990248, "step": 58545 }, { "epoch": 8.720583854632112, "grad_norm": 3.5630292892456055, "learning_rate": 3.4686140248602804e-05, "loss": 0.4804, "num_input_tokens_seen": 33993320, "step": 58550 }, { "epoch": 8.721328567173071, "grad_norm": 1.9868799448013306, "learning_rate": 3.468314454931607e-05, "loss": 0.6445, "num_input_tokens_seen": 33996232, "step": 58555 }, { "epoch": 8.72207327971403, "grad_norm": 1.254656195640564, "learning_rate": 3.468014868644245e-05, "loss": 0.6585, "num_input_tokens_seen": 33999016, "step": 58560 }, { "epoch": 8.72281799225499, "grad_norm": 1.217321753501892, "learning_rate": 3.4677152660032565e-05, "loss": 0.766, "num_input_tokens_seen": 34002056, "step": 58565 }, { "epoch": 8.723562704795949, "grad_norm": 1.3127834796905518, "learning_rate": 3.467415647013702e-05, "loss": 0.7226, "num_input_tokens_seen": 34005256, "step": 58570 }, { "epoch": 8.724307417336908, "grad_norm": 0.9186568856239319, "learning_rate": 3.467116011680643e-05, "loss": 0.5627, "num_input_tokens_seen": 34007656, "step": 58575 }, { "epoch": 8.725052129877866, "grad_norm": 1.2683054208755493, "learning_rate": 3.4668163600091415e-05, "loss": 0.5531, "num_input_tokens_seen": 34010600, "step": 58580 }, { "epoch": 8.725796842418827, "grad_norm": 1.4150220155715942, "learning_rate": 3.46651669200426e-05, "loss": 0.5966, "num_input_tokens_seen": 34013640, "step": 58585 }, { "epoch": 8.726541554959786, "grad_norm": 2.0965628623962402, "learning_rate": 3.4662170076710624e-05, "loss": 0.6277, "num_input_tokens_seen": 34016552, "step": 58590 }, { "epoch": 8.727286267500745, "grad_norm": 1.493184208869934, "learning_rate": 3.46591730701461e-05, "loss": 0.5678, "num_input_tokens_seen": 34019720, "step": 58595 }, { "epoch": 8.728030980041703, "grad_norm": 1.1105409860610962, "learning_rate": 3.465617590039967e-05, "loss": 0.5304, "num_input_tokens_seen": 34022376, "step": 58600 }, { "epoch": 8.728775692582664, "grad_norm": 1.5601210594177246, "learning_rate": 3.4653178567521956e-05, "loss": 0.668, "num_input_tokens_seen": 34025320, "step": 58605 }, { "epoch": 8.729520405123623, "grad_norm": 1.060483455657959, "learning_rate": 3.4650181071563595e-05, "loss": 0.6063, "num_input_tokens_seen": 34028008, "step": 58610 }, { "epoch": 8.730265117664581, "grad_norm": 2.4892754554748535, "learning_rate": 3.4647183412575243e-05, "loss": 0.5667, "num_input_tokens_seen": 34030792, "step": 58615 }, { "epoch": 8.73100983020554, "grad_norm": 1.5112581253051758, "learning_rate": 3.464418559060753e-05, "loss": 0.7328, "num_input_tokens_seen": 34033800, "step": 58620 }, { "epoch": 8.7317545427465, "grad_norm": 1.5058387517929077, "learning_rate": 3.464118760571109e-05, "loss": 0.6087, "num_input_tokens_seen": 34036520, "step": 58625 }, { "epoch": 8.73249925528746, "grad_norm": 0.9588736295700073, "learning_rate": 3.463818945793661e-05, "loss": 0.6416, "num_input_tokens_seen": 34039240, "step": 58630 }, { "epoch": 8.733243967828418, "grad_norm": 1.1083918809890747, "learning_rate": 3.46351911473347e-05, "loss": 0.4947, "num_input_tokens_seen": 34041864, "step": 58635 }, { "epoch": 8.733988680369377, "grad_norm": 1.9856020212173462, "learning_rate": 3.463219267395603e-05, "loss": 0.592, "num_input_tokens_seen": 34044648, "step": 58640 }, { "epoch": 8.734733392910336, "grad_norm": 1.1258718967437744, "learning_rate": 3.4629194037851254e-05, "loss": 0.5182, "num_input_tokens_seen": 34047560, "step": 58645 }, { "epoch": 8.735478105451296, "grad_norm": 1.7448062896728516, "learning_rate": 3.462619523907103e-05, "loss": 0.5346, "num_input_tokens_seen": 34050152, "step": 58650 }, { "epoch": 8.736222817992255, "grad_norm": 1.4928077459335327, "learning_rate": 3.462319627766602e-05, "loss": 0.5319, "num_input_tokens_seen": 34053096, "step": 58655 }, { "epoch": 8.736967530533214, "grad_norm": 1.2077982425689697, "learning_rate": 3.462019715368689e-05, "loss": 0.4497, "num_input_tokens_seen": 34056040, "step": 58660 }, { "epoch": 8.737712243074174, "grad_norm": 1.5771502256393433, "learning_rate": 3.461719786718431e-05, "loss": 0.6285, "num_input_tokens_seen": 34058984, "step": 58665 }, { "epoch": 8.738456955615133, "grad_norm": 0.888424813747406, "learning_rate": 3.461419841820895e-05, "loss": 0.6324, "num_input_tokens_seen": 34061960, "step": 58670 }, { "epoch": 8.739201668156092, "grad_norm": 2.8834645748138428, "learning_rate": 3.461119880681147e-05, "loss": 0.6359, "num_input_tokens_seen": 34064872, "step": 58675 }, { "epoch": 8.73994638069705, "grad_norm": 1.151349425315857, "learning_rate": 3.460819903304256e-05, "loss": 0.6049, "num_input_tokens_seen": 34067496, "step": 58680 }, { "epoch": 8.74069109323801, "grad_norm": 1.1094574928283691, "learning_rate": 3.460519909695289e-05, "loss": 0.6202, "num_input_tokens_seen": 34070504, "step": 58685 }, { "epoch": 8.74143580577897, "grad_norm": 1.7916945219039917, "learning_rate": 3.460219899859314e-05, "loss": 0.6066, "num_input_tokens_seen": 34073576, "step": 58690 }, { "epoch": 8.742180518319929, "grad_norm": 1.7125458717346191, "learning_rate": 3.459919873801401e-05, "loss": 0.5838, "num_input_tokens_seen": 34076328, "step": 58695 }, { "epoch": 8.742925230860887, "grad_norm": 1.5179988145828247, "learning_rate": 3.4596198315266165e-05, "loss": 0.6573, "num_input_tokens_seen": 34078952, "step": 58700 }, { "epoch": 8.743669943401846, "grad_norm": 2.269496440887451, "learning_rate": 3.45931977304003e-05, "loss": 0.6669, "num_input_tokens_seen": 34081800, "step": 58705 }, { "epoch": 8.744414655942807, "grad_norm": 1.6975351572036743, "learning_rate": 3.4590196983467114e-05, "loss": 0.7612, "num_input_tokens_seen": 34085128, "step": 58710 }, { "epoch": 8.745159368483765, "grad_norm": 1.1139417886734009, "learning_rate": 3.45871960745173e-05, "loss": 0.5683, "num_input_tokens_seen": 34087816, "step": 58715 }, { "epoch": 8.745904081024724, "grad_norm": 1.5600872039794922, "learning_rate": 3.458419500360154e-05, "loss": 0.5348, "num_input_tokens_seen": 34090920, "step": 58720 }, { "epoch": 8.746648793565683, "grad_norm": 0.8253670334815979, "learning_rate": 3.458119377077056e-05, "loss": 0.5702, "num_input_tokens_seen": 34093608, "step": 58725 }, { "epoch": 8.747393506106643, "grad_norm": 1.6027870178222656, "learning_rate": 3.4578192376075044e-05, "loss": 0.5397, "num_input_tokens_seen": 34096552, "step": 58730 }, { "epoch": 8.748138218647602, "grad_norm": 0.9255651831626892, "learning_rate": 3.45751908195657e-05, "loss": 0.491, "num_input_tokens_seen": 34099336, "step": 58735 }, { "epoch": 8.748882931188561, "grad_norm": 0.8816794753074646, "learning_rate": 3.457218910129324e-05, "loss": 0.5777, "num_input_tokens_seen": 34102280, "step": 58740 }, { "epoch": 8.74962764372952, "grad_norm": 1.02422297000885, "learning_rate": 3.4569187221308376e-05, "loss": 0.3529, "num_input_tokens_seen": 34105192, "step": 58745 }, { "epoch": 8.75037235627048, "grad_norm": 0.7425504922866821, "learning_rate": 3.456618517966183e-05, "loss": 0.6427, "num_input_tokens_seen": 34108040, "step": 58750 }, { "epoch": 8.751117068811439, "grad_norm": 0.7990990877151489, "learning_rate": 3.4563182976404286e-05, "loss": 0.7333, "num_input_tokens_seen": 34111112, "step": 58755 }, { "epoch": 8.751861781352398, "grad_norm": 1.3129057884216309, "learning_rate": 3.456018061158649e-05, "loss": 0.6877, "num_input_tokens_seen": 34114408, "step": 58760 }, { "epoch": 8.752606493893357, "grad_norm": 0.9155202507972717, "learning_rate": 3.455717808525917e-05, "loss": 0.7666, "num_input_tokens_seen": 34117160, "step": 58765 }, { "epoch": 8.753351206434317, "grad_norm": 1.681187391281128, "learning_rate": 3.4554175397473036e-05, "loss": 0.5553, "num_input_tokens_seen": 34120072, "step": 58770 }, { "epoch": 8.754095918975276, "grad_norm": 3.161541700363159, "learning_rate": 3.455117254827882e-05, "loss": 0.4811, "num_input_tokens_seen": 34123112, "step": 58775 }, { "epoch": 8.754840631516235, "grad_norm": 1.7596821784973145, "learning_rate": 3.454816953772724e-05, "loss": 0.6774, "num_input_tokens_seen": 34125864, "step": 58780 }, { "epoch": 8.755585344057193, "grad_norm": 1.6790727376937866, "learning_rate": 3.4545166365869054e-05, "loss": 0.6886, "num_input_tokens_seen": 34129032, "step": 58785 }, { "epoch": 8.756330056598154, "grad_norm": 2.8978984355926514, "learning_rate": 3.454216303275498e-05, "loss": 0.6312, "num_input_tokens_seen": 34131848, "step": 58790 }, { "epoch": 8.757074769139113, "grad_norm": 0.911206066608429, "learning_rate": 3.4539159538435755e-05, "loss": 0.4596, "num_input_tokens_seen": 34134920, "step": 58795 }, { "epoch": 8.757819481680071, "grad_norm": 0.9608357548713684, "learning_rate": 3.453615588296213e-05, "loss": 0.5839, "num_input_tokens_seen": 34137672, "step": 58800 }, { "epoch": 8.75856419422103, "grad_norm": 1.7417304515838623, "learning_rate": 3.4533152066384844e-05, "loss": 0.733, "num_input_tokens_seen": 34141160, "step": 58805 }, { "epoch": 8.75930890676199, "grad_norm": 2.5326900482177734, "learning_rate": 3.453014808875464e-05, "loss": 0.6759, "num_input_tokens_seen": 34144232, "step": 58810 }, { "epoch": 8.76005361930295, "grad_norm": 1.1754698753356934, "learning_rate": 3.4527143950122266e-05, "loss": 0.6935, "num_input_tokens_seen": 34147304, "step": 58815 }, { "epoch": 8.760798331843908, "grad_norm": 1.2706716060638428, "learning_rate": 3.4524139650538485e-05, "loss": 0.571, "num_input_tokens_seen": 34149736, "step": 58820 }, { "epoch": 8.761543044384867, "grad_norm": 1.161080002784729, "learning_rate": 3.452113519005404e-05, "loss": 0.7023, "num_input_tokens_seen": 34152552, "step": 58825 }, { "epoch": 8.762287756925826, "grad_norm": 0.7481175661087036, "learning_rate": 3.45181305687197e-05, "loss": 0.5403, "num_input_tokens_seen": 34155400, "step": 58830 }, { "epoch": 8.763032469466786, "grad_norm": 1.259317398071289, "learning_rate": 3.451512578658621e-05, "loss": 0.7905, "num_input_tokens_seen": 34158568, "step": 58835 }, { "epoch": 8.763777182007745, "grad_norm": 1.8195778131484985, "learning_rate": 3.4512120843704344e-05, "loss": 0.7228, "num_input_tokens_seen": 34161352, "step": 58840 }, { "epoch": 8.764521894548704, "grad_norm": 1.368882417678833, "learning_rate": 3.4509115740124866e-05, "loss": 0.7415, "num_input_tokens_seen": 34164328, "step": 58845 }, { "epoch": 8.765266607089664, "grad_norm": 2.8181815147399902, "learning_rate": 3.4506110475898535e-05, "loss": 0.6819, "num_input_tokens_seen": 34167176, "step": 58850 }, { "epoch": 8.766011319630623, "grad_norm": 1.1623729467391968, "learning_rate": 3.4503105051076126e-05, "loss": 0.5817, "num_input_tokens_seen": 34170088, "step": 58855 }, { "epoch": 8.766756032171582, "grad_norm": 1.3795191049575806, "learning_rate": 3.450009946570843e-05, "loss": 0.5533, "num_input_tokens_seen": 34172808, "step": 58860 }, { "epoch": 8.76750074471254, "grad_norm": 0.6693885326385498, "learning_rate": 3.44970937198462e-05, "loss": 0.4885, "num_input_tokens_seen": 34175624, "step": 58865 }, { "epoch": 8.7682454572535, "grad_norm": 1.241252064704895, "learning_rate": 3.449408781354023e-05, "loss": 0.6331, "num_input_tokens_seen": 34178504, "step": 58870 }, { "epoch": 8.76899016979446, "grad_norm": 2.160475492477417, "learning_rate": 3.449108174684129e-05, "loss": 0.5564, "num_input_tokens_seen": 34181224, "step": 58875 }, { "epoch": 8.769734882335419, "grad_norm": 1.7559643983840942, "learning_rate": 3.448807551980017e-05, "loss": 0.6009, "num_input_tokens_seen": 34184072, "step": 58880 }, { "epoch": 8.770479594876377, "grad_norm": 1.8580408096313477, "learning_rate": 3.448506913246766e-05, "loss": 0.6567, "num_input_tokens_seen": 34186600, "step": 58885 }, { "epoch": 8.771224307417336, "grad_norm": 1.2910969257354736, "learning_rate": 3.448206258489455e-05, "loss": 0.7228, "num_input_tokens_seen": 34189160, "step": 58890 }, { "epoch": 8.771969019958297, "grad_norm": 1.3434735536575317, "learning_rate": 3.4479055877131616e-05, "loss": 0.6237, "num_input_tokens_seen": 34192072, "step": 58895 }, { "epoch": 8.772713732499255, "grad_norm": 1.362468957901001, "learning_rate": 3.4476049009229685e-05, "loss": 0.4995, "num_input_tokens_seen": 34194920, "step": 58900 }, { "epoch": 8.773458445040214, "grad_norm": 0.9945740699768066, "learning_rate": 3.447304198123953e-05, "loss": 0.5577, "num_input_tokens_seen": 34197672, "step": 58905 }, { "epoch": 8.774203157581173, "grad_norm": 1.6124540567398071, "learning_rate": 3.447003479321196e-05, "loss": 0.8319, "num_input_tokens_seen": 34200648, "step": 58910 }, { "epoch": 8.774947870122134, "grad_norm": 1.2285560369491577, "learning_rate": 3.4467027445197774e-05, "loss": 0.477, "num_input_tokens_seen": 34203848, "step": 58915 }, { "epoch": 8.775692582663092, "grad_norm": 1.4780187606811523, "learning_rate": 3.446401993724778e-05, "loss": 0.6235, "num_input_tokens_seen": 34206888, "step": 58920 }, { "epoch": 8.776437295204051, "grad_norm": 1.1600905656814575, "learning_rate": 3.446101226941279e-05, "loss": 0.6681, "num_input_tokens_seen": 34209768, "step": 58925 }, { "epoch": 8.77718200774501, "grad_norm": 1.9076906442642212, "learning_rate": 3.4458004441743605e-05, "loss": 0.4153, "num_input_tokens_seen": 34212648, "step": 58930 }, { "epoch": 8.77792672028597, "grad_norm": 1.9204089641571045, "learning_rate": 3.445499645429107e-05, "loss": 0.5389, "num_input_tokens_seen": 34215560, "step": 58935 }, { "epoch": 8.778671432826929, "grad_norm": 0.9340267181396484, "learning_rate": 3.445198830710596e-05, "loss": 0.543, "num_input_tokens_seen": 34218568, "step": 58940 }, { "epoch": 8.779416145367888, "grad_norm": 2.562307596206665, "learning_rate": 3.4448980000239114e-05, "loss": 0.4989, "num_input_tokens_seen": 34221288, "step": 58945 }, { "epoch": 8.780160857908847, "grad_norm": 1.3963242769241333, "learning_rate": 3.444597153374136e-05, "loss": 0.4791, "num_input_tokens_seen": 34224072, "step": 58950 }, { "epoch": 8.780905570449807, "grad_norm": 1.7307170629501343, "learning_rate": 3.444296290766352e-05, "loss": 0.5307, "num_input_tokens_seen": 34227240, "step": 58955 }, { "epoch": 8.781650282990766, "grad_norm": 0.9966575503349304, "learning_rate": 3.443995412205642e-05, "loss": 0.5997, "num_input_tokens_seen": 34230344, "step": 58960 }, { "epoch": 8.782394995531725, "grad_norm": 1.0958974361419678, "learning_rate": 3.443694517697089e-05, "loss": 0.6025, "num_input_tokens_seen": 34233544, "step": 58965 }, { "epoch": 8.783139708072683, "grad_norm": 1.433085322380066, "learning_rate": 3.4433936072457754e-05, "loss": 0.5537, "num_input_tokens_seen": 34236296, "step": 58970 }, { "epoch": 8.783884420613644, "grad_norm": 1.3598296642303467, "learning_rate": 3.443092680856787e-05, "loss": 0.5086, "num_input_tokens_seen": 34239240, "step": 58975 }, { "epoch": 8.784629133154603, "grad_norm": 1.59968900680542, "learning_rate": 3.442791738535205e-05, "loss": 0.6804, "num_input_tokens_seen": 34242024, "step": 58980 }, { "epoch": 8.785373845695561, "grad_norm": 0.3006972670555115, "learning_rate": 3.4424907802861143e-05, "loss": 0.5875, "num_input_tokens_seen": 34245032, "step": 58985 }, { "epoch": 8.78611855823652, "grad_norm": 1.3925050497055054, "learning_rate": 3.4421898061146005e-05, "loss": 0.5875, "num_input_tokens_seen": 34247752, "step": 58990 }, { "epoch": 8.78686327077748, "grad_norm": 2.5074522495269775, "learning_rate": 3.4418888160257486e-05, "loss": 0.6878, "num_input_tokens_seen": 34250856, "step": 58995 }, { "epoch": 8.78760798331844, "grad_norm": 1.1640046834945679, "learning_rate": 3.441587810024642e-05, "loss": 0.4695, "num_input_tokens_seen": 34253768, "step": 59000 }, { "epoch": 8.788352695859398, "grad_norm": 1.3763222694396973, "learning_rate": 3.441286788116365e-05, "loss": 0.6747, "num_input_tokens_seen": 34256872, "step": 59005 }, { "epoch": 8.789097408400357, "grad_norm": 1.981949806213379, "learning_rate": 3.440985750306006e-05, "loss": 0.591, "num_input_tokens_seen": 34259720, "step": 59010 }, { "epoch": 8.789842120941316, "grad_norm": 1.4687762260437012, "learning_rate": 3.4406846965986476e-05, "loss": 0.8476, "num_input_tokens_seen": 34262408, "step": 59015 }, { "epoch": 8.790586833482276, "grad_norm": 1.2876518964767456, "learning_rate": 3.440383626999378e-05, "loss": 0.6812, "num_input_tokens_seen": 34265416, "step": 59020 }, { "epoch": 8.791331546023235, "grad_norm": 0.9874237775802612, "learning_rate": 3.440082541513283e-05, "loss": 0.5333, "num_input_tokens_seen": 34268776, "step": 59025 }, { "epoch": 8.792076258564194, "grad_norm": 1.3553766012191772, "learning_rate": 3.439781440145449e-05, "loss": 0.6303, "num_input_tokens_seen": 34271752, "step": 59030 }, { "epoch": 8.792820971105153, "grad_norm": 1.156352162361145, "learning_rate": 3.4394803229009634e-05, "loss": 0.7932, "num_input_tokens_seen": 34274568, "step": 59035 }, { "epoch": 8.793565683646113, "grad_norm": 1.3661664724349976, "learning_rate": 3.439179189784911e-05, "loss": 0.7747, "num_input_tokens_seen": 34278088, "step": 59040 }, { "epoch": 8.794310396187072, "grad_norm": 1.417206883430481, "learning_rate": 3.438878040802381e-05, "loss": 0.5895, "num_input_tokens_seen": 34281032, "step": 59045 }, { "epoch": 8.79505510872803, "grad_norm": 2.367241144180298, "learning_rate": 3.438576875958461e-05, "loss": 0.5437, "num_input_tokens_seen": 34283976, "step": 59050 }, { "epoch": 8.79579982126899, "grad_norm": 1.4484474658966064, "learning_rate": 3.438275695258239e-05, "loss": 0.4873, "num_input_tokens_seen": 34286952, "step": 59055 }, { "epoch": 8.79654453380995, "grad_norm": 0.9338384866714478, "learning_rate": 3.4379744987068025e-05, "loss": 0.6104, "num_input_tokens_seen": 34289928, "step": 59060 }, { "epoch": 8.797289246350909, "grad_norm": 1.3670523166656494, "learning_rate": 3.43767328630924e-05, "loss": 0.5674, "num_input_tokens_seen": 34293224, "step": 59065 }, { "epoch": 8.798033958891867, "grad_norm": 1.7613247632980347, "learning_rate": 3.437372058070641e-05, "loss": 0.6295, "num_input_tokens_seen": 34295880, "step": 59070 }, { "epoch": 8.798778671432826, "grad_norm": 1.092508316040039, "learning_rate": 3.4370708139960934e-05, "loss": 0.7091, "num_input_tokens_seen": 34298792, "step": 59075 }, { "epoch": 8.799523383973787, "grad_norm": 1.3193492889404297, "learning_rate": 3.4367695540906864e-05, "loss": 0.7725, "num_input_tokens_seen": 34301576, "step": 59080 }, { "epoch": 8.800268096514746, "grad_norm": 1.3392460346221924, "learning_rate": 3.43646827835951e-05, "loss": 0.8499, "num_input_tokens_seen": 34304744, "step": 59085 }, { "epoch": 8.801012809055704, "grad_norm": 2.47501277923584, "learning_rate": 3.436166986807654e-05, "loss": 0.5932, "num_input_tokens_seen": 34307784, "step": 59090 }, { "epoch": 8.801757521596663, "grad_norm": 1.569815993309021, "learning_rate": 3.435865679440208e-05, "loss": 0.6199, "num_input_tokens_seen": 34310600, "step": 59095 }, { "epoch": 8.802502234137624, "grad_norm": 1.3596056699752808, "learning_rate": 3.435564356262263e-05, "loss": 0.6561, "num_input_tokens_seen": 34313384, "step": 59100 }, { "epoch": 8.803246946678582, "grad_norm": 1.304432988166809, "learning_rate": 3.435263017278909e-05, "loss": 0.5766, "num_input_tokens_seen": 34316392, "step": 59105 }, { "epoch": 8.803991659219541, "grad_norm": 1.843701720237732, "learning_rate": 3.4349616624952365e-05, "loss": 0.6809, "num_input_tokens_seen": 34319112, "step": 59110 }, { "epoch": 8.8047363717605, "grad_norm": 0.8307377696037292, "learning_rate": 3.434660291916337e-05, "loss": 0.6943, "num_input_tokens_seen": 34322088, "step": 59115 }, { "epoch": 8.80548108430146, "grad_norm": 0.9877431392669678, "learning_rate": 3.4343589055473025e-05, "loss": 0.6404, "num_input_tokens_seen": 34324712, "step": 59120 }, { "epoch": 8.80622579684242, "grad_norm": 1.068871259689331, "learning_rate": 3.4340575033932234e-05, "loss": 0.6263, "num_input_tokens_seen": 34327656, "step": 59125 }, { "epoch": 8.806970509383378, "grad_norm": 2.1956369876861572, "learning_rate": 3.433756085459192e-05, "loss": 0.4838, "num_input_tokens_seen": 34330472, "step": 59130 }, { "epoch": 8.807715221924337, "grad_norm": 2.454336643218994, "learning_rate": 3.4334546517503006e-05, "loss": 0.5616, "num_input_tokens_seen": 34333192, "step": 59135 }, { "epoch": 8.808459934465297, "grad_norm": 0.5850260257720947, "learning_rate": 3.4331532022716416e-05, "loss": 0.429, "num_input_tokens_seen": 34336200, "step": 59140 }, { "epoch": 8.809204647006256, "grad_norm": 1.3962528705596924, "learning_rate": 3.432851737028308e-05, "loss": 0.5437, "num_input_tokens_seen": 34339112, "step": 59145 }, { "epoch": 8.809949359547215, "grad_norm": 1.2351816892623901, "learning_rate": 3.432550256025391e-05, "loss": 0.7199, "num_input_tokens_seen": 34341928, "step": 59150 }, { "epoch": 8.810694072088173, "grad_norm": 1.2054522037506104, "learning_rate": 3.4322487592679876e-05, "loss": 0.534, "num_input_tokens_seen": 34344776, "step": 59155 }, { "epoch": 8.811438784629132, "grad_norm": 1.879062294960022, "learning_rate": 3.4319472467611876e-05, "loss": 0.4923, "num_input_tokens_seen": 34347816, "step": 59160 }, { "epoch": 8.812183497170093, "grad_norm": 2.313223123550415, "learning_rate": 3.431645718510086e-05, "loss": 0.7622, "num_input_tokens_seen": 34350568, "step": 59165 }, { "epoch": 8.812928209711052, "grad_norm": 1.316519856452942, "learning_rate": 3.431344174519777e-05, "loss": 0.5166, "num_input_tokens_seen": 34353384, "step": 59170 }, { "epoch": 8.81367292225201, "grad_norm": 1.0859096050262451, "learning_rate": 3.431042614795354e-05, "loss": 0.7618, "num_input_tokens_seen": 34356200, "step": 59175 }, { "epoch": 8.81441763479297, "grad_norm": 3.0036611557006836, "learning_rate": 3.430741039341914e-05, "loss": 0.6927, "num_input_tokens_seen": 34359240, "step": 59180 }, { "epoch": 8.81516234733393, "grad_norm": 2.3098578453063965, "learning_rate": 3.4304394481645485e-05, "loss": 0.5621, "num_input_tokens_seen": 34361992, "step": 59185 }, { "epoch": 8.815907059874888, "grad_norm": 1.6255923509597778, "learning_rate": 3.430137841268355e-05, "loss": 0.8539, "num_input_tokens_seen": 34364776, "step": 59190 }, { "epoch": 8.816651772415847, "grad_norm": 1.7396801710128784, "learning_rate": 3.4298362186584275e-05, "loss": 0.7278, "num_input_tokens_seen": 34367880, "step": 59195 }, { "epoch": 8.817396484956806, "grad_norm": 1.8648899793624878, "learning_rate": 3.4295345803398634e-05, "loss": 0.6847, "num_input_tokens_seen": 34371048, "step": 59200 }, { "epoch": 8.818141197497766, "grad_norm": 1.131958246231079, "learning_rate": 3.429232926317756e-05, "loss": 0.669, "num_input_tokens_seen": 34373736, "step": 59205 }, { "epoch": 8.818885910038725, "grad_norm": 0.8526650667190552, "learning_rate": 3.428931256597203e-05, "loss": 0.6522, "num_input_tokens_seen": 34376648, "step": 59210 }, { "epoch": 8.819630622579684, "grad_norm": 1.9315348863601685, "learning_rate": 3.428629571183301e-05, "loss": 0.5994, "num_input_tokens_seen": 34379368, "step": 59215 }, { "epoch": 8.820375335120643, "grad_norm": 0.9450962543487549, "learning_rate": 3.428327870081145e-05, "loss": 0.4624, "num_input_tokens_seen": 34382152, "step": 59220 }, { "epoch": 8.821120047661603, "grad_norm": 1.2817507982254028, "learning_rate": 3.428026153295834e-05, "loss": 0.6103, "num_input_tokens_seen": 34385032, "step": 59225 }, { "epoch": 8.821864760202562, "grad_norm": 1.5391066074371338, "learning_rate": 3.427724420832464e-05, "loss": 0.5327, "num_input_tokens_seen": 34388072, "step": 59230 }, { "epoch": 8.82260947274352, "grad_norm": 1.6258131265640259, "learning_rate": 3.427422672696135e-05, "loss": 0.6181, "num_input_tokens_seen": 34390856, "step": 59235 }, { "epoch": 8.82335418528448, "grad_norm": 1.3047150373458862, "learning_rate": 3.42712090889194e-05, "loss": 0.5835, "num_input_tokens_seen": 34393672, "step": 59240 }, { "epoch": 8.82409889782544, "grad_norm": 2.5677926540374756, "learning_rate": 3.426819129424979e-05, "loss": 0.5626, "num_input_tokens_seen": 34396616, "step": 59245 }, { "epoch": 8.824843610366399, "grad_norm": 0.848682165145874, "learning_rate": 3.426517334300352e-05, "loss": 0.6677, "num_input_tokens_seen": 34399336, "step": 59250 }, { "epoch": 8.825588322907358, "grad_norm": 1.2953016757965088, "learning_rate": 3.426215523523157e-05, "loss": 0.5369, "num_input_tokens_seen": 34402184, "step": 59255 }, { "epoch": 8.826333035448316, "grad_norm": 2.0377511978149414, "learning_rate": 3.425913697098491e-05, "loss": 0.5867, "num_input_tokens_seen": 34405064, "step": 59260 }, { "epoch": 8.827077747989277, "grad_norm": 1.4463826417922974, "learning_rate": 3.4256118550314556e-05, "loss": 0.6652, "num_input_tokens_seen": 34407656, "step": 59265 }, { "epoch": 8.827822460530236, "grad_norm": 1.4941200017929077, "learning_rate": 3.425309997327147e-05, "loss": 0.6803, "num_input_tokens_seen": 34410376, "step": 59270 }, { "epoch": 8.828567173071194, "grad_norm": 1.9896782636642456, "learning_rate": 3.4250081239906674e-05, "loss": 0.7664, "num_input_tokens_seen": 34413352, "step": 59275 }, { "epoch": 8.829311885612153, "grad_norm": 0.9727115631103516, "learning_rate": 3.424706235027115e-05, "loss": 0.6359, "num_input_tokens_seen": 34416424, "step": 59280 }, { "epoch": 8.830056598153114, "grad_norm": 1.530707597732544, "learning_rate": 3.4244043304415907e-05, "loss": 0.6387, "num_input_tokens_seen": 34419624, "step": 59285 }, { "epoch": 8.830801310694072, "grad_norm": 2.0044145584106445, "learning_rate": 3.424102410239195e-05, "loss": 0.5629, "num_input_tokens_seen": 34422280, "step": 59290 }, { "epoch": 8.831546023235031, "grad_norm": 1.1036884784698486, "learning_rate": 3.423800474425029e-05, "loss": 0.735, "num_input_tokens_seen": 34425032, "step": 59295 }, { "epoch": 8.83229073577599, "grad_norm": 0.8773248791694641, "learning_rate": 3.4234985230041916e-05, "loss": 0.549, "num_input_tokens_seen": 34427848, "step": 59300 }, { "epoch": 8.83303544831695, "grad_norm": 1.1669996976852417, "learning_rate": 3.4231965559817856e-05, "loss": 0.6472, "num_input_tokens_seen": 34430600, "step": 59305 }, { "epoch": 8.83378016085791, "grad_norm": 1.4617526531219482, "learning_rate": 3.4228945733629124e-05, "loss": 0.4748, "num_input_tokens_seen": 34433384, "step": 59310 }, { "epoch": 8.834524873398868, "grad_norm": 1.1127701997756958, "learning_rate": 3.422592575152673e-05, "loss": 0.7407, "num_input_tokens_seen": 34436264, "step": 59315 }, { "epoch": 8.835269585939827, "grad_norm": 3.6965696811676025, "learning_rate": 3.4222905613561706e-05, "loss": 0.6347, "num_input_tokens_seen": 34439112, "step": 59320 }, { "epoch": 8.836014298480787, "grad_norm": 0.674543559551239, "learning_rate": 3.421988531978506e-05, "loss": 0.5869, "num_input_tokens_seen": 34442056, "step": 59325 }, { "epoch": 8.836759011021746, "grad_norm": 1.0268527269363403, "learning_rate": 3.421686487024782e-05, "loss": 0.5618, "num_input_tokens_seen": 34444808, "step": 59330 }, { "epoch": 8.837503723562705, "grad_norm": 1.2323240041732788, "learning_rate": 3.4213844265001015e-05, "loss": 0.4925, "num_input_tokens_seen": 34448040, "step": 59335 }, { "epoch": 8.838248436103664, "grad_norm": 1.0713964700698853, "learning_rate": 3.421082350409568e-05, "loss": 0.5152, "num_input_tokens_seen": 34451016, "step": 59340 }, { "epoch": 8.838993148644622, "grad_norm": 3.0063211917877197, "learning_rate": 3.420780258758284e-05, "loss": 0.6478, "num_input_tokens_seen": 34454088, "step": 59345 }, { "epoch": 8.839737861185583, "grad_norm": 0.9839751720428467, "learning_rate": 3.420478151551353e-05, "loss": 0.7124, "num_input_tokens_seen": 34456936, "step": 59350 }, { "epoch": 8.840482573726542, "grad_norm": 1.3963453769683838, "learning_rate": 3.42017602879388e-05, "loss": 0.4274, "num_input_tokens_seen": 34460168, "step": 59355 }, { "epoch": 8.8412272862675, "grad_norm": 1.2136541604995728, "learning_rate": 3.419873890490968e-05, "loss": 0.5916, "num_input_tokens_seen": 34463112, "step": 59360 }, { "epoch": 8.84197199880846, "grad_norm": 1.4466630220413208, "learning_rate": 3.4195717366477216e-05, "loss": 0.8214, "num_input_tokens_seen": 34465896, "step": 59365 }, { "epoch": 8.84271671134942, "grad_norm": 1.725434422492981, "learning_rate": 3.419269567269245e-05, "loss": 0.7305, "num_input_tokens_seen": 34468776, "step": 59370 }, { "epoch": 8.843461423890378, "grad_norm": 2.6167891025543213, "learning_rate": 3.418967382360643e-05, "loss": 0.7031, "num_input_tokens_seen": 34471624, "step": 59375 }, { "epoch": 8.844206136431337, "grad_norm": 1.371252179145813, "learning_rate": 3.4186651819270224e-05, "loss": 0.7914, "num_input_tokens_seen": 34474536, "step": 59380 }, { "epoch": 8.844950848972296, "grad_norm": 1.348551869392395, "learning_rate": 3.4183629659734855e-05, "loss": 0.4675, "num_input_tokens_seen": 34477512, "step": 59385 }, { "epoch": 8.845695561513256, "grad_norm": 0.8502729535102844, "learning_rate": 3.418060734505141e-05, "loss": 0.716, "num_input_tokens_seen": 34480168, "step": 59390 }, { "epoch": 8.846440274054215, "grad_norm": 2.084158420562744, "learning_rate": 3.417758487527093e-05, "loss": 0.7755, "num_input_tokens_seen": 34483144, "step": 59395 }, { "epoch": 8.847184986595174, "grad_norm": 0.9894115328788757, "learning_rate": 3.417456225044449e-05, "loss": 0.4737, "num_input_tokens_seen": 34485832, "step": 59400 }, { "epoch": 8.847929699136133, "grad_norm": 1.3472167253494263, "learning_rate": 3.417153947062313e-05, "loss": 0.6654, "num_input_tokens_seen": 34488872, "step": 59405 }, { "epoch": 8.848674411677093, "grad_norm": 1.8257330656051636, "learning_rate": 3.416851653585794e-05, "loss": 0.8337, "num_input_tokens_seen": 34491720, "step": 59410 }, { "epoch": 8.849419124218052, "grad_norm": 0.9955701231956482, "learning_rate": 3.416549344619998e-05, "loss": 0.5755, "num_input_tokens_seen": 34494632, "step": 59415 }, { "epoch": 8.85016383675901, "grad_norm": 0.91278475522995, "learning_rate": 3.416247020170032e-05, "loss": 0.7715, "num_input_tokens_seen": 34497672, "step": 59420 }, { "epoch": 8.85090854929997, "grad_norm": 2.2195615768432617, "learning_rate": 3.415944680241004e-05, "loss": 0.7524, "num_input_tokens_seen": 34500424, "step": 59425 }, { "epoch": 8.85165326184093, "grad_norm": 1.2376434803009033, "learning_rate": 3.415642324838023e-05, "loss": 0.6877, "num_input_tokens_seen": 34503208, "step": 59430 }, { "epoch": 8.852397974381889, "grad_norm": 1.9377191066741943, "learning_rate": 3.415339953966194e-05, "loss": 0.5777, "num_input_tokens_seen": 34506024, "step": 59435 }, { "epoch": 8.853142686922848, "grad_norm": 1.0118359327316284, "learning_rate": 3.4150375676306276e-05, "loss": 0.5001, "num_input_tokens_seen": 34508968, "step": 59440 }, { "epoch": 8.853887399463806, "grad_norm": 1.5026191473007202, "learning_rate": 3.4147351658364304e-05, "loss": 0.6067, "num_input_tokens_seen": 34511656, "step": 59445 }, { "epoch": 8.854632112004767, "grad_norm": 1.4304695129394531, "learning_rate": 3.4144327485887126e-05, "loss": 0.6835, "num_input_tokens_seen": 34514696, "step": 59450 }, { "epoch": 8.855376824545726, "grad_norm": 0.8823602199554443, "learning_rate": 3.414130315892583e-05, "loss": 0.6896, "num_input_tokens_seen": 34517480, "step": 59455 }, { "epoch": 8.856121537086684, "grad_norm": 0.7936989665031433, "learning_rate": 3.4138278677531515e-05, "loss": 0.4557, "num_input_tokens_seen": 34520488, "step": 59460 }, { "epoch": 8.856866249627643, "grad_norm": 1.104284644126892, "learning_rate": 3.413525404175527e-05, "loss": 0.6982, "num_input_tokens_seen": 34523208, "step": 59465 }, { "epoch": 8.857610962168604, "grad_norm": 1.3505122661590576, "learning_rate": 3.413222925164818e-05, "loss": 0.6045, "num_input_tokens_seen": 34526248, "step": 59470 }, { "epoch": 8.858355674709562, "grad_norm": 1.9301801919937134, "learning_rate": 3.412920430726137e-05, "loss": 0.5661, "num_input_tokens_seen": 34529000, "step": 59475 }, { "epoch": 8.859100387250521, "grad_norm": 1.5104964971542358, "learning_rate": 3.412617920864593e-05, "loss": 0.6385, "num_input_tokens_seen": 34531976, "step": 59480 }, { "epoch": 8.85984509979148, "grad_norm": 1.181889295578003, "learning_rate": 3.412315395585296e-05, "loss": 0.5391, "num_input_tokens_seen": 34534728, "step": 59485 }, { "epoch": 8.86058981233244, "grad_norm": 1.3386552333831787, "learning_rate": 3.4120128548933575e-05, "loss": 0.6152, "num_input_tokens_seen": 34537800, "step": 59490 }, { "epoch": 8.8613345248734, "grad_norm": 1.6896933317184448, "learning_rate": 3.4117102987938895e-05, "loss": 0.7283, "num_input_tokens_seen": 34540392, "step": 59495 }, { "epoch": 8.862079237414358, "grad_norm": 1.1973463296890259, "learning_rate": 3.411407727292003e-05, "loss": 0.574, "num_input_tokens_seen": 34543240, "step": 59500 }, { "epoch": 8.862823949955317, "grad_norm": 1.148511290550232, "learning_rate": 3.411105140392808e-05, "loss": 0.523, "num_input_tokens_seen": 34545960, "step": 59505 }, { "epoch": 8.863568662496277, "grad_norm": 1.0522648096084595, "learning_rate": 3.4108025381014184e-05, "loss": 0.4251, "num_input_tokens_seen": 34548616, "step": 59510 }, { "epoch": 8.864313375037236, "grad_norm": 1.301453948020935, "learning_rate": 3.4104999204229466e-05, "loss": 0.5472, "num_input_tokens_seen": 34551432, "step": 59515 }, { "epoch": 8.865058087578195, "grad_norm": 1.0698901414871216, "learning_rate": 3.410197287362503e-05, "loss": 0.6984, "num_input_tokens_seen": 34554216, "step": 59520 }, { "epoch": 8.865802800119154, "grad_norm": 1.3594865798950195, "learning_rate": 3.409894638925201e-05, "loss": 0.5727, "num_input_tokens_seen": 34557576, "step": 59525 }, { "epoch": 8.866547512660112, "grad_norm": 1.2248764038085938, "learning_rate": 3.409591975116155e-05, "loss": 0.5779, "num_input_tokens_seen": 34560328, "step": 59530 }, { "epoch": 8.867292225201073, "grad_norm": 2.1656219959259033, "learning_rate": 3.409289295940476e-05, "loss": 0.6076, "num_input_tokens_seen": 34563176, "step": 59535 }, { "epoch": 8.868036937742032, "grad_norm": 0.8972069621086121, "learning_rate": 3.408986601403278e-05, "loss": 0.5712, "num_input_tokens_seen": 34566184, "step": 59540 }, { "epoch": 8.86878165028299, "grad_norm": 1.634325623512268, "learning_rate": 3.4086838915096765e-05, "loss": 0.7339, "num_input_tokens_seen": 34569000, "step": 59545 }, { "epoch": 8.86952636282395, "grad_norm": 1.0363411903381348, "learning_rate": 3.408381166264784e-05, "loss": 0.5535, "num_input_tokens_seen": 34571720, "step": 59550 }, { "epoch": 8.87027107536491, "grad_norm": 1.3346052169799805, "learning_rate": 3.408078425673714e-05, "loss": 0.6764, "num_input_tokens_seen": 34574376, "step": 59555 }, { "epoch": 8.871015787905868, "grad_norm": 1.6902778148651123, "learning_rate": 3.407775669741583e-05, "loss": 0.6976, "num_input_tokens_seen": 34577032, "step": 59560 }, { "epoch": 8.871760500446827, "grad_norm": 0.8608501553535461, "learning_rate": 3.4074728984735043e-05, "loss": 0.705, "num_input_tokens_seen": 34579944, "step": 59565 }, { "epoch": 8.872505212987786, "grad_norm": 1.8794623613357544, "learning_rate": 3.407170111874593e-05, "loss": 0.537, "num_input_tokens_seen": 34582504, "step": 59570 }, { "epoch": 8.873249925528746, "grad_norm": 1.6682699918746948, "learning_rate": 3.4068673099499646e-05, "loss": 0.7425, "num_input_tokens_seen": 34585288, "step": 59575 }, { "epoch": 8.873994638069705, "grad_norm": 1.7503926753997803, "learning_rate": 3.4065644927047354e-05, "loss": 0.7396, "num_input_tokens_seen": 34588168, "step": 59580 }, { "epoch": 8.874739350610664, "grad_norm": 1.2580218315124512, "learning_rate": 3.40626166014402e-05, "loss": 0.6486, "num_input_tokens_seen": 34590792, "step": 59585 }, { "epoch": 8.875484063151623, "grad_norm": 1.079202651977539, "learning_rate": 3.4059588122729344e-05, "loss": 0.5483, "num_input_tokens_seen": 34593576, "step": 59590 }, { "epoch": 8.876228775692583, "grad_norm": 1.580507516860962, "learning_rate": 3.405655949096597e-05, "loss": 0.5866, "num_input_tokens_seen": 34596616, "step": 59595 }, { "epoch": 8.876973488233542, "grad_norm": 1.1255749464035034, "learning_rate": 3.405353070620122e-05, "loss": 0.7689, "num_input_tokens_seen": 34599656, "step": 59600 }, { "epoch": 8.8777182007745, "grad_norm": 1.2635204792022705, "learning_rate": 3.4050501768486266e-05, "loss": 0.625, "num_input_tokens_seen": 34602408, "step": 59605 }, { "epoch": 8.87846291331546, "grad_norm": 1.4468611478805542, "learning_rate": 3.404747267787228e-05, "loss": 0.6035, "num_input_tokens_seen": 34605448, "step": 59610 }, { "epoch": 8.87920762585642, "grad_norm": 1.3150800466537476, "learning_rate": 3.404444343441045e-05, "loss": 0.5221, "num_input_tokens_seen": 34608520, "step": 59615 }, { "epoch": 8.879952338397379, "grad_norm": 1.2673534154891968, "learning_rate": 3.404141403815193e-05, "loss": 0.5774, "num_input_tokens_seen": 34611752, "step": 59620 }, { "epoch": 8.880697050938338, "grad_norm": 1.4219635725021362, "learning_rate": 3.4038384489147926e-05, "loss": 0.4926, "num_input_tokens_seen": 34614568, "step": 59625 }, { "epoch": 8.881441763479296, "grad_norm": 0.9942583441734314, "learning_rate": 3.4035354787449584e-05, "loss": 0.7578, "num_input_tokens_seen": 34617256, "step": 59630 }, { "epoch": 8.882186476020257, "grad_norm": 0.8767685294151306, "learning_rate": 3.403232493310811e-05, "loss": 0.4532, "num_input_tokens_seen": 34620168, "step": 59635 }, { "epoch": 8.882931188561216, "grad_norm": 1.7209768295288086, "learning_rate": 3.402929492617469e-05, "loss": 0.7228, "num_input_tokens_seen": 34623048, "step": 59640 }, { "epoch": 8.883675901102174, "grad_norm": 2.1709868907928467, "learning_rate": 3.402626476670051e-05, "loss": 0.5909, "num_input_tokens_seen": 34625896, "step": 59645 }, { "epoch": 8.884420613643133, "grad_norm": 2.003164529800415, "learning_rate": 3.4023234454736756e-05, "loss": 0.686, "num_input_tokens_seen": 34628744, "step": 59650 }, { "epoch": 8.885165326184094, "grad_norm": 1.1088347434997559, "learning_rate": 3.402020399033463e-05, "loss": 0.6631, "num_input_tokens_seen": 34631464, "step": 59655 }, { "epoch": 8.885910038725052, "grad_norm": 2.9439985752105713, "learning_rate": 3.401717337354533e-05, "loss": 0.7645, "num_input_tokens_seen": 34634376, "step": 59660 }, { "epoch": 8.886654751266011, "grad_norm": 1.160654067993164, "learning_rate": 3.401414260442004e-05, "loss": 0.706, "num_input_tokens_seen": 34637192, "step": 59665 }, { "epoch": 8.88739946380697, "grad_norm": 0.9763854742050171, "learning_rate": 3.401111168300998e-05, "loss": 0.6472, "num_input_tokens_seen": 34639848, "step": 59670 }, { "epoch": 8.88814417634793, "grad_norm": 1.1791067123413086, "learning_rate": 3.400808060936635e-05, "loss": 0.6226, "num_input_tokens_seen": 34642472, "step": 59675 }, { "epoch": 8.88888888888889, "grad_norm": 1.4565013647079468, "learning_rate": 3.4005049383540345e-05, "loss": 0.7091, "num_input_tokens_seen": 34645288, "step": 59680 }, { "epoch": 8.889633601429848, "grad_norm": 1.7471387386322021, "learning_rate": 3.400201800558318e-05, "loss": 0.6724, "num_input_tokens_seen": 34648136, "step": 59685 }, { "epoch": 8.890378313970807, "grad_norm": 1.9751455783843994, "learning_rate": 3.399898647554608e-05, "loss": 0.5695, "num_input_tokens_seen": 34650856, "step": 59690 }, { "epoch": 8.891123026511767, "grad_norm": 1.38359534740448, "learning_rate": 3.399595479348024e-05, "loss": 0.6846, "num_input_tokens_seen": 34653800, "step": 59695 }, { "epoch": 8.891867739052726, "grad_norm": 1.1235663890838623, "learning_rate": 3.3992922959436894e-05, "loss": 0.8573, "num_input_tokens_seen": 34656712, "step": 59700 }, { "epoch": 8.892612451593685, "grad_norm": 1.0123370885849, "learning_rate": 3.3989890973467255e-05, "loss": 0.563, "num_input_tokens_seen": 34659528, "step": 59705 }, { "epoch": 8.893357164134644, "grad_norm": 1.30585515499115, "learning_rate": 3.3986858835622536e-05, "loss": 0.7159, "num_input_tokens_seen": 34662216, "step": 59710 }, { "epoch": 8.894101876675602, "grad_norm": 1.1650925874710083, "learning_rate": 3.398382654595398e-05, "loss": 0.6533, "num_input_tokens_seen": 34665192, "step": 59715 }, { "epoch": 8.894846589216563, "grad_norm": 1.1215708255767822, "learning_rate": 3.39807941045128e-05, "loss": 0.6103, "num_input_tokens_seen": 34667784, "step": 59720 }, { "epoch": 8.895591301757522, "grad_norm": 1.140829086303711, "learning_rate": 3.397776151135024e-05, "loss": 0.541, "num_input_tokens_seen": 34670824, "step": 59725 }, { "epoch": 8.89633601429848, "grad_norm": 1.192490577697754, "learning_rate": 3.397472876651752e-05, "loss": 0.5479, "num_input_tokens_seen": 34673608, "step": 59730 }, { "epoch": 8.89708072683944, "grad_norm": 0.9584538340568542, "learning_rate": 3.397169587006588e-05, "loss": 0.7739, "num_input_tokens_seen": 34676584, "step": 59735 }, { "epoch": 8.8978254393804, "grad_norm": 1.1380054950714111, "learning_rate": 3.396866282204655e-05, "loss": 0.6254, "num_input_tokens_seen": 34679368, "step": 59740 }, { "epoch": 8.898570151921358, "grad_norm": 1.6760412454605103, "learning_rate": 3.3965629622510776e-05, "loss": 0.7355, "num_input_tokens_seen": 34682248, "step": 59745 }, { "epoch": 8.899314864462317, "grad_norm": 1.873949646949768, "learning_rate": 3.3962596271509806e-05, "loss": 0.5219, "num_input_tokens_seen": 34685032, "step": 59750 }, { "epoch": 8.900059577003276, "grad_norm": 1.577309012413025, "learning_rate": 3.395956276909488e-05, "loss": 0.7108, "num_input_tokens_seen": 34687688, "step": 59755 }, { "epoch": 8.900804289544237, "grad_norm": 1.4716796875, "learning_rate": 3.395652911531725e-05, "loss": 0.5337, "num_input_tokens_seen": 34690728, "step": 59760 }, { "epoch": 8.901549002085195, "grad_norm": 1.3198922872543335, "learning_rate": 3.395349531022817e-05, "loss": 0.5255, "num_input_tokens_seen": 34693864, "step": 59765 }, { "epoch": 8.902293714626154, "grad_norm": 1.6198651790618896, "learning_rate": 3.395046135387888e-05, "loss": 0.5427, "num_input_tokens_seen": 34696904, "step": 59770 }, { "epoch": 8.903038427167113, "grad_norm": 1.8039190769195557, "learning_rate": 3.394742724632064e-05, "loss": 0.5535, "num_input_tokens_seen": 34699944, "step": 59775 }, { "epoch": 8.903783139708073, "grad_norm": 1.325305461883545, "learning_rate": 3.3944392987604703e-05, "loss": 0.9737, "num_input_tokens_seen": 34703048, "step": 59780 }, { "epoch": 8.904527852249032, "grad_norm": 1.2646417617797852, "learning_rate": 3.394135857778235e-05, "loss": 0.7818, "num_input_tokens_seen": 34705928, "step": 59785 }, { "epoch": 8.90527256478999, "grad_norm": 1.0036253929138184, "learning_rate": 3.3938324016904825e-05, "loss": 0.5268, "num_input_tokens_seen": 34708968, "step": 59790 }, { "epoch": 8.90601727733095, "grad_norm": 1.6390883922576904, "learning_rate": 3.3935289305023405e-05, "loss": 0.7167, "num_input_tokens_seen": 34711816, "step": 59795 }, { "epoch": 8.90676198987191, "grad_norm": 2.4074671268463135, "learning_rate": 3.393225444218936e-05, "loss": 0.6464, "num_input_tokens_seen": 34714440, "step": 59800 }, { "epoch": 8.907506702412869, "grad_norm": 0.9358476996421814, "learning_rate": 3.392921942845394e-05, "loss": 0.5564, "num_input_tokens_seen": 34717320, "step": 59805 }, { "epoch": 8.908251414953828, "grad_norm": 1.4797242879867554, "learning_rate": 3.392618426386843e-05, "loss": 0.7516, "num_input_tokens_seen": 34720328, "step": 59810 }, { "epoch": 8.908996127494786, "grad_norm": 1.6611114740371704, "learning_rate": 3.3923148948484115e-05, "loss": 0.6766, "num_input_tokens_seen": 34723560, "step": 59815 }, { "epoch": 8.909740840035747, "grad_norm": 2.369997978210449, "learning_rate": 3.392011348235228e-05, "loss": 0.5782, "num_input_tokens_seen": 34726568, "step": 59820 }, { "epoch": 8.910485552576706, "grad_norm": 0.6969296336174011, "learning_rate": 3.391707786552418e-05, "loss": 0.6436, "num_input_tokens_seen": 34729320, "step": 59825 }, { "epoch": 8.911230265117664, "grad_norm": 0.9174374341964722, "learning_rate": 3.391404209805112e-05, "loss": 0.5718, "num_input_tokens_seen": 34732104, "step": 59830 }, { "epoch": 8.911974977658623, "grad_norm": 1.0925058126449585, "learning_rate": 3.3911006179984375e-05, "loss": 0.5348, "num_input_tokens_seen": 34734856, "step": 59835 }, { "epoch": 8.912719690199584, "grad_norm": 1.3048067092895508, "learning_rate": 3.3907970111375237e-05, "loss": 0.6383, "num_input_tokens_seen": 34737576, "step": 59840 }, { "epoch": 8.913464402740543, "grad_norm": 0.8246944546699524, "learning_rate": 3.3904933892275e-05, "loss": 0.6652, "num_input_tokens_seen": 34740584, "step": 59845 }, { "epoch": 8.914209115281501, "grad_norm": 0.9098600745201111, "learning_rate": 3.390189752273495e-05, "loss": 0.5317, "num_input_tokens_seen": 34743656, "step": 59850 }, { "epoch": 8.91495382782246, "grad_norm": 1.0978615283966064, "learning_rate": 3.389886100280639e-05, "loss": 0.5671, "num_input_tokens_seen": 34746568, "step": 59855 }, { "epoch": 8.915698540363419, "grad_norm": 1.0264543294906616, "learning_rate": 3.389582433254062e-05, "loss": 0.686, "num_input_tokens_seen": 34749736, "step": 59860 }, { "epoch": 8.91644325290438, "grad_norm": 1.4690710306167603, "learning_rate": 3.3892787511988936e-05, "loss": 0.5206, "num_input_tokens_seen": 34752648, "step": 59865 }, { "epoch": 8.917187965445338, "grad_norm": 1.2569470405578613, "learning_rate": 3.3889750541202654e-05, "loss": 0.8372, "num_input_tokens_seen": 34755880, "step": 59870 }, { "epoch": 8.917932677986297, "grad_norm": 1.6617060899734497, "learning_rate": 3.388671342023306e-05, "loss": 0.7082, "num_input_tokens_seen": 34758696, "step": 59875 }, { "epoch": 8.918677390527257, "grad_norm": 0.9416143894195557, "learning_rate": 3.388367614913149e-05, "loss": 0.5751, "num_input_tokens_seen": 34761480, "step": 59880 }, { "epoch": 8.919422103068216, "grad_norm": 2.3420751094818115, "learning_rate": 3.388063872794923e-05, "loss": 0.5911, "num_input_tokens_seen": 34764744, "step": 59885 }, { "epoch": 8.920166815609175, "grad_norm": 1.2231063842773438, "learning_rate": 3.3877601156737604e-05, "loss": 0.5024, "num_input_tokens_seen": 34767496, "step": 59890 }, { "epoch": 8.920911528150134, "grad_norm": 2.2382593154907227, "learning_rate": 3.3874563435547934e-05, "loss": 0.7306, "num_input_tokens_seen": 34771560, "step": 59895 }, { "epoch": 8.921656240691092, "grad_norm": 1.2611374855041504, "learning_rate": 3.387152556443153e-05, "loss": 0.6754, "num_input_tokens_seen": 34774504, "step": 59900 }, { "epoch": 8.922400953232053, "grad_norm": 1.9613176584243774, "learning_rate": 3.386848754343972e-05, "loss": 0.5446, "num_input_tokens_seen": 34777352, "step": 59905 }, { "epoch": 8.923145665773012, "grad_norm": 1.1323422193527222, "learning_rate": 3.386544937262382e-05, "loss": 0.5348, "num_input_tokens_seen": 34780008, "step": 59910 }, { "epoch": 8.92389037831397, "grad_norm": 1.4846526384353638, "learning_rate": 3.386241105203517e-05, "loss": 0.6031, "num_input_tokens_seen": 34783144, "step": 59915 }, { "epoch": 8.92463509085493, "grad_norm": 1.1961724758148193, "learning_rate": 3.38593725817251e-05, "loss": 0.6467, "num_input_tokens_seen": 34785960, "step": 59920 }, { "epoch": 8.92537980339589, "grad_norm": 1.2757577896118164, "learning_rate": 3.385633396174492e-05, "loss": 0.6412, "num_input_tokens_seen": 34788680, "step": 59925 }, { "epoch": 8.926124515936849, "grad_norm": 1.1621087789535522, "learning_rate": 3.385329519214599e-05, "loss": 0.5169, "num_input_tokens_seen": 34791400, "step": 59930 }, { "epoch": 8.926869228477807, "grad_norm": 1.2832940816879272, "learning_rate": 3.385025627297963e-05, "loss": 0.6334, "num_input_tokens_seen": 34794440, "step": 59935 }, { "epoch": 8.927613941018766, "grad_norm": 1.8293360471725464, "learning_rate": 3.384721720429718e-05, "loss": 0.7073, "num_input_tokens_seen": 34797768, "step": 59940 }, { "epoch": 8.928358653559727, "grad_norm": 1.3395602703094482, "learning_rate": 3.384417798614999e-05, "loss": 0.836, "num_input_tokens_seen": 34800456, "step": 59945 }, { "epoch": 8.929103366100685, "grad_norm": 2.1074740886688232, "learning_rate": 3.3841138618589416e-05, "loss": 0.6637, "num_input_tokens_seen": 34803464, "step": 59950 }, { "epoch": 8.929848078641644, "grad_norm": 0.9633611440658569, "learning_rate": 3.383809910166678e-05, "loss": 0.6508, "num_input_tokens_seen": 34806472, "step": 59955 }, { "epoch": 8.930592791182603, "grad_norm": 1.1137219667434692, "learning_rate": 3.383505943543344e-05, "loss": 0.7709, "num_input_tokens_seen": 34809480, "step": 59960 }, { "epoch": 8.931337503723563, "grad_norm": 1.6703611612319946, "learning_rate": 3.383201961994076e-05, "loss": 0.6279, "num_input_tokens_seen": 34812328, "step": 59965 }, { "epoch": 8.932082216264522, "grad_norm": 1.2808358669281006, "learning_rate": 3.382897965524007e-05, "loss": 0.5071, "num_input_tokens_seen": 34814952, "step": 59970 }, { "epoch": 8.932826928805481, "grad_norm": 1.1710351705551147, "learning_rate": 3.382593954138276e-05, "loss": 0.5984, "num_input_tokens_seen": 34817512, "step": 59975 }, { "epoch": 8.93357164134644, "grad_norm": 1.8714959621429443, "learning_rate": 3.382289927842015e-05, "loss": 0.6991, "num_input_tokens_seen": 34820424, "step": 59980 }, { "epoch": 8.9343163538874, "grad_norm": 1.4271901845932007, "learning_rate": 3.381985886640364e-05, "loss": 0.7543, "num_input_tokens_seen": 34823112, "step": 59985 }, { "epoch": 8.935061066428359, "grad_norm": 1.8042269945144653, "learning_rate": 3.381681830538458e-05, "loss": 0.7077, "num_input_tokens_seen": 34825800, "step": 59990 }, { "epoch": 8.935805778969318, "grad_norm": 0.9946048855781555, "learning_rate": 3.3813777595414324e-05, "loss": 0.7075, "num_input_tokens_seen": 34828648, "step": 59995 }, { "epoch": 8.936550491510276, "grad_norm": 1.4228250980377197, "learning_rate": 3.3810736736544265e-05, "loss": 0.5891, "num_input_tokens_seen": 34831496, "step": 60000 }, { "epoch": 8.937295204051237, "grad_norm": 1.472459077835083, "learning_rate": 3.380769572882576e-05, "loss": 0.5188, "num_input_tokens_seen": 34834248, "step": 60005 }, { "epoch": 8.938039916592196, "grad_norm": 0.40210023522377014, "learning_rate": 3.380465457231018e-05, "loss": 0.4788, "num_input_tokens_seen": 34837064, "step": 60010 }, { "epoch": 8.938784629133155, "grad_norm": 1.1729869842529297, "learning_rate": 3.3801613267048916e-05, "loss": 0.639, "num_input_tokens_seen": 34839944, "step": 60015 }, { "epoch": 8.939529341674113, "grad_norm": 1.009813904762268, "learning_rate": 3.379857181309334e-05, "loss": 0.4825, "num_input_tokens_seen": 34843176, "step": 60020 }, { "epoch": 8.940274054215074, "grad_norm": 1.0863397121429443, "learning_rate": 3.379553021049484e-05, "loss": 0.5595, "num_input_tokens_seen": 34845960, "step": 60025 }, { "epoch": 8.941018766756033, "grad_norm": 1.2208654880523682, "learning_rate": 3.379248845930479e-05, "loss": 0.5665, "num_input_tokens_seen": 34848872, "step": 60030 }, { "epoch": 8.941763479296991, "grad_norm": 0.9843989014625549, "learning_rate": 3.378944655957458e-05, "loss": 0.7747, "num_input_tokens_seen": 34852168, "step": 60035 }, { "epoch": 8.94250819183795, "grad_norm": 1.1848589181900024, "learning_rate": 3.3786404511355616e-05, "loss": 0.6362, "num_input_tokens_seen": 34855144, "step": 60040 }, { "epoch": 8.943252904378909, "grad_norm": 1.6862351894378662, "learning_rate": 3.378336231469927e-05, "loss": 0.6935, "num_input_tokens_seen": 34858152, "step": 60045 }, { "epoch": 8.94399761691987, "grad_norm": 1.0223296880722046, "learning_rate": 3.378031996965695e-05, "loss": 0.6509, "num_input_tokens_seen": 34860872, "step": 60050 }, { "epoch": 8.944742329460828, "grad_norm": 1.9462186098098755, "learning_rate": 3.3777277476280036e-05, "loss": 0.6949, "num_input_tokens_seen": 34863560, "step": 60055 }, { "epoch": 8.945487042001787, "grad_norm": 2.0045244693756104, "learning_rate": 3.3774234834619956e-05, "loss": 0.7166, "num_input_tokens_seen": 34866600, "step": 60060 }, { "epoch": 8.946231754542747, "grad_norm": 1.9481369256973267, "learning_rate": 3.377119204472809e-05, "loss": 0.655, "num_input_tokens_seen": 34869608, "step": 60065 }, { "epoch": 8.946976467083706, "grad_norm": 1.2421109676361084, "learning_rate": 3.376814910665584e-05, "loss": 0.7758, "num_input_tokens_seen": 34872584, "step": 60070 }, { "epoch": 8.947721179624665, "grad_norm": 1.4230968952178955, "learning_rate": 3.3765106020454636e-05, "loss": 0.5293, "num_input_tokens_seen": 34875560, "step": 60075 }, { "epoch": 8.948465892165624, "grad_norm": 1.6116365194320679, "learning_rate": 3.376206278617587e-05, "loss": 0.7283, "num_input_tokens_seen": 34878216, "step": 60080 }, { "epoch": 8.949210604706582, "grad_norm": 0.5992555618286133, "learning_rate": 3.375901940387096e-05, "loss": 0.6384, "num_input_tokens_seen": 34880872, "step": 60085 }, { "epoch": 8.949955317247543, "grad_norm": 1.4037940502166748, "learning_rate": 3.375597587359131e-05, "loss": 0.8004, "num_input_tokens_seen": 34884072, "step": 60090 }, { "epoch": 8.950700029788502, "grad_norm": 1.161266565322876, "learning_rate": 3.375293219538836e-05, "loss": 0.5838, "num_input_tokens_seen": 34886952, "step": 60095 }, { "epoch": 8.95144474232946, "grad_norm": 1.0876624584197998, "learning_rate": 3.374988836931351e-05, "loss": 0.6671, "num_input_tokens_seen": 34889928, "step": 60100 }, { "epoch": 8.95218945487042, "grad_norm": 1.2069716453552246, "learning_rate": 3.374684439541819e-05, "loss": 0.6134, "num_input_tokens_seen": 34892584, "step": 60105 }, { "epoch": 8.95293416741138, "grad_norm": 1.3534923791885376, "learning_rate": 3.374380027375382e-05, "loss": 0.6672, "num_input_tokens_seen": 34895432, "step": 60110 }, { "epoch": 8.953678879952339, "grad_norm": 0.7861224412918091, "learning_rate": 3.374075600437183e-05, "loss": 0.6502, "num_input_tokens_seen": 34898280, "step": 60115 }, { "epoch": 8.954423592493297, "grad_norm": 1.8002896308898926, "learning_rate": 3.373771158732366e-05, "loss": 0.8023, "num_input_tokens_seen": 34901256, "step": 60120 }, { "epoch": 8.955168305034256, "grad_norm": 1.5501452684402466, "learning_rate": 3.3734667022660724e-05, "loss": 0.812, "num_input_tokens_seen": 34904072, "step": 60125 }, { "epoch": 8.955913017575217, "grad_norm": 1.2500555515289307, "learning_rate": 3.373162231043447e-05, "loss": 0.6692, "num_input_tokens_seen": 34906888, "step": 60130 }, { "epoch": 8.956657730116175, "grad_norm": 1.2044575214385986, "learning_rate": 3.3728577450696336e-05, "loss": 0.4969, "num_input_tokens_seen": 34909992, "step": 60135 }, { "epoch": 8.957402442657134, "grad_norm": 1.0662131309509277, "learning_rate": 3.372553244349775e-05, "loss": 0.5571, "num_input_tokens_seen": 34912840, "step": 60140 }, { "epoch": 8.958147155198093, "grad_norm": 1.2760366201400757, "learning_rate": 3.372248728889017e-05, "loss": 0.5353, "num_input_tokens_seen": 34915944, "step": 60145 }, { "epoch": 8.958891867739053, "grad_norm": 1.2080488204956055, "learning_rate": 3.371944198692502e-05, "loss": 0.616, "num_input_tokens_seen": 34918696, "step": 60150 }, { "epoch": 8.959636580280012, "grad_norm": 1.0509058237075806, "learning_rate": 3.3716396537653774e-05, "loss": 0.529, "num_input_tokens_seen": 34921736, "step": 60155 }, { "epoch": 8.960381292820971, "grad_norm": 2.1894748210906982, "learning_rate": 3.371335094112786e-05, "loss": 0.7056, "num_input_tokens_seen": 34924872, "step": 60160 }, { "epoch": 8.96112600536193, "grad_norm": 0.8566591143608093, "learning_rate": 3.371030519739874e-05, "loss": 0.4943, "num_input_tokens_seen": 34927656, "step": 60165 }, { "epoch": 8.96187071790289, "grad_norm": 1.3807860612869263, "learning_rate": 3.370725930651786e-05, "loss": 0.4688, "num_input_tokens_seen": 34930632, "step": 60170 }, { "epoch": 8.962615430443849, "grad_norm": 0.791877269744873, "learning_rate": 3.370421326853669e-05, "loss": 0.7204, "num_input_tokens_seen": 34933736, "step": 60175 }, { "epoch": 8.963360142984808, "grad_norm": 1.0405380725860596, "learning_rate": 3.370116708350668e-05, "loss": 0.6673, "num_input_tokens_seen": 34936552, "step": 60180 }, { "epoch": 8.964104855525767, "grad_norm": 1.8375415802001953, "learning_rate": 3.369812075147929e-05, "loss": 0.8683, "num_input_tokens_seen": 34939432, "step": 60185 }, { "epoch": 8.964849568066727, "grad_norm": 1.7116470336914062, "learning_rate": 3.369507427250601e-05, "loss": 0.5835, "num_input_tokens_seen": 34941992, "step": 60190 }, { "epoch": 8.965594280607686, "grad_norm": 1.5091423988342285, "learning_rate": 3.369202764663827e-05, "loss": 0.5651, "num_input_tokens_seen": 34944808, "step": 60195 }, { "epoch": 8.966338993148645, "grad_norm": 1.188315510749817, "learning_rate": 3.368898087392756e-05, "loss": 0.5276, "num_input_tokens_seen": 34948072, "step": 60200 }, { "epoch": 8.967083705689603, "grad_norm": 0.8900837898254395, "learning_rate": 3.3685933954425353e-05, "loss": 0.6694, "num_input_tokens_seen": 34950696, "step": 60205 }, { "epoch": 8.967828418230564, "grad_norm": 1.1966685056686401, "learning_rate": 3.368288688818312e-05, "loss": 0.569, "num_input_tokens_seen": 34953640, "step": 60210 }, { "epoch": 8.968573130771523, "grad_norm": 0.9185238480567932, "learning_rate": 3.367983967525234e-05, "loss": 0.7858, "num_input_tokens_seen": 34956392, "step": 60215 }, { "epoch": 8.969317843312481, "grad_norm": 1.7938318252563477, "learning_rate": 3.367679231568448e-05, "loss": 0.5352, "num_input_tokens_seen": 34959336, "step": 60220 }, { "epoch": 8.97006255585344, "grad_norm": 1.0707285404205322, "learning_rate": 3.367374480953104e-05, "loss": 0.7702, "num_input_tokens_seen": 34962024, "step": 60225 }, { "epoch": 8.970807268394399, "grad_norm": 1.1319669485092163, "learning_rate": 3.367069715684349e-05, "loss": 0.6533, "num_input_tokens_seen": 34964968, "step": 60230 }, { "epoch": 8.97155198093536, "grad_norm": 1.4330860376358032, "learning_rate": 3.366764935767333e-05, "loss": 0.6567, "num_input_tokens_seen": 34967912, "step": 60235 }, { "epoch": 8.972296693476318, "grad_norm": 0.7108640670776367, "learning_rate": 3.366460141207205e-05, "loss": 0.6826, "num_input_tokens_seen": 34970632, "step": 60240 }, { "epoch": 8.973041406017277, "grad_norm": 1.1226370334625244, "learning_rate": 3.366155332009113e-05, "loss": 0.7008, "num_input_tokens_seen": 34973320, "step": 60245 }, { "epoch": 8.973786118558236, "grad_norm": 1.478223204612732, "learning_rate": 3.3658505081782064e-05, "loss": 0.6313, "num_input_tokens_seen": 34976264, "step": 60250 }, { "epoch": 8.974530831099196, "grad_norm": 1.4380689859390259, "learning_rate": 3.3655456697196366e-05, "loss": 0.6905, "num_input_tokens_seen": 34978984, "step": 60255 }, { "epoch": 8.975275543640155, "grad_norm": 1.867032527923584, "learning_rate": 3.365240816638552e-05, "loss": 0.6783, "num_input_tokens_seen": 34981928, "step": 60260 }, { "epoch": 8.976020256181114, "grad_norm": 1.5206427574157715, "learning_rate": 3.3649359489401025e-05, "loss": 0.6062, "num_input_tokens_seen": 34984680, "step": 60265 }, { "epoch": 8.976764968722073, "grad_norm": 1.689987301826477, "learning_rate": 3.3646310666294396e-05, "loss": 0.7834, "num_input_tokens_seen": 34987400, "step": 60270 }, { "epoch": 8.977509681263033, "grad_norm": 1.1406278610229492, "learning_rate": 3.364326169711713e-05, "loss": 0.5862, "num_input_tokens_seen": 34990376, "step": 60275 }, { "epoch": 8.978254393803992, "grad_norm": 1.3253331184387207, "learning_rate": 3.364021258192075e-05, "loss": 0.4455, "num_input_tokens_seen": 34993128, "step": 60280 }, { "epoch": 8.97899910634495, "grad_norm": 1.35986328125, "learning_rate": 3.363716332075676e-05, "loss": 0.7022, "num_input_tokens_seen": 34995848, "step": 60285 }, { "epoch": 8.97974381888591, "grad_norm": 0.6511827111244202, "learning_rate": 3.363411391367668e-05, "loss": 0.494, "num_input_tokens_seen": 34998568, "step": 60290 }, { "epoch": 8.98048853142687, "grad_norm": 2.188502788543701, "learning_rate": 3.363106436073202e-05, "loss": 0.5366, "num_input_tokens_seen": 35001512, "step": 60295 }, { "epoch": 8.981233243967829, "grad_norm": 1.5320600271224976, "learning_rate": 3.362801466197429e-05, "loss": 0.5897, "num_input_tokens_seen": 35004680, "step": 60300 }, { "epoch": 8.981977956508787, "grad_norm": 0.7610694766044617, "learning_rate": 3.362496481745502e-05, "loss": 0.4831, "num_input_tokens_seen": 35007560, "step": 60305 }, { "epoch": 8.982722669049746, "grad_norm": 1.3381081819534302, "learning_rate": 3.362191482722574e-05, "loss": 0.8172, "num_input_tokens_seen": 35010344, "step": 60310 }, { "epoch": 8.983467381590707, "grad_norm": 1.092131495475769, "learning_rate": 3.361886469133798e-05, "loss": 0.6352, "num_input_tokens_seen": 35013160, "step": 60315 }, { "epoch": 8.984212094131665, "grad_norm": 1.4509162902832031, "learning_rate": 3.361581440984325e-05, "loss": 0.663, "num_input_tokens_seen": 35016008, "step": 60320 }, { "epoch": 8.984956806672624, "grad_norm": 1.4889823198318481, "learning_rate": 3.3612763982793094e-05, "loss": 0.7055, "num_input_tokens_seen": 35019208, "step": 60325 }, { "epoch": 8.985701519213583, "grad_norm": 0.9788690209388733, "learning_rate": 3.360971341023905e-05, "loss": 0.5541, "num_input_tokens_seen": 35021928, "step": 60330 }, { "epoch": 8.986446231754543, "grad_norm": 1.1935210227966309, "learning_rate": 3.360666269223264e-05, "loss": 0.666, "num_input_tokens_seen": 35024904, "step": 60335 }, { "epoch": 8.987190944295502, "grad_norm": 2.542759418487549, "learning_rate": 3.360361182882542e-05, "loss": 0.8096, "num_input_tokens_seen": 35027848, "step": 60340 }, { "epoch": 8.987935656836461, "grad_norm": 0.9760270118713379, "learning_rate": 3.3600560820068916e-05, "loss": 0.6033, "num_input_tokens_seen": 35030856, "step": 60345 }, { "epoch": 8.98868036937742, "grad_norm": 1.3037867546081543, "learning_rate": 3.3597509666014684e-05, "loss": 0.6031, "num_input_tokens_seen": 35033800, "step": 60350 }, { "epoch": 8.98942508191838, "grad_norm": 1.8474308252334595, "learning_rate": 3.359445836671426e-05, "loss": 0.7901, "num_input_tokens_seen": 35036712, "step": 60355 }, { "epoch": 8.990169794459339, "grad_norm": 1.2077457904815674, "learning_rate": 3.359140692221919e-05, "loss": 0.6515, "num_input_tokens_seen": 35039336, "step": 60360 }, { "epoch": 8.990914507000298, "grad_norm": 1.1831846237182617, "learning_rate": 3.3588355332581045e-05, "loss": 0.6586, "num_input_tokens_seen": 35042152, "step": 60365 }, { "epoch": 8.991659219541257, "grad_norm": 1.782500147819519, "learning_rate": 3.358530359785136e-05, "loss": 0.651, "num_input_tokens_seen": 35044936, "step": 60370 }, { "epoch": 8.992403932082215, "grad_norm": 1.009220004081726, "learning_rate": 3.35822517180817e-05, "loss": 0.625, "num_input_tokens_seen": 35047816, "step": 60375 }, { "epoch": 8.993148644623176, "grad_norm": 1.48080575466156, "learning_rate": 3.357919969332361e-05, "loss": 0.5524, "num_input_tokens_seen": 35050856, "step": 60380 }, { "epoch": 8.993893357164135, "grad_norm": 2.6798629760742188, "learning_rate": 3.357614752362867e-05, "loss": 0.6629, "num_input_tokens_seen": 35053672, "step": 60385 }, { "epoch": 8.994638069705093, "grad_norm": 0.9114723801612854, "learning_rate": 3.3573095209048435e-05, "loss": 0.6401, "num_input_tokens_seen": 35056360, "step": 60390 }, { "epoch": 8.995382782246054, "grad_norm": 1.9384663105010986, "learning_rate": 3.357004274963446e-05, "loss": 0.7943, "num_input_tokens_seen": 35059336, "step": 60395 }, { "epoch": 8.996127494787013, "grad_norm": 1.1244609355926514, "learning_rate": 3.356699014543833e-05, "loss": 0.5556, "num_input_tokens_seen": 35062120, "step": 60400 }, { "epoch": 8.996872207327971, "grad_norm": 1.1476941108703613, "learning_rate": 3.3563937396511607e-05, "loss": 0.6881, "num_input_tokens_seen": 35064904, "step": 60405 }, { "epoch": 8.99761691986893, "grad_norm": 0.9997794032096863, "learning_rate": 3.3560884502905865e-05, "loss": 0.617, "num_input_tokens_seen": 35068200, "step": 60410 }, { "epoch": 8.998361632409889, "grad_norm": 1.3038748502731323, "learning_rate": 3.355783146467268e-05, "loss": 0.7246, "num_input_tokens_seen": 35071304, "step": 60415 }, { "epoch": 8.99910634495085, "grad_norm": 1.8246780633926392, "learning_rate": 3.355477828186363e-05, "loss": 0.6122, "num_input_tokens_seen": 35074088, "step": 60420 }, { "epoch": 8.999851057491808, "grad_norm": 1.0773743391036987, "learning_rate": 3.3551724954530303e-05, "loss": 0.5925, "num_input_tokens_seen": 35076936, "step": 60425 }, { "epoch": 9.0, "eval_loss": 0.650672197341919, "eval_runtime": 47.0585, "eval_samples_per_second": 63.41, "eval_steps_per_second": 15.853, "num_input_tokens_seen": 35077032, "step": 60426 }, { "epoch": 9.000595770032767, "grad_norm": 1.3442095518112183, "learning_rate": 3.3548671482724267e-05, "loss": 0.6057, "num_input_tokens_seen": 35079464, "step": 60430 }, { "epoch": 9.001340482573726, "grad_norm": 0.8339857459068298, "learning_rate": 3.354561786649711e-05, "loss": 0.5576, "num_input_tokens_seen": 35082408, "step": 60435 }, { "epoch": 9.002085195114686, "grad_norm": 1.2514399290084839, "learning_rate": 3.354256410590043e-05, "loss": 0.5009, "num_input_tokens_seen": 35085608, "step": 60440 }, { "epoch": 9.002829907655645, "grad_norm": 0.7765415906906128, "learning_rate": 3.353951020098582e-05, "loss": 0.5724, "num_input_tokens_seen": 35088904, "step": 60445 }, { "epoch": 9.003574620196604, "grad_norm": 1.1519924402236938, "learning_rate": 3.353645615180485e-05, "loss": 0.7017, "num_input_tokens_seen": 35091816, "step": 60450 }, { "epoch": 9.004319332737563, "grad_norm": 1.1333198547363281, "learning_rate": 3.3533401958409136e-05, "loss": 0.4993, "num_input_tokens_seen": 35094760, "step": 60455 }, { "epoch": 9.005064045278523, "grad_norm": 1.2953723669052124, "learning_rate": 3.3530347620850276e-05, "loss": 0.6384, "num_input_tokens_seen": 35097512, "step": 60460 }, { "epoch": 9.005808757819482, "grad_norm": 1.5187764167785645, "learning_rate": 3.3527293139179854e-05, "loss": 0.6021, "num_input_tokens_seen": 35100456, "step": 60465 }, { "epoch": 9.00655347036044, "grad_norm": 1.773620367050171, "learning_rate": 3.352423851344948e-05, "loss": 0.7803, "num_input_tokens_seen": 35103304, "step": 60470 }, { "epoch": 9.0072981829014, "grad_norm": 0.9057937264442444, "learning_rate": 3.352118374371076e-05, "loss": 0.5144, "num_input_tokens_seen": 35106216, "step": 60475 }, { "epoch": 9.00804289544236, "grad_norm": 2.303365707397461, "learning_rate": 3.351812883001531e-05, "loss": 0.5535, "num_input_tokens_seen": 35109384, "step": 60480 }, { "epoch": 9.008787607983319, "grad_norm": 1.5573537349700928, "learning_rate": 3.3515073772414725e-05, "loss": 0.6777, "num_input_tokens_seen": 35112328, "step": 60485 }, { "epoch": 9.009532320524277, "grad_norm": 1.7125052213668823, "learning_rate": 3.351201857096062e-05, "loss": 0.682, "num_input_tokens_seen": 35115176, "step": 60490 }, { "epoch": 9.010277033065236, "grad_norm": 0.9626370072364807, "learning_rate": 3.350896322570462e-05, "loss": 0.4793, "num_input_tokens_seen": 35118088, "step": 60495 }, { "epoch": 9.011021745606197, "grad_norm": 1.0843921899795532, "learning_rate": 3.350590773669833e-05, "loss": 0.612, "num_input_tokens_seen": 35120904, "step": 60500 }, { "epoch": 9.011766458147155, "grad_norm": 0.9879947900772095, "learning_rate": 3.350285210399337e-05, "loss": 0.4879, "num_input_tokens_seen": 35123592, "step": 60505 }, { "epoch": 9.012511170688114, "grad_norm": 1.3604612350463867, "learning_rate": 3.3499796327641366e-05, "loss": 0.6663, "num_input_tokens_seen": 35126408, "step": 60510 }, { "epoch": 9.013255883229073, "grad_norm": 1.0613985061645508, "learning_rate": 3.349674040769394e-05, "loss": 0.6394, "num_input_tokens_seen": 35129512, "step": 60515 }, { "epoch": 9.014000595770034, "grad_norm": 1.7532484531402588, "learning_rate": 3.349368434420274e-05, "loss": 0.7693, "num_input_tokens_seen": 35132360, "step": 60520 }, { "epoch": 9.014745308310992, "grad_norm": 1.10719633102417, "learning_rate": 3.349062813721936e-05, "loss": 0.505, "num_input_tokens_seen": 35135272, "step": 60525 }, { "epoch": 9.015490020851951, "grad_norm": 1.0024453401565552, "learning_rate": 3.348757178679545e-05, "loss": 0.7012, "num_input_tokens_seen": 35138248, "step": 60530 }, { "epoch": 9.01623473339291, "grad_norm": 1.2808911800384521, "learning_rate": 3.3484515292982634e-05, "loss": 0.7408, "num_input_tokens_seen": 35140936, "step": 60535 }, { "epoch": 9.01697944593387, "grad_norm": 1.0294207334518433, "learning_rate": 3.348145865583256e-05, "loss": 0.7247, "num_input_tokens_seen": 35143752, "step": 60540 }, { "epoch": 9.017724158474829, "grad_norm": 1.2307454347610474, "learning_rate": 3.347840187539686e-05, "loss": 0.5248, "num_input_tokens_seen": 35146536, "step": 60545 }, { "epoch": 9.018468871015788, "grad_norm": 3.437333106994629, "learning_rate": 3.347534495172718e-05, "loss": 0.6718, "num_input_tokens_seen": 35149800, "step": 60550 }, { "epoch": 9.019213583556747, "grad_norm": 1.0995992422103882, "learning_rate": 3.3472287884875167e-05, "loss": 0.7689, "num_input_tokens_seen": 35152872, "step": 60555 }, { "epoch": 9.019958296097707, "grad_norm": 0.9993875622749329, "learning_rate": 3.346923067489245e-05, "loss": 0.5663, "num_input_tokens_seen": 35155752, "step": 60560 }, { "epoch": 9.020703008638666, "grad_norm": 1.1236460208892822, "learning_rate": 3.3466173321830705e-05, "loss": 0.501, "num_input_tokens_seen": 35158792, "step": 60565 }, { "epoch": 9.021447721179625, "grad_norm": 0.9534934163093567, "learning_rate": 3.346311582574155e-05, "loss": 0.5647, "num_input_tokens_seen": 35161832, "step": 60570 }, { "epoch": 9.022192433720583, "grad_norm": 1.0658602714538574, "learning_rate": 3.3460058186676656e-05, "loss": 0.7371, "num_input_tokens_seen": 35164616, "step": 60575 }, { "epoch": 9.022937146261542, "grad_norm": 2.0064380168914795, "learning_rate": 3.345700040468768e-05, "loss": 0.6944, "num_input_tokens_seen": 35167304, "step": 60580 }, { "epoch": 9.023681858802503, "grad_norm": 0.8689847588539124, "learning_rate": 3.345394247982628e-05, "loss": 0.5041, "num_input_tokens_seen": 35170312, "step": 60585 }, { "epoch": 9.024426571343461, "grad_norm": 0.7156358361244202, "learning_rate": 3.345088441214411e-05, "loss": 0.6023, "num_input_tokens_seen": 35173288, "step": 60590 }, { "epoch": 9.02517128388442, "grad_norm": 1.8720896244049072, "learning_rate": 3.344782620169284e-05, "loss": 0.7316, "num_input_tokens_seen": 35176424, "step": 60595 }, { "epoch": 9.025915996425379, "grad_norm": 1.2754685878753662, "learning_rate": 3.344476784852413e-05, "loss": 0.7323, "num_input_tokens_seen": 35179656, "step": 60600 }, { "epoch": 9.02666070896634, "grad_norm": 1.4588600397109985, "learning_rate": 3.344170935268966e-05, "loss": 0.718, "num_input_tokens_seen": 35182568, "step": 60605 }, { "epoch": 9.027405421507298, "grad_norm": 1.191611647605896, "learning_rate": 3.3438650714241084e-05, "loss": 0.5069, "num_input_tokens_seen": 35185768, "step": 60610 }, { "epoch": 9.028150134048257, "grad_norm": 1.546545147895813, "learning_rate": 3.3435591933230074e-05, "loss": 0.8113, "num_input_tokens_seen": 35188456, "step": 60615 }, { "epoch": 9.028894846589216, "grad_norm": 0.8790033459663391, "learning_rate": 3.343253300970832e-05, "loss": 0.5359, "num_input_tokens_seen": 35191720, "step": 60620 }, { "epoch": 9.029639559130176, "grad_norm": 2.028301954269409, "learning_rate": 3.3429473943727486e-05, "loss": 0.6793, "num_input_tokens_seen": 35194664, "step": 60625 }, { "epoch": 9.030384271671135, "grad_norm": 2.6867876052856445, "learning_rate": 3.342641473533926e-05, "loss": 0.5446, "num_input_tokens_seen": 35197576, "step": 60630 }, { "epoch": 9.031128984212094, "grad_norm": 0.9345393776893616, "learning_rate": 3.3423355384595316e-05, "loss": 0.8156, "num_input_tokens_seen": 35200552, "step": 60635 }, { "epoch": 9.031873696753053, "grad_norm": 1.4600974321365356, "learning_rate": 3.342029589154735e-05, "loss": 0.6073, "num_input_tokens_seen": 35203432, "step": 60640 }, { "epoch": 9.032618409294013, "grad_norm": 1.385637879371643, "learning_rate": 3.3417236256247044e-05, "loss": 0.581, "num_input_tokens_seen": 35206088, "step": 60645 }, { "epoch": 9.033363121834972, "grad_norm": 1.0559983253479004, "learning_rate": 3.341417647874608e-05, "loss": 0.5294, "num_input_tokens_seen": 35208968, "step": 60650 }, { "epoch": 9.03410783437593, "grad_norm": 0.7823620438575745, "learning_rate": 3.341111655909616e-05, "loss": 0.5066, "num_input_tokens_seen": 35211496, "step": 60655 }, { "epoch": 9.03485254691689, "grad_norm": 1.414414882659912, "learning_rate": 3.340805649734898e-05, "loss": 0.6672, "num_input_tokens_seen": 35214184, "step": 60660 }, { "epoch": 9.03559725945785, "grad_norm": 1.3552643060684204, "learning_rate": 3.340499629355622e-05, "loss": 0.7171, "num_input_tokens_seen": 35216968, "step": 60665 }, { "epoch": 9.036341971998809, "grad_norm": 2.30558443069458, "learning_rate": 3.34019359477696e-05, "loss": 0.5954, "num_input_tokens_seen": 35219592, "step": 60670 }, { "epoch": 9.037086684539767, "grad_norm": 1.5485695600509644, "learning_rate": 3.33988754600408e-05, "loss": 0.7475, "num_input_tokens_seen": 35222568, "step": 60675 }, { "epoch": 9.037831397080726, "grad_norm": 1.6238657236099243, "learning_rate": 3.339581483042155e-05, "loss": 0.7788, "num_input_tokens_seen": 35225384, "step": 60680 }, { "epoch": 9.038576109621687, "grad_norm": 1.52097487449646, "learning_rate": 3.339275405896353e-05, "loss": 0.6201, "num_input_tokens_seen": 35228200, "step": 60685 }, { "epoch": 9.039320822162646, "grad_norm": 2.2576990127563477, "learning_rate": 3.338969314571847e-05, "loss": 0.7474, "num_input_tokens_seen": 35231016, "step": 60690 }, { "epoch": 9.040065534703604, "grad_norm": 1.4244213104248047, "learning_rate": 3.338663209073806e-05, "loss": 0.5137, "num_input_tokens_seen": 35233896, "step": 60695 }, { "epoch": 9.040810247244563, "grad_norm": 1.630690097808838, "learning_rate": 3.338357089407403e-05, "loss": 0.7082, "num_input_tokens_seen": 35236552, "step": 60700 }, { "epoch": 9.041554959785524, "grad_norm": 1.1924333572387695, "learning_rate": 3.338050955577809e-05, "loss": 0.5998, "num_input_tokens_seen": 35239304, "step": 60705 }, { "epoch": 9.042299672326482, "grad_norm": 0.8430942296981812, "learning_rate": 3.337744807590196e-05, "loss": 0.5793, "num_input_tokens_seen": 35241864, "step": 60710 }, { "epoch": 9.043044384867441, "grad_norm": 1.0353641510009766, "learning_rate": 3.337438645449735e-05, "loss": 0.7029, "num_input_tokens_seen": 35244744, "step": 60715 }, { "epoch": 9.0437890974084, "grad_norm": 1.4709599018096924, "learning_rate": 3.3371324691616004e-05, "loss": 0.7828, "num_input_tokens_seen": 35247656, "step": 60720 }, { "epoch": 9.04453380994936, "grad_norm": 1.5306159257888794, "learning_rate": 3.3368262787309636e-05, "loss": 0.5876, "num_input_tokens_seen": 35250408, "step": 60725 }, { "epoch": 9.04527852249032, "grad_norm": 1.0605524778366089, "learning_rate": 3.3365200741629973e-05, "loss": 0.6541, "num_input_tokens_seen": 35253192, "step": 60730 }, { "epoch": 9.046023235031278, "grad_norm": 1.0205726623535156, "learning_rate": 3.336213855462874e-05, "loss": 0.5452, "num_input_tokens_seen": 35256136, "step": 60735 }, { "epoch": 9.046767947572237, "grad_norm": 1.8577591180801392, "learning_rate": 3.3359076226357675e-05, "loss": 0.5609, "num_input_tokens_seen": 35259112, "step": 60740 }, { "epoch": 9.047512660113195, "grad_norm": 1.2639656066894531, "learning_rate": 3.335601375686851e-05, "loss": 0.6301, "num_input_tokens_seen": 35262184, "step": 60745 }, { "epoch": 9.048257372654156, "grad_norm": 1.1343193054199219, "learning_rate": 3.335295114621299e-05, "loss": 0.4438, "num_input_tokens_seen": 35264968, "step": 60750 }, { "epoch": 9.049002085195115, "grad_norm": 0.9524393081665039, "learning_rate": 3.334988839444285e-05, "loss": 0.5775, "num_input_tokens_seen": 35267976, "step": 60755 }, { "epoch": 9.049746797736073, "grad_norm": 1.1029168367385864, "learning_rate": 3.3346825501609834e-05, "loss": 0.5397, "num_input_tokens_seen": 35271400, "step": 60760 }, { "epoch": 9.050491510277032, "grad_norm": 1.558846116065979, "learning_rate": 3.3343762467765685e-05, "loss": 0.6546, "num_input_tokens_seen": 35274312, "step": 60765 }, { "epoch": 9.051236222817993, "grad_norm": 1.1679363250732422, "learning_rate": 3.334069929296215e-05, "loss": 0.7002, "num_input_tokens_seen": 35277096, "step": 60770 }, { "epoch": 9.051980935358952, "grad_norm": 0.5206915140151978, "learning_rate": 3.333763597725097e-05, "loss": 0.6718, "num_input_tokens_seen": 35280072, "step": 60775 }, { "epoch": 9.05272564789991, "grad_norm": 1.7153277397155762, "learning_rate": 3.333457252068391e-05, "loss": 0.7927, "num_input_tokens_seen": 35282920, "step": 60780 }, { "epoch": 9.053470360440869, "grad_norm": 2.2218549251556396, "learning_rate": 3.333150892331271e-05, "loss": 0.5722, "num_input_tokens_seen": 35285864, "step": 60785 }, { "epoch": 9.05421507298183, "grad_norm": 0.9252157807350159, "learning_rate": 3.3328445185189145e-05, "loss": 0.4632, "num_input_tokens_seen": 35288552, "step": 60790 }, { "epoch": 9.054959785522788, "grad_norm": 1.3535205125808716, "learning_rate": 3.332538130636496e-05, "loss": 0.5612, "num_input_tokens_seen": 35291176, "step": 60795 }, { "epoch": 9.055704498063747, "grad_norm": 1.4136959314346313, "learning_rate": 3.3322317286891913e-05, "loss": 0.6569, "num_input_tokens_seen": 35294120, "step": 60800 }, { "epoch": 9.056449210604706, "grad_norm": 0.9999324083328247, "learning_rate": 3.331925312682178e-05, "loss": 0.6078, "num_input_tokens_seen": 35297128, "step": 60805 }, { "epoch": 9.057193923145666, "grad_norm": 1.2910091876983643, "learning_rate": 3.331618882620632e-05, "loss": 0.5342, "num_input_tokens_seen": 35300104, "step": 60810 }, { "epoch": 9.057938635686625, "grad_norm": 1.0662733316421509, "learning_rate": 3.3313124385097306e-05, "loss": 0.6079, "num_input_tokens_seen": 35303048, "step": 60815 }, { "epoch": 9.058683348227584, "grad_norm": 1.3054299354553223, "learning_rate": 3.33100598035465e-05, "loss": 0.3916, "num_input_tokens_seen": 35306216, "step": 60820 }, { "epoch": 9.059428060768543, "grad_norm": 2.443385362625122, "learning_rate": 3.3306995081605686e-05, "loss": 0.813, "num_input_tokens_seen": 35308904, "step": 60825 }, { "epoch": 9.060172773309503, "grad_norm": 1.368707299232483, "learning_rate": 3.3303930219326625e-05, "loss": 0.623, "num_input_tokens_seen": 35311880, "step": 60830 }, { "epoch": 9.060917485850462, "grad_norm": 1.3037142753601074, "learning_rate": 3.33008652167611e-05, "loss": 0.6252, "num_input_tokens_seen": 35314984, "step": 60835 }, { "epoch": 9.06166219839142, "grad_norm": 1.2620294094085693, "learning_rate": 3.32978000739609e-05, "loss": 0.5894, "num_input_tokens_seen": 35317832, "step": 60840 }, { "epoch": 9.06240691093238, "grad_norm": 1.2364048957824707, "learning_rate": 3.32947347909778e-05, "loss": 0.4677, "num_input_tokens_seen": 35320776, "step": 60845 }, { "epoch": 9.06315162347334, "grad_norm": 1.0409551858901978, "learning_rate": 3.329166936786359e-05, "loss": 0.5137, "num_input_tokens_seen": 35323592, "step": 60850 }, { "epoch": 9.063896336014299, "grad_norm": 1.6589022874832153, "learning_rate": 3.328860380467005e-05, "loss": 0.6238, "num_input_tokens_seen": 35326856, "step": 60855 }, { "epoch": 9.064641048555258, "grad_norm": 1.7570314407348633, "learning_rate": 3.328553810144897e-05, "loss": 0.6338, "num_input_tokens_seen": 35329544, "step": 60860 }, { "epoch": 9.065385761096216, "grad_norm": 1.1626427173614502, "learning_rate": 3.328247225825215e-05, "loss": 0.6347, "num_input_tokens_seen": 35333000, "step": 60865 }, { "epoch": 9.066130473637177, "grad_norm": 0.9880113005638123, "learning_rate": 3.327940627513137e-05, "loss": 0.5742, "num_input_tokens_seen": 35335816, "step": 60870 }, { "epoch": 9.066875186178136, "grad_norm": 1.3042206764221191, "learning_rate": 3.327634015213844e-05, "loss": 0.7779, "num_input_tokens_seen": 35339112, "step": 60875 }, { "epoch": 9.067619898719094, "grad_norm": 1.209391713142395, "learning_rate": 3.327327388932516e-05, "loss": 0.6826, "num_input_tokens_seen": 35342344, "step": 60880 }, { "epoch": 9.068364611260053, "grad_norm": 0.9932790994644165, "learning_rate": 3.327020748674333e-05, "loss": 0.5135, "num_input_tokens_seen": 35345352, "step": 60885 }, { "epoch": 9.069109323801014, "grad_norm": 1.7036446332931519, "learning_rate": 3.326714094444474e-05, "loss": 0.7324, "num_input_tokens_seen": 35348232, "step": 60890 }, { "epoch": 9.069854036341972, "grad_norm": 1.0075335502624512, "learning_rate": 3.326407426248121e-05, "loss": 0.6894, "num_input_tokens_seen": 35351400, "step": 60895 }, { "epoch": 9.070598748882931, "grad_norm": 1.0844404697418213, "learning_rate": 3.326100744090455e-05, "loss": 0.733, "num_input_tokens_seen": 35354184, "step": 60900 }, { "epoch": 9.07134346142389, "grad_norm": 1.6125291585922241, "learning_rate": 3.3257940479766544e-05, "loss": 0.584, "num_input_tokens_seen": 35357192, "step": 60905 }, { "epoch": 9.07208817396485, "grad_norm": 1.651610016822815, "learning_rate": 3.3254873379119044e-05, "loss": 0.6547, "num_input_tokens_seen": 35360072, "step": 60910 }, { "epoch": 9.07283288650581, "grad_norm": 1.9442801475524902, "learning_rate": 3.325180613901385e-05, "loss": 0.597, "num_input_tokens_seen": 35362728, "step": 60915 }, { "epoch": 9.073577599046768, "grad_norm": 0.5899292230606079, "learning_rate": 3.3248738759502775e-05, "loss": 0.5579, "num_input_tokens_seen": 35365512, "step": 60920 }, { "epoch": 9.074322311587727, "grad_norm": 1.3270535469055176, "learning_rate": 3.3245671240637635e-05, "loss": 0.6888, "num_input_tokens_seen": 35368296, "step": 60925 }, { "epoch": 9.075067024128685, "grad_norm": 1.0019307136535645, "learning_rate": 3.324260358247028e-05, "loss": 0.5627, "num_input_tokens_seen": 35371496, "step": 60930 }, { "epoch": 9.075811736669646, "grad_norm": 1.8553643226623535, "learning_rate": 3.323953578505249e-05, "loss": 0.5959, "num_input_tokens_seen": 35374184, "step": 60935 }, { "epoch": 9.076556449210605, "grad_norm": 1.1799594163894653, "learning_rate": 3.323646784843613e-05, "loss": 0.5963, "num_input_tokens_seen": 35377064, "step": 60940 }, { "epoch": 9.077301161751564, "grad_norm": 1.0905632972717285, "learning_rate": 3.323339977267301e-05, "loss": 0.6993, "num_input_tokens_seen": 35380040, "step": 60945 }, { "epoch": 9.078045874292522, "grad_norm": 0.6705948710441589, "learning_rate": 3.3230331557814975e-05, "loss": 0.5908, "num_input_tokens_seen": 35382920, "step": 60950 }, { "epoch": 9.078790586833483, "grad_norm": 0.6662572026252747, "learning_rate": 3.322726320391386e-05, "loss": 0.6053, "num_input_tokens_seen": 35385672, "step": 60955 }, { "epoch": 9.079535299374442, "grad_norm": 1.2193655967712402, "learning_rate": 3.322419471102148e-05, "loss": 0.732, "num_input_tokens_seen": 35388712, "step": 60960 }, { "epoch": 9.0802800119154, "grad_norm": 1.9449094533920288, "learning_rate": 3.3221126079189704e-05, "loss": 0.6002, "num_input_tokens_seen": 35391912, "step": 60965 }, { "epoch": 9.081024724456359, "grad_norm": 1.23288094997406, "learning_rate": 3.321805730847035e-05, "loss": 0.6119, "num_input_tokens_seen": 35394600, "step": 60970 }, { "epoch": 9.08176943699732, "grad_norm": 2.209538221359253, "learning_rate": 3.321498839891527e-05, "loss": 0.5989, "num_input_tokens_seen": 35397352, "step": 60975 }, { "epoch": 9.082514149538278, "grad_norm": 0.7385549545288086, "learning_rate": 3.321191935057631e-05, "loss": 0.4773, "num_input_tokens_seen": 35399912, "step": 60980 }, { "epoch": 9.083258862079237, "grad_norm": 1.0992074012756348, "learning_rate": 3.3208850163505314e-05, "loss": 0.6678, "num_input_tokens_seen": 35402600, "step": 60985 }, { "epoch": 9.084003574620196, "grad_norm": 0.7762351036071777, "learning_rate": 3.3205780837754154e-05, "loss": 0.551, "num_input_tokens_seen": 35405512, "step": 60990 }, { "epoch": 9.084748287161156, "grad_norm": 3.590486764907837, "learning_rate": 3.3202711373374654e-05, "loss": 0.7522, "num_input_tokens_seen": 35408200, "step": 60995 }, { "epoch": 9.085492999702115, "grad_norm": 1.0547420978546143, "learning_rate": 3.319964177041868e-05, "loss": 0.4927, "num_input_tokens_seen": 35411272, "step": 61000 }, { "epoch": 9.086237712243074, "grad_norm": 1.7301688194274902, "learning_rate": 3.31965720289381e-05, "loss": 0.5215, "num_input_tokens_seen": 35414152, "step": 61005 }, { "epoch": 9.086982424784033, "grad_norm": 0.9518055319786072, "learning_rate": 3.319350214898476e-05, "loss": 0.5001, "num_input_tokens_seen": 35416776, "step": 61010 }, { "epoch": 9.087727137324993, "grad_norm": 1.5864681005477905, "learning_rate": 3.319043213061053e-05, "loss": 0.6454, "num_input_tokens_seen": 35419624, "step": 61015 }, { "epoch": 9.088471849865952, "grad_norm": 2.0995683670043945, "learning_rate": 3.318736197386728e-05, "loss": 0.6272, "num_input_tokens_seen": 35422760, "step": 61020 }, { "epoch": 9.08921656240691, "grad_norm": 2.2253119945526123, "learning_rate": 3.3184291678806866e-05, "loss": 0.8087, "num_input_tokens_seen": 35425736, "step": 61025 }, { "epoch": 9.08996127494787, "grad_norm": 0.7616510391235352, "learning_rate": 3.3181221245481164e-05, "loss": 0.5352, "num_input_tokens_seen": 35428552, "step": 61030 }, { "epoch": 9.09070598748883, "grad_norm": 1.234377384185791, "learning_rate": 3.317815067394204e-05, "loss": 0.5923, "num_input_tokens_seen": 35431496, "step": 61035 }, { "epoch": 9.091450700029789, "grad_norm": 1.2065125703811646, "learning_rate": 3.317507996424137e-05, "loss": 0.622, "num_input_tokens_seen": 35434248, "step": 61040 }, { "epoch": 9.092195412570748, "grad_norm": 0.6879681348800659, "learning_rate": 3.317200911643103e-05, "loss": 0.5414, "num_input_tokens_seen": 35437032, "step": 61045 }, { "epoch": 9.092940125111706, "grad_norm": 1.7420017719268799, "learning_rate": 3.316893813056292e-05, "loss": 0.5428, "num_input_tokens_seen": 35439976, "step": 61050 }, { "epoch": 9.093684837652667, "grad_norm": 1.1357897520065308, "learning_rate": 3.3165867006688894e-05, "loss": 0.6367, "num_input_tokens_seen": 35442824, "step": 61055 }, { "epoch": 9.094429550193626, "grad_norm": 0.9136952757835388, "learning_rate": 3.3162795744860845e-05, "loss": 0.5439, "num_input_tokens_seen": 35445736, "step": 61060 }, { "epoch": 9.095174262734584, "grad_norm": 1.1355894804000854, "learning_rate": 3.315972434513065e-05, "loss": 0.6222, "num_input_tokens_seen": 35448520, "step": 61065 }, { "epoch": 9.095918975275543, "grad_norm": 0.6865509748458862, "learning_rate": 3.315665280755021e-05, "loss": 0.3686, "num_input_tokens_seen": 35451432, "step": 61070 }, { "epoch": 9.096663687816504, "grad_norm": 2.555626630783081, "learning_rate": 3.315358113217141e-05, "loss": 0.6944, "num_input_tokens_seen": 35454504, "step": 61075 }, { "epoch": 9.097408400357462, "grad_norm": 1.2464019060134888, "learning_rate": 3.315050931904614e-05, "loss": 0.6251, "num_input_tokens_seen": 35457640, "step": 61080 }, { "epoch": 9.098153112898421, "grad_norm": 0.8752123117446899, "learning_rate": 3.314743736822631e-05, "loss": 0.509, "num_input_tokens_seen": 35460872, "step": 61085 }, { "epoch": 9.09889782543938, "grad_norm": 1.2904425859451294, "learning_rate": 3.314436527976381e-05, "loss": 0.361, "num_input_tokens_seen": 35463688, "step": 61090 }, { "epoch": 9.099642537980339, "grad_norm": 1.1149094104766846, "learning_rate": 3.314129305371052e-05, "loss": 0.6813, "num_input_tokens_seen": 35466440, "step": 61095 }, { "epoch": 9.1003872505213, "grad_norm": 1.0835719108581543, "learning_rate": 3.313822069011837e-05, "loss": 0.6624, "num_input_tokens_seen": 35469448, "step": 61100 }, { "epoch": 9.101131963062258, "grad_norm": 0.8074672222137451, "learning_rate": 3.313514818903924e-05, "loss": 0.6722, "num_input_tokens_seen": 35472552, "step": 61105 }, { "epoch": 9.101876675603217, "grad_norm": 1.3039069175720215, "learning_rate": 3.313207555052505e-05, "loss": 0.6687, "num_input_tokens_seen": 35475944, "step": 61110 }, { "epoch": 9.102621388144176, "grad_norm": 1.3877166509628296, "learning_rate": 3.3129002774627723e-05, "loss": 0.7414, "num_input_tokens_seen": 35479336, "step": 61115 }, { "epoch": 9.103366100685136, "grad_norm": 1.4967443943023682, "learning_rate": 3.3125929861399155e-05, "loss": 0.6213, "num_input_tokens_seen": 35482088, "step": 61120 }, { "epoch": 9.104110813226095, "grad_norm": 1.6288260221481323, "learning_rate": 3.3122856810891245e-05, "loss": 0.6347, "num_input_tokens_seen": 35485000, "step": 61125 }, { "epoch": 9.104855525767054, "grad_norm": 0.9569479823112488, "learning_rate": 3.311978362315594e-05, "loss": 0.7118, "num_input_tokens_seen": 35487944, "step": 61130 }, { "epoch": 9.105600238308012, "grad_norm": 1.0187311172485352, "learning_rate": 3.3116710298245134e-05, "loss": 0.5795, "num_input_tokens_seen": 35490760, "step": 61135 }, { "epoch": 9.106344950848973, "grad_norm": 1.19954252243042, "learning_rate": 3.311363683621076e-05, "loss": 0.5658, "num_input_tokens_seen": 35493768, "step": 61140 }, { "epoch": 9.107089663389932, "grad_norm": 1.856719732284546, "learning_rate": 3.311056323710474e-05, "loss": 0.5583, "num_input_tokens_seen": 35496584, "step": 61145 }, { "epoch": 9.10783437593089, "grad_norm": 0.9427337646484375, "learning_rate": 3.3107489500978996e-05, "loss": 0.5327, "num_input_tokens_seen": 35499144, "step": 61150 }, { "epoch": 9.10857908847185, "grad_norm": 0.8632135987281799, "learning_rate": 3.310441562788546e-05, "loss": 0.6808, "num_input_tokens_seen": 35501864, "step": 61155 }, { "epoch": 9.10932380101281, "grad_norm": 1.682958960533142, "learning_rate": 3.310134161787605e-05, "loss": 0.7272, "num_input_tokens_seen": 35504808, "step": 61160 }, { "epoch": 9.110068513553768, "grad_norm": 0.9661756753921509, "learning_rate": 3.309826747100272e-05, "loss": 0.6028, "num_input_tokens_seen": 35507816, "step": 61165 }, { "epoch": 9.110813226094727, "grad_norm": 1.6209678649902344, "learning_rate": 3.309519318731739e-05, "loss": 0.6659, "num_input_tokens_seen": 35510824, "step": 61170 }, { "epoch": 9.111557938635686, "grad_norm": 0.9003808498382568, "learning_rate": 3.309211876687199e-05, "loss": 0.6696, "num_input_tokens_seen": 35513800, "step": 61175 }, { "epoch": 9.112302651176647, "grad_norm": 1.240940809249878, "learning_rate": 3.308904420971847e-05, "loss": 0.635, "num_input_tokens_seen": 35516520, "step": 61180 }, { "epoch": 9.113047363717605, "grad_norm": 0.6448483467102051, "learning_rate": 3.308596951590877e-05, "loss": 0.5965, "num_input_tokens_seen": 35519336, "step": 61185 }, { "epoch": 9.113792076258564, "grad_norm": 0.7866686582565308, "learning_rate": 3.308289468549484e-05, "loss": 0.5944, "num_input_tokens_seen": 35522216, "step": 61190 }, { "epoch": 9.114536788799523, "grad_norm": 1.2541954517364502, "learning_rate": 3.30798197185286e-05, "loss": 0.5346, "num_input_tokens_seen": 35524904, "step": 61195 }, { "epoch": 9.115281501340483, "grad_norm": 1.6013494729995728, "learning_rate": 3.307674461506204e-05, "loss": 0.7114, "num_input_tokens_seen": 35527720, "step": 61200 }, { "epoch": 9.116026213881442, "grad_norm": 1.0900404453277588, "learning_rate": 3.3073669375147074e-05, "loss": 0.5843, "num_input_tokens_seen": 35530856, "step": 61205 }, { "epoch": 9.1167709264224, "grad_norm": 1.7124732732772827, "learning_rate": 3.307059399883568e-05, "loss": 0.5756, "num_input_tokens_seen": 35533832, "step": 61210 }, { "epoch": 9.11751563896336, "grad_norm": 2.9589366912841797, "learning_rate": 3.30675184861798e-05, "loss": 0.9067, "num_input_tokens_seen": 35536552, "step": 61215 }, { "epoch": 9.11826035150432, "grad_norm": 1.3084813356399536, "learning_rate": 3.30644428372314e-05, "loss": 0.5746, "num_input_tokens_seen": 35539336, "step": 61220 }, { "epoch": 9.119005064045279, "grad_norm": 0.786375880241394, "learning_rate": 3.306136705204242e-05, "loss": 0.4573, "num_input_tokens_seen": 35542184, "step": 61225 }, { "epoch": 9.119749776586238, "grad_norm": 1.070980429649353, "learning_rate": 3.3058291130664844e-05, "loss": 0.6464, "num_input_tokens_seen": 35545032, "step": 61230 }, { "epoch": 9.120494489127196, "grad_norm": 1.685831904411316, "learning_rate": 3.305521507315063e-05, "loss": 0.5571, "num_input_tokens_seen": 35548008, "step": 61235 }, { "epoch": 9.121239201668157, "grad_norm": 1.4566245079040527, "learning_rate": 3.305213887955174e-05, "loss": 0.62, "num_input_tokens_seen": 35550728, "step": 61240 }, { "epoch": 9.121983914209116, "grad_norm": 1.7025471925735474, "learning_rate": 3.3049062549920154e-05, "loss": 0.6754, "num_input_tokens_seen": 35553544, "step": 61245 }, { "epoch": 9.122728626750074, "grad_norm": 1.8977937698364258, "learning_rate": 3.3045986084307835e-05, "loss": 0.6974, "num_input_tokens_seen": 35556712, "step": 61250 }, { "epoch": 9.123473339291033, "grad_norm": 1.6754177808761597, "learning_rate": 3.304290948276677e-05, "loss": 0.6287, "num_input_tokens_seen": 35559432, "step": 61255 }, { "epoch": 9.124218051831992, "grad_norm": 2.0716309547424316, "learning_rate": 3.30398327453489e-05, "loss": 0.7286, "num_input_tokens_seen": 35562248, "step": 61260 }, { "epoch": 9.124962764372953, "grad_norm": 1.0415288209915161, "learning_rate": 3.303675587210624e-05, "loss": 0.6885, "num_input_tokens_seen": 35565256, "step": 61265 }, { "epoch": 9.125707476913911, "grad_norm": 1.1135660409927368, "learning_rate": 3.3033678863090756e-05, "loss": 0.6808, "num_input_tokens_seen": 35568200, "step": 61270 }, { "epoch": 9.12645218945487, "grad_norm": 0.866766095161438, "learning_rate": 3.303060171835444e-05, "loss": 0.613, "num_input_tokens_seen": 35570984, "step": 61275 }, { "epoch": 9.127196901995829, "grad_norm": 1.2553870677947998, "learning_rate": 3.302752443794925e-05, "loss": 0.6649, "num_input_tokens_seen": 35573992, "step": 61280 }, { "epoch": 9.12794161453679, "grad_norm": 4.636324882507324, "learning_rate": 3.302444702192722e-05, "loss": 0.5427, "num_input_tokens_seen": 35576872, "step": 61285 }, { "epoch": 9.128686327077748, "grad_norm": 0.8436137437820435, "learning_rate": 3.30213694703403e-05, "loss": 0.5737, "num_input_tokens_seen": 35579944, "step": 61290 }, { "epoch": 9.129431039618707, "grad_norm": 1.1794475317001343, "learning_rate": 3.3018291783240495e-05, "loss": 0.7103, "num_input_tokens_seen": 35582568, "step": 61295 }, { "epoch": 9.130175752159666, "grad_norm": 1.176944375038147, "learning_rate": 3.3015213960679796e-05, "loss": 0.4314, "num_input_tokens_seen": 35585544, "step": 61300 }, { "epoch": 9.130920464700626, "grad_norm": 1.3034002780914307, "learning_rate": 3.301213600271021e-05, "loss": 0.701, "num_input_tokens_seen": 35588648, "step": 61305 }, { "epoch": 9.131665177241585, "grad_norm": 1.315360188484192, "learning_rate": 3.3009057909383725e-05, "loss": 0.6482, "num_input_tokens_seen": 35591688, "step": 61310 }, { "epoch": 9.132409889782544, "grad_norm": 1.2417141199111938, "learning_rate": 3.300597968075235e-05, "loss": 0.7875, "num_input_tokens_seen": 35594760, "step": 61315 }, { "epoch": 9.133154602323502, "grad_norm": 1.2690081596374512, "learning_rate": 3.3002901316868085e-05, "loss": 0.4925, "num_input_tokens_seen": 35597576, "step": 61320 }, { "epoch": 9.133899314864463, "grad_norm": 1.9762938022613525, "learning_rate": 3.299982281778293e-05, "loss": 0.6377, "num_input_tokens_seen": 35600392, "step": 61325 }, { "epoch": 9.134644027405422, "grad_norm": 1.488670825958252, "learning_rate": 3.2996744183548905e-05, "loss": 0.5378, "num_input_tokens_seen": 35603432, "step": 61330 }, { "epoch": 9.13538873994638, "grad_norm": 1.1144156455993652, "learning_rate": 3.2993665414218024e-05, "loss": 0.7583, "num_input_tokens_seen": 35606120, "step": 61335 }, { "epoch": 9.13613345248734, "grad_norm": 1.0265637636184692, "learning_rate": 3.2990586509842274e-05, "loss": 0.6095, "num_input_tokens_seen": 35609256, "step": 61340 }, { "epoch": 9.1368781650283, "grad_norm": 1.3765701055526733, "learning_rate": 3.298750747047369e-05, "loss": 0.6297, "num_input_tokens_seen": 35612360, "step": 61345 }, { "epoch": 9.137622877569259, "grad_norm": 1.1259853839874268, "learning_rate": 3.2984428296164296e-05, "loss": 0.6028, "num_input_tokens_seen": 35615304, "step": 61350 }, { "epoch": 9.138367590110217, "grad_norm": 1.2788212299346924, "learning_rate": 3.298134898696609e-05, "loss": 0.573, "num_input_tokens_seen": 35618376, "step": 61355 }, { "epoch": 9.139112302651176, "grad_norm": 2.195823907852173, "learning_rate": 3.297826954293111e-05, "loss": 0.6048, "num_input_tokens_seen": 35621256, "step": 61360 }, { "epoch": 9.139857015192137, "grad_norm": 1.7860679626464844, "learning_rate": 3.2975189964111365e-05, "loss": 0.6081, "num_input_tokens_seen": 35624040, "step": 61365 }, { "epoch": 9.140601727733095, "grad_norm": 3.2185521125793457, "learning_rate": 3.2972110250558895e-05, "loss": 0.9465, "num_input_tokens_seen": 35626856, "step": 61370 }, { "epoch": 9.141346440274054, "grad_norm": 1.4356615543365479, "learning_rate": 3.296903040232573e-05, "loss": 0.7394, "num_input_tokens_seen": 35630152, "step": 61375 }, { "epoch": 9.142091152815013, "grad_norm": 0.7873650789260864, "learning_rate": 3.29659504194639e-05, "loss": 0.6715, "num_input_tokens_seen": 35633128, "step": 61380 }, { "epoch": 9.142835865355973, "grad_norm": 1.2335219383239746, "learning_rate": 3.296287030202543e-05, "loss": 0.7566, "num_input_tokens_seen": 35636360, "step": 61385 }, { "epoch": 9.143580577896932, "grad_norm": 1.6211800575256348, "learning_rate": 3.295979005006235e-05, "loss": 0.7229, "num_input_tokens_seen": 35639144, "step": 61390 }, { "epoch": 9.14432529043789, "grad_norm": 1.0399842262268066, "learning_rate": 3.295670966362672e-05, "loss": 0.4822, "num_input_tokens_seen": 35641928, "step": 61395 }, { "epoch": 9.14507000297885, "grad_norm": 1.2386951446533203, "learning_rate": 3.2953629142770556e-05, "loss": 0.5428, "num_input_tokens_seen": 35644680, "step": 61400 }, { "epoch": 9.14581471551981, "grad_norm": 1.9357831478118896, "learning_rate": 3.295054848754591e-05, "loss": 0.9441, "num_input_tokens_seen": 35647720, "step": 61405 }, { "epoch": 9.146559428060769, "grad_norm": 1.867835521697998, "learning_rate": 3.294746769800484e-05, "loss": 0.7698, "num_input_tokens_seen": 35650472, "step": 61410 }, { "epoch": 9.147304140601728, "grad_norm": 1.291704773902893, "learning_rate": 3.2944386774199373e-05, "loss": 0.661, "num_input_tokens_seen": 35653160, "step": 61415 }, { "epoch": 9.148048853142686, "grad_norm": 1.7166807651519775, "learning_rate": 3.294130571618157e-05, "loss": 0.6427, "num_input_tokens_seen": 35656456, "step": 61420 }, { "epoch": 9.148793565683647, "grad_norm": 1.198453664779663, "learning_rate": 3.2938224524003483e-05, "loss": 0.4994, "num_input_tokens_seen": 35659240, "step": 61425 }, { "epoch": 9.149538278224606, "grad_norm": 1.4490760564804077, "learning_rate": 3.293514319771715e-05, "loss": 0.5808, "num_input_tokens_seen": 35662056, "step": 61430 }, { "epoch": 9.150282990765565, "grad_norm": 1.7765759229660034, "learning_rate": 3.2932061737374635e-05, "loss": 0.5769, "num_input_tokens_seen": 35664968, "step": 61435 }, { "epoch": 9.151027703306523, "grad_norm": 1.244788646697998, "learning_rate": 3.292898014302801e-05, "loss": 0.593, "num_input_tokens_seen": 35667624, "step": 61440 }, { "epoch": 9.151772415847482, "grad_norm": 1.4169553518295288, "learning_rate": 3.292589841472932e-05, "loss": 0.8278, "num_input_tokens_seen": 35670504, "step": 61445 }, { "epoch": 9.152517128388443, "grad_norm": 1.2071460485458374, "learning_rate": 3.292281655253063e-05, "loss": 0.6962, "num_input_tokens_seen": 35673448, "step": 61450 }, { "epoch": 9.153261840929401, "grad_norm": 2.520002841949463, "learning_rate": 3.291973455648401e-05, "loss": 0.6332, "num_input_tokens_seen": 35676360, "step": 61455 }, { "epoch": 9.15400655347036, "grad_norm": 1.382935643196106, "learning_rate": 3.291665242664152e-05, "loss": 0.6636, "num_input_tokens_seen": 35679304, "step": 61460 }, { "epoch": 9.154751266011319, "grad_norm": 1.0624829530715942, "learning_rate": 3.291357016305523e-05, "loss": 0.6999, "num_input_tokens_seen": 35682440, "step": 61465 }, { "epoch": 9.15549597855228, "grad_norm": 2.341625452041626, "learning_rate": 3.291048776577722e-05, "loss": 0.5585, "num_input_tokens_seen": 35685256, "step": 61470 }, { "epoch": 9.156240691093238, "grad_norm": 1.4190055131912231, "learning_rate": 3.290740523485956e-05, "loss": 0.6526, "num_input_tokens_seen": 35688264, "step": 61475 }, { "epoch": 9.156985403634197, "grad_norm": 1.1827354431152344, "learning_rate": 3.290432257035432e-05, "loss": 0.588, "num_input_tokens_seen": 35691272, "step": 61480 }, { "epoch": 9.157730116175156, "grad_norm": 1.6096245050430298, "learning_rate": 3.29012397723136e-05, "loss": 0.8018, "num_input_tokens_seen": 35694344, "step": 61485 }, { "epoch": 9.158474828716116, "grad_norm": 1.4308346509933472, "learning_rate": 3.289815684078944e-05, "loss": 0.6424, "num_input_tokens_seen": 35697384, "step": 61490 }, { "epoch": 9.159219541257075, "grad_norm": 0.9901154637336731, "learning_rate": 3.2895073775833976e-05, "loss": 0.5418, "num_input_tokens_seen": 35700296, "step": 61495 }, { "epoch": 9.159964253798034, "grad_norm": 1.4650501012802124, "learning_rate": 3.2891990577499246e-05, "loss": 0.4794, "num_input_tokens_seen": 35702920, "step": 61500 }, { "epoch": 9.160708966338992, "grad_norm": 0.893925666809082, "learning_rate": 3.2888907245837356e-05, "loss": 0.5705, "num_input_tokens_seen": 35705672, "step": 61505 }, { "epoch": 9.161453678879953, "grad_norm": 1.5403331518173218, "learning_rate": 3.2885823780900395e-05, "loss": 0.5519, "num_input_tokens_seen": 35708744, "step": 61510 }, { "epoch": 9.162198391420912, "grad_norm": 1.8000562191009521, "learning_rate": 3.2882740182740466e-05, "loss": 0.7057, "num_input_tokens_seen": 35711752, "step": 61515 }, { "epoch": 9.16294310396187, "grad_norm": 0.9579048752784729, "learning_rate": 3.2879656451409644e-05, "loss": 0.584, "num_input_tokens_seen": 35714696, "step": 61520 }, { "epoch": 9.16368781650283, "grad_norm": 1.1667293310165405, "learning_rate": 3.287657258696004e-05, "loss": 0.4614, "num_input_tokens_seen": 35717576, "step": 61525 }, { "epoch": 9.16443252904379, "grad_norm": 1.316542387008667, "learning_rate": 3.2873488589443747e-05, "loss": 0.7602, "num_input_tokens_seen": 35720616, "step": 61530 }, { "epoch": 9.165177241584749, "grad_norm": 1.2068984508514404, "learning_rate": 3.287040445891286e-05, "loss": 0.6037, "num_input_tokens_seen": 35723368, "step": 61535 }, { "epoch": 9.165921954125707, "grad_norm": 1.1921898126602173, "learning_rate": 3.28673201954195e-05, "loss": 0.4576, "num_input_tokens_seen": 35726120, "step": 61540 }, { "epoch": 9.166666666666666, "grad_norm": 2.0759928226470947, "learning_rate": 3.286423579901575e-05, "loss": 0.578, "num_input_tokens_seen": 35728872, "step": 61545 }, { "epoch": 9.167411379207627, "grad_norm": 2.85653018951416, "learning_rate": 3.2861151269753745e-05, "loss": 0.6638, "num_input_tokens_seen": 35731656, "step": 61550 }, { "epoch": 9.168156091748585, "grad_norm": 1.2977603673934937, "learning_rate": 3.285806660768556e-05, "loss": 0.6055, "num_input_tokens_seen": 35734504, "step": 61555 }, { "epoch": 9.168900804289544, "grad_norm": 1.3120125532150269, "learning_rate": 3.285498181286334e-05, "loss": 0.6911, "num_input_tokens_seen": 35737320, "step": 61560 }, { "epoch": 9.169645516830503, "grad_norm": 1.4564297199249268, "learning_rate": 3.285189688533917e-05, "loss": 0.7089, "num_input_tokens_seen": 35740392, "step": 61565 }, { "epoch": 9.170390229371463, "grad_norm": 1.2337349653244019, "learning_rate": 3.284881182516519e-05, "loss": 0.7273, "num_input_tokens_seen": 35743272, "step": 61570 }, { "epoch": 9.171134941912422, "grad_norm": 1.0972086191177368, "learning_rate": 3.2845726632393525e-05, "loss": 0.5151, "num_input_tokens_seen": 35746216, "step": 61575 }, { "epoch": 9.171879654453381, "grad_norm": 1.1155221462249756, "learning_rate": 3.284264130707627e-05, "loss": 0.5434, "num_input_tokens_seen": 35749032, "step": 61580 }, { "epoch": 9.17262436699434, "grad_norm": 1.553985595703125, "learning_rate": 3.283955584926557e-05, "loss": 0.4662, "num_input_tokens_seen": 35752200, "step": 61585 }, { "epoch": 9.1733690795353, "grad_norm": 1.8094922304153442, "learning_rate": 3.283647025901353e-05, "loss": 0.5976, "num_input_tokens_seen": 35754984, "step": 61590 }, { "epoch": 9.174113792076259, "grad_norm": 1.2792887687683105, "learning_rate": 3.283338453637229e-05, "loss": 0.7369, "num_input_tokens_seen": 35757768, "step": 61595 }, { "epoch": 9.174858504617218, "grad_norm": 0.9070379137992859, "learning_rate": 3.2830298681393985e-05, "loss": 0.7454, "num_input_tokens_seen": 35760360, "step": 61600 }, { "epoch": 9.175603217158177, "grad_norm": 1.6619377136230469, "learning_rate": 3.2827212694130736e-05, "loss": 0.7299, "num_input_tokens_seen": 35762824, "step": 61605 }, { "epoch": 9.176347929699135, "grad_norm": 0.9702489972114563, "learning_rate": 3.282412657463469e-05, "loss": 0.5026, "num_input_tokens_seen": 35765736, "step": 61610 }, { "epoch": 9.177092642240096, "grad_norm": 1.6077224016189575, "learning_rate": 3.282104032295798e-05, "loss": 0.7357, "num_input_tokens_seen": 35768968, "step": 61615 }, { "epoch": 9.177837354781055, "grad_norm": 2.409707546234131, "learning_rate": 3.281795393915275e-05, "loss": 0.5762, "num_input_tokens_seen": 35771880, "step": 61620 }, { "epoch": 9.178582067322013, "grad_norm": 0.8309320211410522, "learning_rate": 3.281486742327112e-05, "loss": 0.6185, "num_input_tokens_seen": 35774760, "step": 61625 }, { "epoch": 9.179326779862972, "grad_norm": 2.0782406330108643, "learning_rate": 3.281178077536525e-05, "loss": 0.4713, "num_input_tokens_seen": 35777960, "step": 61630 }, { "epoch": 9.180071492403933, "grad_norm": 1.2556966543197632, "learning_rate": 3.280869399548728e-05, "loss": 0.4781, "num_input_tokens_seen": 35780936, "step": 61635 }, { "epoch": 9.180816204944891, "grad_norm": 1.1828088760375977, "learning_rate": 3.280560708368936e-05, "loss": 0.6366, "num_input_tokens_seen": 35783688, "step": 61640 }, { "epoch": 9.18156091748585, "grad_norm": 0.6831191182136536, "learning_rate": 3.2802520040023646e-05, "loss": 0.5825, "num_input_tokens_seen": 35786632, "step": 61645 }, { "epoch": 9.182305630026809, "grad_norm": 1.8214714527130127, "learning_rate": 3.279943286454229e-05, "loss": 0.7161, "num_input_tokens_seen": 35789640, "step": 61650 }, { "epoch": 9.18305034256777, "grad_norm": 1.006617784500122, "learning_rate": 3.2796345557297446e-05, "loss": 0.6998, "num_input_tokens_seen": 35792392, "step": 61655 }, { "epoch": 9.183795055108728, "grad_norm": 1.4045830965042114, "learning_rate": 3.2793258118341265e-05, "loss": 0.5634, "num_input_tokens_seen": 35795496, "step": 61660 }, { "epoch": 9.184539767649687, "grad_norm": 2.075792074203491, "learning_rate": 3.2790170547725894e-05, "loss": 0.6976, "num_input_tokens_seen": 35798472, "step": 61665 }, { "epoch": 9.185284480190646, "grad_norm": 1.366011619567871, "learning_rate": 3.2787082845503525e-05, "loss": 0.5046, "num_input_tokens_seen": 35801160, "step": 61670 }, { "epoch": 9.186029192731606, "grad_norm": 1.0151773691177368, "learning_rate": 3.27839950117263e-05, "loss": 0.6685, "num_input_tokens_seen": 35804552, "step": 61675 }, { "epoch": 9.186773905272565, "grad_norm": 1.024301290512085, "learning_rate": 3.27809070464464e-05, "loss": 0.719, "num_input_tokens_seen": 35807688, "step": 61680 }, { "epoch": 9.187518617813524, "grad_norm": 1.4011539220809937, "learning_rate": 3.2777818949715965e-05, "loss": 0.6828, "num_input_tokens_seen": 35810568, "step": 61685 }, { "epoch": 9.188263330354483, "grad_norm": 2.1133038997650146, "learning_rate": 3.27747307215872e-05, "loss": 0.6138, "num_input_tokens_seen": 35813640, "step": 61690 }, { "epoch": 9.189008042895443, "grad_norm": 1.2652381658554077, "learning_rate": 3.2771642362112255e-05, "loss": 0.7684, "num_input_tokens_seen": 35816424, "step": 61695 }, { "epoch": 9.189752755436402, "grad_norm": 1.1355453729629517, "learning_rate": 3.276855387134331e-05, "loss": 0.6429, "num_input_tokens_seen": 35819208, "step": 61700 }, { "epoch": 9.19049746797736, "grad_norm": 0.8883737325668335, "learning_rate": 3.2765465249332545e-05, "loss": 0.5419, "num_input_tokens_seen": 35822120, "step": 61705 }, { "epoch": 9.19124218051832, "grad_norm": 2.3499770164489746, "learning_rate": 3.276237649613214e-05, "loss": 0.6773, "num_input_tokens_seen": 35825320, "step": 61710 }, { "epoch": 9.19198689305928, "grad_norm": 0.8922409415245056, "learning_rate": 3.275928761179427e-05, "loss": 0.4351, "num_input_tokens_seen": 35828232, "step": 61715 }, { "epoch": 9.192731605600239, "grad_norm": 0.7869373559951782, "learning_rate": 3.2756198596371115e-05, "loss": 0.4519, "num_input_tokens_seen": 35830856, "step": 61720 }, { "epoch": 9.193476318141197, "grad_norm": 1.1812987327575684, "learning_rate": 3.275310944991487e-05, "loss": 0.5828, "num_input_tokens_seen": 35833672, "step": 61725 }, { "epoch": 9.194221030682156, "grad_norm": 1.1703842878341675, "learning_rate": 3.275002017247773e-05, "loss": 0.5565, "num_input_tokens_seen": 35836776, "step": 61730 }, { "epoch": 9.194965743223117, "grad_norm": 1.8073683977127075, "learning_rate": 3.2746930764111876e-05, "loss": 0.5854, "num_input_tokens_seen": 35839592, "step": 61735 }, { "epoch": 9.195710455764075, "grad_norm": 0.9280007481575012, "learning_rate": 3.2743841224869496e-05, "loss": 0.4881, "num_input_tokens_seen": 35842280, "step": 61740 }, { "epoch": 9.196455168305034, "grad_norm": 0.6344937682151794, "learning_rate": 3.274075155480278e-05, "loss": 0.5739, "num_input_tokens_seen": 35845288, "step": 61745 }, { "epoch": 9.197199880845993, "grad_norm": 1.5914785861968994, "learning_rate": 3.273766175396395e-05, "loss": 0.547, "num_input_tokens_seen": 35848008, "step": 61750 }, { "epoch": 9.197944593386953, "grad_norm": 0.7396677732467651, "learning_rate": 3.273457182240518e-05, "loss": 0.3981, "num_input_tokens_seen": 35850952, "step": 61755 }, { "epoch": 9.198689305927912, "grad_norm": 1.315093755722046, "learning_rate": 3.273148176017868e-05, "loss": 0.5615, "num_input_tokens_seen": 35853736, "step": 61760 }, { "epoch": 9.199434018468871, "grad_norm": 0.9656255841255188, "learning_rate": 3.2728391567336656e-05, "loss": 0.6778, "num_input_tokens_seen": 35856872, "step": 61765 }, { "epoch": 9.20017873100983, "grad_norm": 1.1518357992172241, "learning_rate": 3.272530124393131e-05, "loss": 0.6571, "num_input_tokens_seen": 35859880, "step": 61770 }, { "epoch": 9.200923443550789, "grad_norm": 1.4990577697753906, "learning_rate": 3.2722210790014854e-05, "loss": 0.5868, "num_input_tokens_seen": 35862792, "step": 61775 }, { "epoch": 9.201668156091749, "grad_norm": 3.271653413772583, "learning_rate": 3.271912020563949e-05, "loss": 0.7638, "num_input_tokens_seen": 35865992, "step": 61780 }, { "epoch": 9.202412868632708, "grad_norm": 1.03374183177948, "learning_rate": 3.2716029490857445e-05, "loss": 0.5558, "num_input_tokens_seen": 35869032, "step": 61785 }, { "epoch": 9.203157581173667, "grad_norm": 1.5222965478897095, "learning_rate": 3.271293864572092e-05, "loss": 0.6723, "num_input_tokens_seen": 35872104, "step": 61790 }, { "epoch": 9.203902293714625, "grad_norm": 2.1523079872131348, "learning_rate": 3.2709847670282126e-05, "loss": 0.5702, "num_input_tokens_seen": 35874856, "step": 61795 }, { "epoch": 9.204647006255586, "grad_norm": 2.661729097366333, "learning_rate": 3.27067565645933e-05, "loss": 0.5615, "num_input_tokens_seen": 35877896, "step": 61800 }, { "epoch": 9.205391718796545, "grad_norm": 1.6947944164276123, "learning_rate": 3.2703665328706654e-05, "loss": 0.4896, "num_input_tokens_seen": 35880936, "step": 61805 }, { "epoch": 9.206136431337503, "grad_norm": 0.92951500415802, "learning_rate": 3.270057396267441e-05, "loss": 0.534, "num_input_tokens_seen": 35883688, "step": 61810 }, { "epoch": 9.206881143878462, "grad_norm": 1.3852741718292236, "learning_rate": 3.26974824665488e-05, "loss": 0.6798, "num_input_tokens_seen": 35886472, "step": 61815 }, { "epoch": 9.207625856419423, "grad_norm": 1.9840582609176636, "learning_rate": 3.269439084038205e-05, "loss": 0.5629, "num_input_tokens_seen": 35889288, "step": 61820 }, { "epoch": 9.208370568960381, "grad_norm": 0.8036888837814331, "learning_rate": 3.2691299084226375e-05, "loss": 0.6216, "num_input_tokens_seen": 35892168, "step": 61825 }, { "epoch": 9.20911528150134, "grad_norm": 1.2996416091918945, "learning_rate": 3.2688207198134026e-05, "loss": 0.7423, "num_input_tokens_seen": 35895208, "step": 61830 }, { "epoch": 9.209859994042299, "grad_norm": 2.175092935562134, "learning_rate": 3.2685115182157225e-05, "loss": 0.5194, "num_input_tokens_seen": 35897864, "step": 61835 }, { "epoch": 9.21060470658326, "grad_norm": 1.1608647108078003, "learning_rate": 3.2682023036348216e-05, "loss": 0.4724, "num_input_tokens_seen": 35900712, "step": 61840 }, { "epoch": 9.211349419124218, "grad_norm": 1.1740285158157349, "learning_rate": 3.267893076075924e-05, "loss": 0.3673, "num_input_tokens_seen": 35903272, "step": 61845 }, { "epoch": 9.212094131665177, "grad_norm": 0.9499437808990479, "learning_rate": 3.267583835544253e-05, "loss": 0.4425, "num_input_tokens_seen": 35906056, "step": 61850 }, { "epoch": 9.212838844206136, "grad_norm": 1.7452740669250488, "learning_rate": 3.2672745820450336e-05, "loss": 0.6197, "num_input_tokens_seen": 35908968, "step": 61855 }, { "epoch": 9.213583556747096, "grad_norm": 1.2501577138900757, "learning_rate": 3.2669653155834894e-05, "loss": 0.568, "num_input_tokens_seen": 35911784, "step": 61860 }, { "epoch": 9.214328269288055, "grad_norm": 1.578710675239563, "learning_rate": 3.2666560361648456e-05, "loss": 0.7649, "num_input_tokens_seen": 35914568, "step": 61865 }, { "epoch": 9.215072981829014, "grad_norm": 1.9195616245269775, "learning_rate": 3.266346743794328e-05, "loss": 0.6142, "num_input_tokens_seen": 35917512, "step": 61870 }, { "epoch": 9.215817694369973, "grad_norm": 1.4659359455108643, "learning_rate": 3.26603743847716e-05, "loss": 0.5229, "num_input_tokens_seen": 35920200, "step": 61875 }, { "epoch": 9.216562406910933, "grad_norm": 1.3650683164596558, "learning_rate": 3.26572812021857e-05, "loss": 0.4838, "num_input_tokens_seen": 35923368, "step": 61880 }, { "epoch": 9.217307119451892, "grad_norm": 1.0533356666564941, "learning_rate": 3.2654187890237795e-05, "loss": 0.6752, "num_input_tokens_seen": 35926120, "step": 61885 }, { "epoch": 9.21805183199285, "grad_norm": 0.7285788059234619, "learning_rate": 3.2651094448980175e-05, "loss": 0.5791, "num_input_tokens_seen": 35929160, "step": 61890 }, { "epoch": 9.21879654453381, "grad_norm": 1.4374573230743408, "learning_rate": 3.264800087846509e-05, "loss": 0.5588, "num_input_tokens_seen": 35931848, "step": 61895 }, { "epoch": 9.21954125707477, "grad_norm": 1.6238569021224976, "learning_rate": 3.2644907178744805e-05, "loss": 0.6488, "num_input_tokens_seen": 35934792, "step": 61900 }, { "epoch": 9.220285969615729, "grad_norm": 2.531635284423828, "learning_rate": 3.264181334987157e-05, "loss": 0.5988, "num_input_tokens_seen": 35937896, "step": 61905 }, { "epoch": 9.221030682156687, "grad_norm": 2.4236156940460205, "learning_rate": 3.2638719391897684e-05, "loss": 0.6656, "num_input_tokens_seen": 35940776, "step": 61910 }, { "epoch": 9.221775394697646, "grad_norm": 0.7995433211326599, "learning_rate": 3.2635625304875386e-05, "loss": 0.5848, "num_input_tokens_seen": 35943592, "step": 61915 }, { "epoch": 9.222520107238607, "grad_norm": 4.725177764892578, "learning_rate": 3.263253108885696e-05, "loss": 0.8327, "num_input_tokens_seen": 35946280, "step": 61920 }, { "epoch": 9.223264819779565, "grad_norm": 0.9749099612236023, "learning_rate": 3.262943674389469e-05, "loss": 0.6704, "num_input_tokens_seen": 35949096, "step": 61925 }, { "epoch": 9.224009532320524, "grad_norm": 0.9187060594558716, "learning_rate": 3.2626342270040823e-05, "loss": 0.5614, "num_input_tokens_seen": 35952040, "step": 61930 }, { "epoch": 9.224754244861483, "grad_norm": 1.7918212413787842, "learning_rate": 3.262324766734766e-05, "loss": 0.4304, "num_input_tokens_seen": 35954760, "step": 61935 }, { "epoch": 9.225498957402444, "grad_norm": 1.906719446182251, "learning_rate": 3.2620152935867484e-05, "loss": 0.6223, "num_input_tokens_seen": 35957864, "step": 61940 }, { "epoch": 9.226243669943402, "grad_norm": 1.3807899951934814, "learning_rate": 3.261705807565256e-05, "loss": 0.6636, "num_input_tokens_seen": 35960712, "step": 61945 }, { "epoch": 9.226988382484361, "grad_norm": 2.600464344024658, "learning_rate": 3.26139630867552e-05, "loss": 0.5956, "num_input_tokens_seen": 35963656, "step": 61950 }, { "epoch": 9.22773309502532, "grad_norm": 1.1395316123962402, "learning_rate": 3.261086796922765e-05, "loss": 0.5867, "num_input_tokens_seen": 35966472, "step": 61955 }, { "epoch": 9.228477807566279, "grad_norm": 1.1094846725463867, "learning_rate": 3.260777272312222e-05, "loss": 0.5302, "num_input_tokens_seen": 35969384, "step": 61960 }, { "epoch": 9.229222520107239, "grad_norm": 1.3253217935562134, "learning_rate": 3.2604677348491215e-05, "loss": 0.5166, "num_input_tokens_seen": 35972520, "step": 61965 }, { "epoch": 9.229967232648198, "grad_norm": 1.0751432180404663, "learning_rate": 3.260158184538691e-05, "loss": 0.5683, "num_input_tokens_seen": 35975880, "step": 61970 }, { "epoch": 9.230711945189157, "grad_norm": 1.5995092391967773, "learning_rate": 3.25984862138616e-05, "loss": 0.5606, "num_input_tokens_seen": 35978632, "step": 61975 }, { "epoch": 9.231456657730115, "grad_norm": 1.4597009420394897, "learning_rate": 3.25953904539676e-05, "loss": 0.7806, "num_input_tokens_seen": 35981416, "step": 61980 }, { "epoch": 9.232201370271076, "grad_norm": 2.443256378173828, "learning_rate": 3.259229456575719e-05, "loss": 0.5928, "num_input_tokens_seen": 35984328, "step": 61985 }, { "epoch": 9.232946082812035, "grad_norm": 2.104112386703491, "learning_rate": 3.258919854928268e-05, "loss": 0.698, "num_input_tokens_seen": 35987016, "step": 61990 }, { "epoch": 9.233690795352993, "grad_norm": 1.6373610496520996, "learning_rate": 3.2586102404596375e-05, "loss": 0.7298, "num_input_tokens_seen": 35989800, "step": 61995 }, { "epoch": 9.234435507893952, "grad_norm": 2.1861236095428467, "learning_rate": 3.258300613175058e-05, "loss": 0.6073, "num_input_tokens_seen": 35992712, "step": 62000 }, { "epoch": 9.235180220434913, "grad_norm": 1.281726598739624, "learning_rate": 3.2579909730797605e-05, "loss": 0.5634, "num_input_tokens_seen": 35995816, "step": 62005 }, { "epoch": 9.235924932975871, "grad_norm": 1.3379720449447632, "learning_rate": 3.2576813201789755e-05, "loss": 0.8056, "num_input_tokens_seen": 35998600, "step": 62010 }, { "epoch": 9.23666964551683, "grad_norm": 1.9071385860443115, "learning_rate": 3.257371654477935e-05, "loss": 0.7709, "num_input_tokens_seen": 36001352, "step": 62015 }, { "epoch": 9.237414358057789, "grad_norm": 1.4910821914672852, "learning_rate": 3.257061975981871e-05, "loss": 0.6203, "num_input_tokens_seen": 36004456, "step": 62020 }, { "epoch": 9.23815907059875, "grad_norm": 1.5026702880859375, "learning_rate": 3.256752284696013e-05, "loss": 0.7032, "num_input_tokens_seen": 36007240, "step": 62025 }, { "epoch": 9.238903783139708, "grad_norm": 1.0157893896102905, "learning_rate": 3.256442580625595e-05, "loss": 0.5651, "num_input_tokens_seen": 36009992, "step": 62030 }, { "epoch": 9.239648495680667, "grad_norm": 1.3531887531280518, "learning_rate": 3.2561328637758475e-05, "loss": 0.7289, "num_input_tokens_seen": 36013000, "step": 62035 }, { "epoch": 9.240393208221626, "grad_norm": 1.0074617862701416, "learning_rate": 3.2558231341520046e-05, "loss": 0.6234, "num_input_tokens_seen": 36016200, "step": 62040 }, { "epoch": 9.241137920762586, "grad_norm": 1.33219313621521, "learning_rate": 3.255513391759299e-05, "loss": 0.7744, "num_input_tokens_seen": 36018920, "step": 62045 }, { "epoch": 9.241882633303545, "grad_norm": 1.4197262525558472, "learning_rate": 3.25520363660296e-05, "loss": 0.6468, "num_input_tokens_seen": 36021576, "step": 62050 }, { "epoch": 9.242627345844504, "grad_norm": 1.2346985340118408, "learning_rate": 3.2548938686882246e-05, "loss": 0.5311, "num_input_tokens_seen": 36024776, "step": 62055 }, { "epoch": 9.243372058385463, "grad_norm": 0.9171754717826843, "learning_rate": 3.254584088020325e-05, "loss": 0.6023, "num_input_tokens_seen": 36027432, "step": 62060 }, { "epoch": 9.244116770926423, "grad_norm": 1.5497814416885376, "learning_rate": 3.254274294604494e-05, "loss": 0.5791, "num_input_tokens_seen": 36030344, "step": 62065 }, { "epoch": 9.244861483467382, "grad_norm": 1.428118109703064, "learning_rate": 3.253964488445964e-05, "loss": 0.6397, "num_input_tokens_seen": 36033672, "step": 62070 }, { "epoch": 9.24560619600834, "grad_norm": 1.4705798625946045, "learning_rate": 3.253654669549972e-05, "loss": 0.4614, "num_input_tokens_seen": 36036616, "step": 62075 }, { "epoch": 9.2463509085493, "grad_norm": 0.962621808052063, "learning_rate": 3.253344837921749e-05, "loss": 0.5645, "num_input_tokens_seen": 36039368, "step": 62080 }, { "epoch": 9.24709562109026, "grad_norm": 1.0915366411209106, "learning_rate": 3.253034993566532e-05, "loss": 0.6282, "num_input_tokens_seen": 36042152, "step": 62085 }, { "epoch": 9.247840333631219, "grad_norm": 1.2227978706359863, "learning_rate": 3.252725136489553e-05, "loss": 0.6475, "num_input_tokens_seen": 36045096, "step": 62090 }, { "epoch": 9.248585046172177, "grad_norm": 1.3584822416305542, "learning_rate": 3.2524152666960476e-05, "loss": 0.5473, "num_input_tokens_seen": 36047976, "step": 62095 }, { "epoch": 9.249329758713136, "grad_norm": 0.7340521812438965, "learning_rate": 3.252105384191252e-05, "loss": 0.5431, "num_input_tokens_seen": 36050888, "step": 62100 }, { "epoch": 9.250074471254097, "grad_norm": 1.7466022968292236, "learning_rate": 3.2517954889803995e-05, "loss": 0.7055, "num_input_tokens_seen": 36053768, "step": 62105 }, { "epoch": 9.250819183795056, "grad_norm": 1.5199787616729736, "learning_rate": 3.2514855810687265e-05, "loss": 0.5163, "num_input_tokens_seen": 36056680, "step": 62110 }, { "epoch": 9.251563896336014, "grad_norm": 0.8882969617843628, "learning_rate": 3.2511756604614695e-05, "loss": 0.7089, "num_input_tokens_seen": 36059560, "step": 62115 }, { "epoch": 9.252308608876973, "grad_norm": 1.654776692390442, "learning_rate": 3.250865727163862e-05, "loss": 0.758, "num_input_tokens_seen": 36062312, "step": 62120 }, { "epoch": 9.253053321417934, "grad_norm": 1.5913174152374268, "learning_rate": 3.250555781181142e-05, "loss": 0.6964, "num_input_tokens_seen": 36065128, "step": 62125 }, { "epoch": 9.253798033958892, "grad_norm": 1.8844283819198608, "learning_rate": 3.250245822518544e-05, "loss": 0.7033, "num_input_tokens_seen": 36067912, "step": 62130 }, { "epoch": 9.254542746499851, "grad_norm": 1.1155847311019897, "learning_rate": 3.249935851181305e-05, "loss": 0.6332, "num_input_tokens_seen": 36070632, "step": 62135 }, { "epoch": 9.25528745904081, "grad_norm": 2.6249313354492188, "learning_rate": 3.2496258671746636e-05, "loss": 0.6574, "num_input_tokens_seen": 36073512, "step": 62140 }, { "epoch": 9.256032171581769, "grad_norm": 1.958999514579773, "learning_rate": 3.249315870503854e-05, "loss": 0.7217, "num_input_tokens_seen": 36076328, "step": 62145 }, { "epoch": 9.25677688412273, "grad_norm": 1.5030834674835205, "learning_rate": 3.249005861174115e-05, "loss": 0.7512, "num_input_tokens_seen": 36079432, "step": 62150 }, { "epoch": 9.257521596663688, "grad_norm": 1.772554636001587, "learning_rate": 3.2486958391906825e-05, "loss": 0.6513, "num_input_tokens_seen": 36082312, "step": 62155 }, { "epoch": 9.258266309204647, "grad_norm": 0.8395677208900452, "learning_rate": 3.2483858045587944e-05, "loss": 0.5354, "num_input_tokens_seen": 36084904, "step": 62160 }, { "epoch": 9.259011021745605, "grad_norm": 1.1840531826019287, "learning_rate": 3.2480757572836895e-05, "loss": 0.6119, "num_input_tokens_seen": 36087784, "step": 62165 }, { "epoch": 9.259755734286566, "grad_norm": 0.9293261170387268, "learning_rate": 3.247765697370604e-05, "loss": 0.6314, "num_input_tokens_seen": 36090600, "step": 62170 }, { "epoch": 9.260500446827525, "grad_norm": 1.460442066192627, "learning_rate": 3.247455624824779e-05, "loss": 0.6354, "num_input_tokens_seen": 36093448, "step": 62175 }, { "epoch": 9.261245159368483, "grad_norm": 1.5049587488174438, "learning_rate": 3.247145539651449e-05, "loss": 0.4475, "num_input_tokens_seen": 36096392, "step": 62180 }, { "epoch": 9.261989871909442, "grad_norm": 2.1095499992370605, "learning_rate": 3.246835441855856e-05, "loss": 0.6069, "num_input_tokens_seen": 36099368, "step": 62185 }, { "epoch": 9.262734584450403, "grad_norm": 1.9423038959503174, "learning_rate": 3.2465253314432366e-05, "loss": 0.7314, "num_input_tokens_seen": 36102376, "step": 62190 }, { "epoch": 9.263479296991362, "grad_norm": 1.6534605026245117, "learning_rate": 3.24621520841883e-05, "loss": 0.613, "num_input_tokens_seen": 36105128, "step": 62195 }, { "epoch": 9.26422400953232, "grad_norm": 1.242985486984253, "learning_rate": 3.245905072787876e-05, "loss": 0.6099, "num_input_tokens_seen": 36108296, "step": 62200 }, { "epoch": 9.264968722073279, "grad_norm": 1.591416835784912, "learning_rate": 3.245594924555614e-05, "loss": 0.6776, "num_input_tokens_seen": 36111656, "step": 62205 }, { "epoch": 9.26571343461424, "grad_norm": 0.9108465909957886, "learning_rate": 3.2452847637272845e-05, "loss": 0.5038, "num_input_tokens_seen": 36114536, "step": 62210 }, { "epoch": 9.266458147155198, "grad_norm": 1.8261535167694092, "learning_rate": 3.244974590308125e-05, "loss": 0.6165, "num_input_tokens_seen": 36117672, "step": 62215 }, { "epoch": 9.267202859696157, "grad_norm": 1.4834412336349487, "learning_rate": 3.244664404303378e-05, "loss": 0.5937, "num_input_tokens_seen": 36120360, "step": 62220 }, { "epoch": 9.267947572237116, "grad_norm": 2.1832311153411865, "learning_rate": 3.2443542057182825e-05, "loss": 0.6307, "num_input_tokens_seen": 36123208, "step": 62225 }, { "epoch": 9.268692284778076, "grad_norm": 1.1200779676437378, "learning_rate": 3.244043994558079e-05, "loss": 0.4855, "num_input_tokens_seen": 36125992, "step": 62230 }, { "epoch": 9.269436997319035, "grad_norm": 1.0481903553009033, "learning_rate": 3.243733770828008e-05, "loss": 0.5609, "num_input_tokens_seen": 36128936, "step": 62235 }, { "epoch": 9.270181709859994, "grad_norm": 1.4257279634475708, "learning_rate": 3.243423534533311e-05, "loss": 0.628, "num_input_tokens_seen": 36132072, "step": 62240 }, { "epoch": 9.270926422400953, "grad_norm": 1.4154915809631348, "learning_rate": 3.2431132856792294e-05, "loss": 0.6256, "num_input_tokens_seen": 36135304, "step": 62245 }, { "epoch": 9.271671134941913, "grad_norm": 1.6065952777862549, "learning_rate": 3.242803024271004e-05, "loss": 0.6115, "num_input_tokens_seen": 36138312, "step": 62250 }, { "epoch": 9.272415847482872, "grad_norm": 1.8459619283676147, "learning_rate": 3.2424927503138766e-05, "loss": 0.6376, "num_input_tokens_seen": 36141192, "step": 62255 }, { "epoch": 9.27316056002383, "grad_norm": 1.2492237091064453, "learning_rate": 3.242182463813088e-05, "loss": 0.6621, "num_input_tokens_seen": 36144232, "step": 62260 }, { "epoch": 9.27390527256479, "grad_norm": 1.478186845779419, "learning_rate": 3.241872164773882e-05, "loss": 0.5984, "num_input_tokens_seen": 36147016, "step": 62265 }, { "epoch": 9.27464998510575, "grad_norm": 1.0679799318313599, "learning_rate": 3.241561853201499e-05, "loss": 0.542, "num_input_tokens_seen": 36149832, "step": 62270 }, { "epoch": 9.275394697646709, "grad_norm": 1.1198151111602783, "learning_rate": 3.2412515291011826e-05, "loss": 0.5703, "num_input_tokens_seen": 36153000, "step": 62275 }, { "epoch": 9.276139410187668, "grad_norm": 2.1387453079223633, "learning_rate": 3.2409411924781754e-05, "loss": 0.7129, "num_input_tokens_seen": 36156040, "step": 62280 }, { "epoch": 9.276884122728626, "grad_norm": 1.5802994966506958, "learning_rate": 3.2406308433377194e-05, "loss": 0.6233, "num_input_tokens_seen": 36158568, "step": 62285 }, { "epoch": 9.277628835269585, "grad_norm": 1.0959599018096924, "learning_rate": 3.2403204816850574e-05, "loss": 0.4771, "num_input_tokens_seen": 36161512, "step": 62290 }, { "epoch": 9.278373547810546, "grad_norm": 1.8741798400878906, "learning_rate": 3.240010107525434e-05, "loss": 0.5622, "num_input_tokens_seen": 36164328, "step": 62295 }, { "epoch": 9.279118260351504, "grad_norm": 1.2324801683425903, "learning_rate": 3.2396997208640925e-05, "loss": 0.741, "num_input_tokens_seen": 36167272, "step": 62300 }, { "epoch": 9.279862972892463, "grad_norm": 3.558565139770508, "learning_rate": 3.2393893217062746e-05, "loss": 0.691, "num_input_tokens_seen": 36170120, "step": 62305 }, { "epoch": 9.280607685433422, "grad_norm": 2.031888484954834, "learning_rate": 3.239078910057226e-05, "loss": 0.729, "num_input_tokens_seen": 36173160, "step": 62310 }, { "epoch": 9.281352397974382, "grad_norm": 1.5118343830108643, "learning_rate": 3.238768485922191e-05, "loss": 0.6463, "num_input_tokens_seen": 36175816, "step": 62315 }, { "epoch": 9.282097110515341, "grad_norm": 1.010040283203125, "learning_rate": 3.238458049306413e-05, "loss": 0.6178, "num_input_tokens_seen": 36179304, "step": 62320 }, { "epoch": 9.2828418230563, "grad_norm": 1.0726444721221924, "learning_rate": 3.2381476002151365e-05, "loss": 0.6604, "num_input_tokens_seen": 36182024, "step": 62325 }, { "epoch": 9.283586535597259, "grad_norm": 0.9302594065666199, "learning_rate": 3.2378371386536074e-05, "loss": 0.5986, "num_input_tokens_seen": 36184904, "step": 62330 }, { "epoch": 9.28433124813822, "grad_norm": 1.3739476203918457, "learning_rate": 3.2375266646270684e-05, "loss": 0.6462, "num_input_tokens_seen": 36188040, "step": 62335 }, { "epoch": 9.285075960679178, "grad_norm": 1.3543058633804321, "learning_rate": 3.2372161781407675e-05, "loss": 0.4563, "num_input_tokens_seen": 36191048, "step": 62340 }, { "epoch": 9.285820673220137, "grad_norm": 0.9296495318412781, "learning_rate": 3.2369056791999476e-05, "loss": 0.4333, "num_input_tokens_seen": 36193768, "step": 62345 }, { "epoch": 9.286565385761095, "grad_norm": 1.9708577394485474, "learning_rate": 3.236595167809856e-05, "loss": 0.5067, "num_input_tokens_seen": 36196584, "step": 62350 }, { "epoch": 9.287310098302056, "grad_norm": 1.2548712491989136, "learning_rate": 3.236284643975737e-05, "loss": 0.6643, "num_input_tokens_seen": 36199432, "step": 62355 }, { "epoch": 9.288054810843015, "grad_norm": 1.5912781953811646, "learning_rate": 3.235974107702837e-05, "loss": 0.4857, "num_input_tokens_seen": 36202280, "step": 62360 }, { "epoch": 9.288799523383974, "grad_norm": 1.5511143207550049, "learning_rate": 3.235663558996402e-05, "loss": 0.6816, "num_input_tokens_seen": 36205064, "step": 62365 }, { "epoch": 9.289544235924932, "grad_norm": 1.964035987854004, "learning_rate": 3.2353529978616806e-05, "loss": 0.4296, "num_input_tokens_seen": 36207880, "step": 62370 }, { "epoch": 9.290288948465893, "grad_norm": 1.2063459157943726, "learning_rate": 3.235042424303917e-05, "loss": 0.6861, "num_input_tokens_seen": 36210664, "step": 62375 }, { "epoch": 9.291033661006852, "grad_norm": 0.8735743761062622, "learning_rate": 3.2347318383283585e-05, "loss": 0.5459, "num_input_tokens_seen": 36213192, "step": 62380 }, { "epoch": 9.29177837354781, "grad_norm": 1.959228754043579, "learning_rate": 3.234421239940252e-05, "loss": 0.9237, "num_input_tokens_seen": 36215976, "step": 62385 }, { "epoch": 9.292523086088769, "grad_norm": 2.405998706817627, "learning_rate": 3.2341106291448456e-05, "loss": 0.6298, "num_input_tokens_seen": 36218888, "step": 62390 }, { "epoch": 9.29326779862973, "grad_norm": 1.0803775787353516, "learning_rate": 3.233800005947386e-05, "loss": 0.6871, "num_input_tokens_seen": 36221640, "step": 62395 }, { "epoch": 9.294012511170688, "grad_norm": 1.1449061632156372, "learning_rate": 3.23348937035312e-05, "loss": 0.6017, "num_input_tokens_seen": 36224360, "step": 62400 }, { "epoch": 9.294757223711647, "grad_norm": 0.906154990196228, "learning_rate": 3.233178722367298e-05, "loss": 0.5014, "num_input_tokens_seen": 36227432, "step": 62405 }, { "epoch": 9.295501936252606, "grad_norm": 2.3989670276641846, "learning_rate": 3.232868061995167e-05, "loss": 0.624, "num_input_tokens_seen": 36230344, "step": 62410 }, { "epoch": 9.296246648793566, "grad_norm": 2.2950849533081055, "learning_rate": 3.2325573892419745e-05, "loss": 0.545, "num_input_tokens_seen": 36233160, "step": 62415 }, { "epoch": 9.296991361334525, "grad_norm": 1.0107043981552124, "learning_rate": 3.232246704112969e-05, "loss": 0.5977, "num_input_tokens_seen": 36236008, "step": 62420 }, { "epoch": 9.297736073875484, "grad_norm": 1.199745535850525, "learning_rate": 3.2319360066134e-05, "loss": 0.6039, "num_input_tokens_seen": 36239208, "step": 62425 }, { "epoch": 9.298480786416443, "grad_norm": 2.9848456382751465, "learning_rate": 3.2316252967485155e-05, "loss": 0.8291, "num_input_tokens_seen": 36242216, "step": 62430 }, { "epoch": 9.299225498957403, "grad_norm": 1.0364067554473877, "learning_rate": 3.231314574523566e-05, "loss": 0.583, "num_input_tokens_seen": 36245320, "step": 62435 }, { "epoch": 9.299970211498362, "grad_norm": 1.6379696130752563, "learning_rate": 3.2310038399437995e-05, "loss": 0.6478, "num_input_tokens_seen": 36248104, "step": 62440 }, { "epoch": 9.30071492403932, "grad_norm": 1.114795207977295, "learning_rate": 3.230693093014466e-05, "loss": 0.585, "num_input_tokens_seen": 36250696, "step": 62445 }, { "epoch": 9.30145963658028, "grad_norm": 1.450886845588684, "learning_rate": 3.230382333740816e-05, "loss": 0.567, "num_input_tokens_seen": 36253448, "step": 62450 }, { "epoch": 9.30220434912124, "grad_norm": 1.073699951171875, "learning_rate": 3.230071562128098e-05, "loss": 0.6379, "num_input_tokens_seen": 36256072, "step": 62455 }, { "epoch": 9.302949061662199, "grad_norm": 1.4322906732559204, "learning_rate": 3.2297607781815645e-05, "loss": 0.644, "num_input_tokens_seen": 36258952, "step": 62460 }, { "epoch": 9.303693774203158, "grad_norm": 1.2358728647232056, "learning_rate": 3.229449981906463e-05, "loss": 0.5399, "num_input_tokens_seen": 36261832, "step": 62465 }, { "epoch": 9.304438486744116, "grad_norm": 1.6046863794326782, "learning_rate": 3.229139173308045e-05, "loss": 0.7948, "num_input_tokens_seen": 36264680, "step": 62470 }, { "epoch": 9.305183199285075, "grad_norm": 1.8023481369018555, "learning_rate": 3.228828352391562e-05, "loss": 0.672, "num_input_tokens_seen": 36267656, "step": 62475 }, { "epoch": 9.305927911826036, "grad_norm": 1.653151035308838, "learning_rate": 3.2285175191622656e-05, "loss": 0.8743, "num_input_tokens_seen": 36270408, "step": 62480 }, { "epoch": 9.306672624366994, "grad_norm": 1.1021562814712524, "learning_rate": 3.2282066736254056e-05, "loss": 0.494, "num_input_tokens_seen": 36273224, "step": 62485 }, { "epoch": 9.307417336907953, "grad_norm": 1.051720380783081, "learning_rate": 3.2278958157862336e-05, "loss": 0.5589, "num_input_tokens_seen": 36276232, "step": 62490 }, { "epoch": 9.308162049448912, "grad_norm": 1.189164638519287, "learning_rate": 3.2275849456500026e-05, "loss": 0.5569, "num_input_tokens_seen": 36279272, "step": 62495 }, { "epoch": 9.308906761989872, "grad_norm": 0.9130998253822327, "learning_rate": 3.2272740632219635e-05, "loss": 0.5716, "num_input_tokens_seen": 36281928, "step": 62500 }, { "epoch": 9.309651474530831, "grad_norm": 2.0337131023406982, "learning_rate": 3.226963168507367e-05, "loss": 0.7185, "num_input_tokens_seen": 36284744, "step": 62505 }, { "epoch": 9.31039618707179, "grad_norm": 0.9081185460090637, "learning_rate": 3.226652261511467e-05, "loss": 0.573, "num_input_tokens_seen": 36287752, "step": 62510 }, { "epoch": 9.311140899612749, "grad_norm": 1.3016107082366943, "learning_rate": 3.226341342239516e-05, "loss": 0.5139, "num_input_tokens_seen": 36290504, "step": 62515 }, { "epoch": 9.31188561215371, "grad_norm": 0.842497706413269, "learning_rate": 3.226030410696766e-05, "loss": 0.6891, "num_input_tokens_seen": 36293384, "step": 62520 }, { "epoch": 9.312630324694668, "grad_norm": 1.3948792219161987, "learning_rate": 3.2257194668884704e-05, "loss": 0.6347, "num_input_tokens_seen": 36296328, "step": 62525 }, { "epoch": 9.313375037235627, "grad_norm": 2.1691176891326904, "learning_rate": 3.2254085108198815e-05, "loss": 0.6719, "num_input_tokens_seen": 36299240, "step": 62530 }, { "epoch": 9.314119749776586, "grad_norm": 1.740295171737671, "learning_rate": 3.225097542496254e-05, "loss": 0.7438, "num_input_tokens_seen": 36302024, "step": 62535 }, { "epoch": 9.314864462317546, "grad_norm": 1.1636836528778076, "learning_rate": 3.2247865619228394e-05, "loss": 0.8061, "num_input_tokens_seen": 36305288, "step": 62540 }, { "epoch": 9.315609174858505, "grad_norm": 0.9784386157989502, "learning_rate": 3.2244755691048933e-05, "loss": 0.7559, "num_input_tokens_seen": 36308008, "step": 62545 }, { "epoch": 9.316353887399464, "grad_norm": 0.6652922630310059, "learning_rate": 3.224164564047669e-05, "loss": 0.5249, "num_input_tokens_seen": 36310984, "step": 62550 }, { "epoch": 9.317098599940422, "grad_norm": 1.957346796989441, "learning_rate": 3.223853546756419e-05, "loss": 0.5856, "num_input_tokens_seen": 36313960, "step": 62555 }, { "epoch": 9.317843312481383, "grad_norm": 1.63511061668396, "learning_rate": 3.2235425172363996e-05, "loss": 0.6324, "num_input_tokens_seen": 36316744, "step": 62560 }, { "epoch": 9.318588025022342, "grad_norm": 1.4520996809005737, "learning_rate": 3.223231475492865e-05, "loss": 0.6708, "num_input_tokens_seen": 36319496, "step": 62565 }, { "epoch": 9.3193327375633, "grad_norm": 1.088654637336731, "learning_rate": 3.222920421531069e-05, "loss": 0.6404, "num_input_tokens_seen": 36322248, "step": 62570 }, { "epoch": 9.32007745010426, "grad_norm": 0.8311365842819214, "learning_rate": 3.222609355356269e-05, "loss": 0.5069, "num_input_tokens_seen": 36325224, "step": 62575 }, { "epoch": 9.32082216264522, "grad_norm": 1.1628140211105347, "learning_rate": 3.222298276973717e-05, "loss": 0.6263, "num_input_tokens_seen": 36327976, "step": 62580 }, { "epoch": 9.321566875186178, "grad_norm": 1.2445474863052368, "learning_rate": 3.22198718638867e-05, "loss": 0.6102, "num_input_tokens_seen": 36330792, "step": 62585 }, { "epoch": 9.322311587727137, "grad_norm": 1.2076021432876587, "learning_rate": 3.2216760836063834e-05, "loss": 0.8403, "num_input_tokens_seen": 36333576, "step": 62590 }, { "epoch": 9.323056300268096, "grad_norm": 1.1079999208450317, "learning_rate": 3.2213649686321124e-05, "loss": 0.6795, "num_input_tokens_seen": 36336424, "step": 62595 }, { "epoch": 9.323801012809056, "grad_norm": 1.0689021348953247, "learning_rate": 3.2210538414711136e-05, "loss": 0.6287, "num_input_tokens_seen": 36339304, "step": 62600 }, { "epoch": 9.324545725350015, "grad_norm": 0.7964173555374146, "learning_rate": 3.220742702128643e-05, "loss": 0.745, "num_input_tokens_seen": 36341992, "step": 62605 }, { "epoch": 9.325290437890974, "grad_norm": 1.3058048486709595, "learning_rate": 3.220431550609958e-05, "loss": 0.4554, "num_input_tokens_seen": 36344968, "step": 62610 }, { "epoch": 9.326035150431933, "grad_norm": 0.6882466077804565, "learning_rate": 3.220120386920313e-05, "loss": 0.4865, "num_input_tokens_seen": 36348136, "step": 62615 }, { "epoch": 9.326779862972893, "grad_norm": 1.2316415309906006, "learning_rate": 3.219809211064966e-05, "loss": 0.9153, "num_input_tokens_seen": 36350824, "step": 62620 }, { "epoch": 9.327524575513852, "grad_norm": 1.1072946786880493, "learning_rate": 3.2194980230491744e-05, "loss": 0.4159, "num_input_tokens_seen": 36353800, "step": 62625 }, { "epoch": 9.32826928805481, "grad_norm": 0.8867097496986389, "learning_rate": 3.2191868228781944e-05, "loss": 0.7057, "num_input_tokens_seen": 36356584, "step": 62630 }, { "epoch": 9.32901400059577, "grad_norm": 1.4568408727645874, "learning_rate": 3.2188756105572844e-05, "loss": 0.8486, "num_input_tokens_seen": 36359368, "step": 62635 }, { "epoch": 9.32975871313673, "grad_norm": 1.1346005201339722, "learning_rate": 3.218564386091701e-05, "loss": 0.6402, "num_input_tokens_seen": 36362248, "step": 62640 }, { "epoch": 9.330503425677689, "grad_norm": 1.1451369524002075, "learning_rate": 3.218253149486704e-05, "loss": 0.7618, "num_input_tokens_seen": 36364936, "step": 62645 }, { "epoch": 9.331248138218648, "grad_norm": 1.295224666595459, "learning_rate": 3.2179419007475483e-05, "loss": 0.6959, "num_input_tokens_seen": 36368328, "step": 62650 }, { "epoch": 9.331992850759606, "grad_norm": 1.2579066753387451, "learning_rate": 3.217630639879495e-05, "loss": 0.5701, "num_input_tokens_seen": 36371144, "step": 62655 }, { "epoch": 9.332737563300565, "grad_norm": 1.0658785104751587, "learning_rate": 3.217319366887801e-05, "loss": 0.5786, "num_input_tokens_seen": 36373896, "step": 62660 }, { "epoch": 9.333482275841526, "grad_norm": 1.1762179136276245, "learning_rate": 3.217008081777726e-05, "loss": 0.756, "num_input_tokens_seen": 36376840, "step": 62665 }, { "epoch": 9.334226988382484, "grad_norm": 1.6145440340042114, "learning_rate": 3.2166967845545275e-05, "loss": 0.5654, "num_input_tokens_seen": 36379656, "step": 62670 }, { "epoch": 9.334971700923443, "grad_norm": 1.4339385032653809, "learning_rate": 3.216385475223465e-05, "loss": 0.6898, "num_input_tokens_seen": 36382472, "step": 62675 }, { "epoch": 9.335716413464402, "grad_norm": 0.9752415418624878, "learning_rate": 3.216074153789799e-05, "loss": 0.6278, "num_input_tokens_seen": 36385448, "step": 62680 }, { "epoch": 9.336461126005362, "grad_norm": 2.3095169067382812, "learning_rate": 3.2157628202587874e-05, "loss": 0.6429, "num_input_tokens_seen": 36388520, "step": 62685 }, { "epoch": 9.337205838546321, "grad_norm": 1.7786753177642822, "learning_rate": 3.21545147463569e-05, "loss": 0.5284, "num_input_tokens_seen": 36391272, "step": 62690 }, { "epoch": 9.33795055108728, "grad_norm": 1.059255838394165, "learning_rate": 3.2151401169257676e-05, "loss": 0.6047, "num_input_tokens_seen": 36394152, "step": 62695 }, { "epoch": 9.338695263628239, "grad_norm": 1.0264815092086792, "learning_rate": 3.2148287471342796e-05, "loss": 0.6669, "num_input_tokens_seen": 36397192, "step": 62700 }, { "epoch": 9.3394399761692, "grad_norm": 1.6071276664733887, "learning_rate": 3.2145173652664864e-05, "loss": 0.6947, "num_input_tokens_seen": 36400168, "step": 62705 }, { "epoch": 9.340184688710158, "grad_norm": 1.2380986213684082, "learning_rate": 3.21420597132765e-05, "loss": 0.6152, "num_input_tokens_seen": 36402920, "step": 62710 }, { "epoch": 9.340929401251117, "grad_norm": 0.6969045996665955, "learning_rate": 3.213894565323027e-05, "loss": 0.5127, "num_input_tokens_seen": 36406088, "step": 62715 }, { "epoch": 9.341674113792076, "grad_norm": 1.1469305753707886, "learning_rate": 3.213583147257883e-05, "loss": 0.6713, "num_input_tokens_seen": 36409256, "step": 62720 }, { "epoch": 9.342418826333036, "grad_norm": 0.770130455493927, "learning_rate": 3.213271717137475e-05, "loss": 0.6188, "num_input_tokens_seen": 36411880, "step": 62725 }, { "epoch": 9.343163538873995, "grad_norm": 1.0613259077072144, "learning_rate": 3.2129602749670674e-05, "loss": 0.4447, "num_input_tokens_seen": 36414760, "step": 62730 }, { "epoch": 9.343908251414954, "grad_norm": 1.4595372676849365, "learning_rate": 3.212648820751921e-05, "loss": 0.5874, "num_input_tokens_seen": 36417608, "step": 62735 }, { "epoch": 9.344652963955912, "grad_norm": 1.5542771816253662, "learning_rate": 3.212337354497296e-05, "loss": 0.6002, "num_input_tokens_seen": 36420648, "step": 62740 }, { "epoch": 9.345397676496873, "grad_norm": 0.8355757594108582, "learning_rate": 3.2120258762084565e-05, "loss": 0.4994, "num_input_tokens_seen": 36423432, "step": 62745 }, { "epoch": 9.346142389037832, "grad_norm": 1.5295403003692627, "learning_rate": 3.211714385890663e-05, "loss": 0.5504, "num_input_tokens_seen": 36426312, "step": 62750 }, { "epoch": 9.34688710157879, "grad_norm": 2.5869579315185547, "learning_rate": 3.2114028835491786e-05, "loss": 0.6349, "num_input_tokens_seen": 36429256, "step": 62755 }, { "epoch": 9.34763181411975, "grad_norm": 1.4232475757598877, "learning_rate": 3.211091369189265e-05, "loss": 0.5213, "num_input_tokens_seen": 36432360, "step": 62760 }, { "epoch": 9.34837652666071, "grad_norm": 1.094157338142395, "learning_rate": 3.210779842816185e-05, "loss": 0.579, "num_input_tokens_seen": 36435400, "step": 62765 }, { "epoch": 9.349121239201668, "grad_norm": 1.9120655059814453, "learning_rate": 3.2104683044352025e-05, "loss": 0.7518, "num_input_tokens_seen": 36438312, "step": 62770 }, { "epoch": 9.349865951742627, "grad_norm": 1.6146538257598877, "learning_rate": 3.210156754051581e-05, "loss": 0.6635, "num_input_tokens_seen": 36441256, "step": 62775 }, { "epoch": 9.350610664283586, "grad_norm": 1.2598110437393188, "learning_rate": 3.2098451916705815e-05, "loss": 0.6055, "num_input_tokens_seen": 36444168, "step": 62780 }, { "epoch": 9.351355376824547, "grad_norm": 1.4692747592926025, "learning_rate": 3.20953361729747e-05, "loss": 0.6403, "num_input_tokens_seen": 36446760, "step": 62785 }, { "epoch": 9.352100089365505, "grad_norm": 1.7504650354385376, "learning_rate": 3.209222030937509e-05, "loss": 0.5693, "num_input_tokens_seen": 36449512, "step": 62790 }, { "epoch": 9.352844801906464, "grad_norm": 1.1621754169464111, "learning_rate": 3.208910432595962e-05, "loss": 0.705, "num_input_tokens_seen": 36452648, "step": 62795 }, { "epoch": 9.353589514447423, "grad_norm": 1.0929540395736694, "learning_rate": 3.208598822278094e-05, "loss": 0.6907, "num_input_tokens_seen": 36455432, "step": 62800 }, { "epoch": 9.354334226988382, "grad_norm": 1.3069862127304077, "learning_rate": 3.208287199989169e-05, "loss": 0.5527, "num_input_tokens_seen": 36458600, "step": 62805 }, { "epoch": 9.355078939529342, "grad_norm": 1.083594560623169, "learning_rate": 3.207975565734452e-05, "loss": 0.7178, "num_input_tokens_seen": 36461544, "step": 62810 }, { "epoch": 9.3558236520703, "grad_norm": 1.8554108142852783, "learning_rate": 3.207663919519207e-05, "loss": 0.7465, "num_input_tokens_seen": 36464392, "step": 62815 }, { "epoch": 9.35656836461126, "grad_norm": 1.9469988346099854, "learning_rate": 3.2073522613486994e-05, "loss": 0.6309, "num_input_tokens_seen": 36467080, "step": 62820 }, { "epoch": 9.357313077152218, "grad_norm": 1.1957390308380127, "learning_rate": 3.207040591228194e-05, "loss": 0.5673, "num_input_tokens_seen": 36469896, "step": 62825 }, { "epoch": 9.358057789693179, "grad_norm": 1.9734395742416382, "learning_rate": 3.206728909162957e-05, "loss": 0.9943, "num_input_tokens_seen": 36472744, "step": 62830 }, { "epoch": 9.358802502234138, "grad_norm": 1.8459914922714233, "learning_rate": 3.206417215158253e-05, "loss": 0.7266, "num_input_tokens_seen": 36475368, "step": 62835 }, { "epoch": 9.359547214775096, "grad_norm": 1.7635425329208374, "learning_rate": 3.206105509219348e-05, "loss": 0.6016, "num_input_tokens_seen": 36478152, "step": 62840 }, { "epoch": 9.360291927316055, "grad_norm": 1.0369611978530884, "learning_rate": 3.205793791351509e-05, "loss": 0.658, "num_input_tokens_seen": 36480904, "step": 62845 }, { "epoch": 9.361036639857016, "grad_norm": 1.0154008865356445, "learning_rate": 3.2054820615600003e-05, "loss": 0.5492, "num_input_tokens_seen": 36483784, "step": 62850 }, { "epoch": 9.361781352397974, "grad_norm": 0.9044551849365234, "learning_rate": 3.2051703198500896e-05, "loss": 0.6255, "num_input_tokens_seen": 36486792, "step": 62855 }, { "epoch": 9.362526064938933, "grad_norm": 1.2127388715744019, "learning_rate": 3.2048585662270425e-05, "loss": 0.7282, "num_input_tokens_seen": 36489832, "step": 62860 }, { "epoch": 9.363270777479892, "grad_norm": 2.2246286869049072, "learning_rate": 3.204546800696127e-05, "loss": 0.6899, "num_input_tokens_seen": 36492744, "step": 62865 }, { "epoch": 9.364015490020853, "grad_norm": 1.0179082155227661, "learning_rate": 3.2042350232626086e-05, "loss": 0.5232, "num_input_tokens_seen": 36495432, "step": 62870 }, { "epoch": 9.364760202561811, "grad_norm": 0.9410451650619507, "learning_rate": 3.203923233931757e-05, "loss": 0.5664, "num_input_tokens_seen": 36497992, "step": 62875 }, { "epoch": 9.36550491510277, "grad_norm": 1.299551248550415, "learning_rate": 3.2036114327088354e-05, "loss": 0.6632, "num_input_tokens_seen": 36500648, "step": 62880 }, { "epoch": 9.366249627643729, "grad_norm": 1.212566614151001, "learning_rate": 3.203299619599115e-05, "loss": 0.5377, "num_input_tokens_seen": 36503784, "step": 62885 }, { "epoch": 9.36699434018469, "grad_norm": 0.9529605507850647, "learning_rate": 3.2029877946078624e-05, "loss": 0.7201, "num_input_tokens_seen": 36506376, "step": 62890 }, { "epoch": 9.367739052725648, "grad_norm": 1.616058349609375, "learning_rate": 3.2026759577403445e-05, "loss": 0.6475, "num_input_tokens_seen": 36509480, "step": 62895 }, { "epoch": 9.368483765266607, "grad_norm": 1.228770136833191, "learning_rate": 3.202364109001831e-05, "loss": 0.5548, "num_input_tokens_seen": 36512264, "step": 62900 }, { "epoch": 9.369228477807566, "grad_norm": 1.120156168937683, "learning_rate": 3.2020522483975906e-05, "loss": 0.6113, "num_input_tokens_seen": 36515080, "step": 62905 }, { "epoch": 9.369973190348526, "grad_norm": 1.1130186319351196, "learning_rate": 3.201740375932891e-05, "loss": 0.6461, "num_input_tokens_seen": 36517896, "step": 62910 }, { "epoch": 9.370717902889485, "grad_norm": 1.5928797721862793, "learning_rate": 3.201428491613e-05, "loss": 0.6593, "num_input_tokens_seen": 36520552, "step": 62915 }, { "epoch": 9.371462615430444, "grad_norm": 1.9174550771713257, "learning_rate": 3.2011165954431873e-05, "loss": 0.5973, "num_input_tokens_seen": 36523464, "step": 62920 }, { "epoch": 9.372207327971402, "grad_norm": 1.0899975299835205, "learning_rate": 3.200804687428724e-05, "loss": 0.6007, "num_input_tokens_seen": 36526216, "step": 62925 }, { "epoch": 9.372952040512363, "grad_norm": 1.513472080230713, "learning_rate": 3.200492767574876e-05, "loss": 0.6321, "num_input_tokens_seen": 36529032, "step": 62930 }, { "epoch": 9.373696753053322, "grad_norm": 1.4348698854446411, "learning_rate": 3.200180835886915e-05, "loss": 0.6835, "num_input_tokens_seen": 36531592, "step": 62935 }, { "epoch": 9.37444146559428, "grad_norm": 1.134659767150879, "learning_rate": 3.199868892370111e-05, "loss": 0.4555, "num_input_tokens_seen": 36534760, "step": 62940 }, { "epoch": 9.37518617813524, "grad_norm": 1.530044674873352, "learning_rate": 3.199556937029734e-05, "loss": 0.5353, "num_input_tokens_seen": 36537736, "step": 62945 }, { "epoch": 9.3759308906762, "grad_norm": 2.1403822898864746, "learning_rate": 3.199244969871052e-05, "loss": 0.642, "num_input_tokens_seen": 36540680, "step": 62950 }, { "epoch": 9.376675603217159, "grad_norm": 0.7881371974945068, "learning_rate": 3.198932990899337e-05, "loss": 0.6187, "num_input_tokens_seen": 36543144, "step": 62955 }, { "epoch": 9.377420315758117, "grad_norm": 1.40399169921875, "learning_rate": 3.19862100011986e-05, "loss": 0.4971, "num_input_tokens_seen": 36546088, "step": 62960 }, { "epoch": 9.378165028299076, "grad_norm": 0.9945307374000549, "learning_rate": 3.198308997537891e-05, "loss": 0.5851, "num_input_tokens_seen": 36549384, "step": 62965 }, { "epoch": 9.378909740840037, "grad_norm": 1.0062105655670166, "learning_rate": 3.1979969831587014e-05, "loss": 0.4428, "num_input_tokens_seen": 36552424, "step": 62970 }, { "epoch": 9.379654453380995, "grad_norm": 0.8670439124107361, "learning_rate": 3.1976849569875624e-05, "loss": 0.3973, "num_input_tokens_seen": 36555432, "step": 62975 }, { "epoch": 9.380399165921954, "grad_norm": 1.3073997497558594, "learning_rate": 3.197372919029745e-05, "loss": 0.4716, "num_input_tokens_seen": 36558632, "step": 62980 }, { "epoch": 9.381143878462913, "grad_norm": 1.0182383060455322, "learning_rate": 3.1970608692905216e-05, "loss": 0.6246, "num_input_tokens_seen": 36561768, "step": 62985 }, { "epoch": 9.381888591003872, "grad_norm": 1.068842887878418, "learning_rate": 3.196748807775162e-05, "loss": 0.5825, "num_input_tokens_seen": 36564488, "step": 62990 }, { "epoch": 9.382633303544832, "grad_norm": 0.9734368920326233, "learning_rate": 3.19643673448894e-05, "loss": 0.6051, "num_input_tokens_seen": 36567240, "step": 62995 }, { "epoch": 9.383378016085791, "grad_norm": 3.462742567062378, "learning_rate": 3.1961246494371275e-05, "loss": 0.8154, "num_input_tokens_seen": 36570248, "step": 63000 }, { "epoch": 9.38412272862675, "grad_norm": 1.192917823791504, "learning_rate": 3.195812552624996e-05, "loss": 0.5901, "num_input_tokens_seen": 36573000, "step": 63005 }, { "epoch": 9.384867441167708, "grad_norm": 1.06881844997406, "learning_rate": 3.1955004440578196e-05, "loss": 0.6856, "num_input_tokens_seen": 36576008, "step": 63010 }, { "epoch": 9.385612153708669, "grad_norm": 2.5991365909576416, "learning_rate": 3.195188323740869e-05, "loss": 0.8368, "num_input_tokens_seen": 36578696, "step": 63015 }, { "epoch": 9.386356866249628, "grad_norm": 1.0938276052474976, "learning_rate": 3.194876191679418e-05, "loss": 0.557, "num_input_tokens_seen": 36581448, "step": 63020 }, { "epoch": 9.387101578790586, "grad_norm": 1.5684630870819092, "learning_rate": 3.194564047878742e-05, "loss": 0.5905, "num_input_tokens_seen": 36584456, "step": 63025 }, { "epoch": 9.387846291331545, "grad_norm": 1.0167555809020996, "learning_rate": 3.19425189234411e-05, "loss": 0.6234, "num_input_tokens_seen": 36587528, "step": 63030 }, { "epoch": 9.388591003872506, "grad_norm": 2.762622833251953, "learning_rate": 3.193939725080799e-05, "loss": 0.5929, "num_input_tokens_seen": 36590408, "step": 63035 }, { "epoch": 9.389335716413465, "grad_norm": 1.1806265115737915, "learning_rate": 3.1936275460940815e-05, "loss": 0.4906, "num_input_tokens_seen": 36593160, "step": 63040 }, { "epoch": 9.390080428954423, "grad_norm": 1.4781522750854492, "learning_rate": 3.193315355389231e-05, "loss": 0.6097, "num_input_tokens_seen": 36596008, "step": 63045 }, { "epoch": 9.390825141495382, "grad_norm": 1.8031896352767944, "learning_rate": 3.1930031529715234e-05, "loss": 0.636, "num_input_tokens_seen": 36598696, "step": 63050 }, { "epoch": 9.391569854036343, "grad_norm": 1.7063487768173218, "learning_rate": 3.192690938846231e-05, "loss": 0.7121, "num_input_tokens_seen": 36601640, "step": 63055 }, { "epoch": 9.392314566577301, "grad_norm": 3.4969987869262695, "learning_rate": 3.1923787130186286e-05, "loss": 0.5926, "num_input_tokens_seen": 36604424, "step": 63060 }, { "epoch": 9.39305927911826, "grad_norm": 1.2556202411651611, "learning_rate": 3.1920664754939936e-05, "loss": 0.6062, "num_input_tokens_seen": 36607272, "step": 63065 }, { "epoch": 9.393803991659219, "grad_norm": 1.5533658266067505, "learning_rate": 3.1917542262775975e-05, "loss": 0.5721, "num_input_tokens_seen": 36610120, "step": 63070 }, { "epoch": 9.39454870420018, "grad_norm": 1.3788509368896484, "learning_rate": 3.191441965374717e-05, "loss": 0.5705, "num_input_tokens_seen": 36612904, "step": 63075 }, { "epoch": 9.395293416741138, "grad_norm": 1.0579721927642822, "learning_rate": 3.191129692790627e-05, "loss": 0.5953, "num_input_tokens_seen": 36615752, "step": 63080 }, { "epoch": 9.396038129282097, "grad_norm": 1.324992060661316, "learning_rate": 3.190817408530604e-05, "loss": 0.4458, "num_input_tokens_seen": 36618888, "step": 63085 }, { "epoch": 9.396782841823056, "grad_norm": 0.8390823602676392, "learning_rate": 3.190505112599922e-05, "loss": 0.4154, "num_input_tokens_seen": 36621896, "step": 63090 }, { "epoch": 9.397527554364016, "grad_norm": 1.0983307361602783, "learning_rate": 3.190192805003858e-05, "loss": 0.5887, "num_input_tokens_seen": 36624712, "step": 63095 }, { "epoch": 9.398272266904975, "grad_norm": 0.923041582107544, "learning_rate": 3.189880485747688e-05, "loss": 0.5367, "num_input_tokens_seen": 36627400, "step": 63100 }, { "epoch": 9.399016979445934, "grad_norm": 1.3418391942977905, "learning_rate": 3.1895681548366896e-05, "loss": 0.7238, "num_input_tokens_seen": 36630312, "step": 63105 }, { "epoch": 9.399761691986892, "grad_norm": 0.9938468337059021, "learning_rate": 3.189255812276137e-05, "loss": 0.5692, "num_input_tokens_seen": 36633192, "step": 63110 }, { "epoch": 9.400506404527853, "grad_norm": 1.9627636671066284, "learning_rate": 3.188943458071308e-05, "loss": 0.6131, "num_input_tokens_seen": 36636008, "step": 63115 }, { "epoch": 9.401251117068812, "grad_norm": 1.793459177017212, "learning_rate": 3.18863109222748e-05, "loss": 0.6021, "num_input_tokens_seen": 36638984, "step": 63120 }, { "epoch": 9.40199582960977, "grad_norm": 3.2425155639648438, "learning_rate": 3.188318714749929e-05, "loss": 0.8765, "num_input_tokens_seen": 36641960, "step": 63125 }, { "epoch": 9.40274054215073, "grad_norm": 1.0558325052261353, "learning_rate": 3.188006325643934e-05, "loss": 0.5987, "num_input_tokens_seen": 36644776, "step": 63130 }, { "epoch": 9.40348525469169, "grad_norm": 1.627044439315796, "learning_rate": 3.1876939249147694e-05, "loss": 0.7259, "num_input_tokens_seen": 36647560, "step": 63135 }, { "epoch": 9.404229967232649, "grad_norm": 1.174651026725769, "learning_rate": 3.187381512567717e-05, "loss": 0.6818, "num_input_tokens_seen": 36650536, "step": 63140 }, { "epoch": 9.404974679773607, "grad_norm": 2.22175669670105, "learning_rate": 3.1870690886080515e-05, "loss": 0.6755, "num_input_tokens_seen": 36653288, "step": 63145 }, { "epoch": 9.405719392314566, "grad_norm": 1.5894627571105957, "learning_rate": 3.186756653041053e-05, "loss": 0.5129, "num_input_tokens_seen": 36656424, "step": 63150 }, { "epoch": 9.406464104855527, "grad_norm": 1.0958807468414307, "learning_rate": 3.186444205871997e-05, "loss": 0.7142, "num_input_tokens_seen": 36659368, "step": 63155 }, { "epoch": 9.407208817396485, "grad_norm": 2.729276657104492, "learning_rate": 3.186131747106165e-05, "loss": 0.5873, "num_input_tokens_seen": 36662408, "step": 63160 }, { "epoch": 9.407953529937444, "grad_norm": 1.6174737215042114, "learning_rate": 3.1858192767488345e-05, "loss": 0.5418, "num_input_tokens_seen": 36665576, "step": 63165 }, { "epoch": 9.408698242478403, "grad_norm": 1.6023215055465698, "learning_rate": 3.185506794805284e-05, "loss": 0.4503, "num_input_tokens_seen": 36668360, "step": 63170 }, { "epoch": 9.409442955019362, "grad_norm": 0.9851825833320618, "learning_rate": 3.1851943012807934e-05, "loss": 0.7473, "num_input_tokens_seen": 36671400, "step": 63175 }, { "epoch": 9.410187667560322, "grad_norm": 0.7184407711029053, "learning_rate": 3.184881796180641e-05, "loss": 0.5632, "num_input_tokens_seen": 36674344, "step": 63180 }, { "epoch": 9.410932380101281, "grad_norm": 1.1173282861709595, "learning_rate": 3.184569279510107e-05, "loss": 0.3732, "num_input_tokens_seen": 36677544, "step": 63185 }, { "epoch": 9.41167709264224, "grad_norm": 0.8523809909820557, "learning_rate": 3.184256751274471e-05, "loss": 0.6628, "num_input_tokens_seen": 36680744, "step": 63190 }, { "epoch": 9.412421805183198, "grad_norm": 0.9789317846298218, "learning_rate": 3.183944211479012e-05, "loss": 0.6103, "num_input_tokens_seen": 36683624, "step": 63195 }, { "epoch": 9.413166517724159, "grad_norm": 1.6742140054702759, "learning_rate": 3.183631660129011e-05, "loss": 0.4143, "num_input_tokens_seen": 36686376, "step": 63200 }, { "epoch": 9.413911230265118, "grad_norm": 1.6403207778930664, "learning_rate": 3.183319097229748e-05, "loss": 0.6526, "num_input_tokens_seen": 36689544, "step": 63205 }, { "epoch": 9.414655942806077, "grad_norm": 1.1146845817565918, "learning_rate": 3.183006522786504e-05, "loss": 0.4314, "num_input_tokens_seen": 36692200, "step": 63210 }, { "epoch": 9.415400655347035, "grad_norm": 1.9119675159454346, "learning_rate": 3.182693936804558e-05, "loss": 0.6317, "num_input_tokens_seen": 36695400, "step": 63215 }, { "epoch": 9.416145367887996, "grad_norm": 1.8937010765075684, "learning_rate": 3.182381339289192e-05, "loss": 0.6392, "num_input_tokens_seen": 36698216, "step": 63220 }, { "epoch": 9.416890080428955, "grad_norm": 1.4967890977859497, "learning_rate": 3.182068730245686e-05, "loss": 0.7353, "num_input_tokens_seen": 36701288, "step": 63225 }, { "epoch": 9.417634792969913, "grad_norm": 1.5539498329162598, "learning_rate": 3.181756109679324e-05, "loss": 0.7459, "num_input_tokens_seen": 36704072, "step": 63230 }, { "epoch": 9.418379505510872, "grad_norm": 1.6648966073989868, "learning_rate": 3.1814434775953837e-05, "loss": 0.7745, "num_input_tokens_seen": 36707080, "step": 63235 }, { "epoch": 9.419124218051833, "grad_norm": 1.685107707977295, "learning_rate": 3.18113083399915e-05, "loss": 0.6554, "num_input_tokens_seen": 36709800, "step": 63240 }, { "epoch": 9.419868930592791, "grad_norm": 1.089268445968628, "learning_rate": 3.180818178895901e-05, "loss": 0.7109, "num_input_tokens_seen": 36712328, "step": 63245 }, { "epoch": 9.42061364313375, "grad_norm": 1.7198961973190308, "learning_rate": 3.180505512290922e-05, "loss": 0.621, "num_input_tokens_seen": 36715560, "step": 63250 }, { "epoch": 9.421358355674709, "grad_norm": 1.1711021661758423, "learning_rate": 3.1801928341894943e-05, "loss": 0.6681, "num_input_tokens_seen": 36718248, "step": 63255 }, { "epoch": 9.42210306821567, "grad_norm": 2.805960178375244, "learning_rate": 3.1798801445968993e-05, "loss": 0.6614, "num_input_tokens_seen": 36720936, "step": 63260 }, { "epoch": 9.422847780756628, "grad_norm": 1.9374510049819946, "learning_rate": 3.179567443518421e-05, "loss": 0.5636, "num_input_tokens_seen": 36723688, "step": 63265 }, { "epoch": 9.423592493297587, "grad_norm": 2.5404574871063232, "learning_rate": 3.1792547309593415e-05, "loss": 0.7917, "num_input_tokens_seen": 36726664, "step": 63270 }, { "epoch": 9.424337205838546, "grad_norm": 3.6566944122314453, "learning_rate": 3.178942006924943e-05, "loss": 0.6726, "num_input_tokens_seen": 36729448, "step": 63275 }, { "epoch": 9.425081918379506, "grad_norm": 1.1462196111679077, "learning_rate": 3.17862927142051e-05, "loss": 0.5817, "num_input_tokens_seen": 36732104, "step": 63280 }, { "epoch": 9.425826630920465, "grad_norm": 1.6407581567764282, "learning_rate": 3.178316524451325e-05, "loss": 0.7221, "num_input_tokens_seen": 36735080, "step": 63285 }, { "epoch": 9.426571343461424, "grad_norm": 1.1839641332626343, "learning_rate": 3.178003766022671e-05, "loss": 0.5819, "num_input_tokens_seen": 36738024, "step": 63290 }, { "epoch": 9.427316056002383, "grad_norm": 1.5864282846450806, "learning_rate": 3.177690996139833e-05, "loss": 0.5824, "num_input_tokens_seen": 36740904, "step": 63295 }, { "epoch": 9.428060768543343, "grad_norm": 1.351272702217102, "learning_rate": 3.177378214808094e-05, "loss": 0.5046, "num_input_tokens_seen": 36744040, "step": 63300 }, { "epoch": 9.428805481084302, "grad_norm": 1.216101050376892, "learning_rate": 3.177065422032739e-05, "loss": 0.635, "num_input_tokens_seen": 36746888, "step": 63305 }, { "epoch": 9.42955019362526, "grad_norm": 1.5418564081192017, "learning_rate": 3.176752617819052e-05, "loss": 0.7616, "num_input_tokens_seen": 36749736, "step": 63310 }, { "epoch": 9.43029490616622, "grad_norm": 1.35719895362854, "learning_rate": 3.1764398021723175e-05, "loss": 0.6063, "num_input_tokens_seen": 36752584, "step": 63315 }, { "epoch": 9.43103961870718, "grad_norm": 1.6943752765655518, "learning_rate": 3.17612697509782e-05, "loss": 0.7187, "num_input_tokens_seen": 36755784, "step": 63320 }, { "epoch": 9.431784331248139, "grad_norm": 0.9579237103462219, "learning_rate": 3.1758141366008434e-05, "loss": 0.6511, "num_input_tokens_seen": 36758600, "step": 63325 }, { "epoch": 9.432529043789097, "grad_norm": 1.230526328086853, "learning_rate": 3.175501286686674e-05, "loss": 0.6452, "num_input_tokens_seen": 36761672, "step": 63330 }, { "epoch": 9.433273756330056, "grad_norm": 1.697403073310852, "learning_rate": 3.1751884253605974e-05, "loss": 0.5581, "num_input_tokens_seen": 36764680, "step": 63335 }, { "epoch": 9.434018468871017, "grad_norm": 1.8764188289642334, "learning_rate": 3.174875552627899e-05, "loss": 0.6892, "num_input_tokens_seen": 36767304, "step": 63340 }, { "epoch": 9.434763181411975, "grad_norm": 1.8326327800750732, "learning_rate": 3.174562668493863e-05, "loss": 0.6695, "num_input_tokens_seen": 36770216, "step": 63345 }, { "epoch": 9.435507893952934, "grad_norm": 0.8978216052055359, "learning_rate": 3.1742497729637774e-05, "loss": 0.775, "num_input_tokens_seen": 36773096, "step": 63350 }, { "epoch": 9.436252606493893, "grad_norm": 2.839362382888794, "learning_rate": 3.173936866042927e-05, "loss": 0.722, "num_input_tokens_seen": 36775976, "step": 63355 }, { "epoch": 9.436997319034852, "grad_norm": 2.7523772716522217, "learning_rate": 3.173623947736597e-05, "loss": 0.5847, "num_input_tokens_seen": 36779016, "step": 63360 }, { "epoch": 9.437742031575812, "grad_norm": 1.102399230003357, "learning_rate": 3.1733110180500766e-05, "loss": 0.4096, "num_input_tokens_seen": 36781768, "step": 63365 }, { "epoch": 9.438486744116771, "grad_norm": 2.3928275108337402, "learning_rate": 3.17299807698865e-05, "loss": 0.6494, "num_input_tokens_seen": 36784840, "step": 63370 }, { "epoch": 9.43923145665773, "grad_norm": 1.0636498928070068, "learning_rate": 3.1726851245576056e-05, "loss": 0.6004, "num_input_tokens_seen": 36787816, "step": 63375 }, { "epoch": 9.439976169198689, "grad_norm": 1.6978862285614014, "learning_rate": 3.172372160762229e-05, "loss": 0.6925, "num_input_tokens_seen": 36790696, "step": 63380 }, { "epoch": 9.440720881739649, "grad_norm": 2.4333746433258057, "learning_rate": 3.172059185607808e-05, "loss": 0.6944, "num_input_tokens_seen": 36793768, "step": 63385 }, { "epoch": 9.441465594280608, "grad_norm": 0.9602773785591125, "learning_rate": 3.171746199099631e-05, "loss": 0.5844, "num_input_tokens_seen": 36796680, "step": 63390 }, { "epoch": 9.442210306821567, "grad_norm": 1.0226565599441528, "learning_rate": 3.171433201242984e-05, "loss": 0.7342, "num_input_tokens_seen": 36799656, "step": 63395 }, { "epoch": 9.442955019362525, "grad_norm": 1.0883586406707764, "learning_rate": 3.1711201920431556e-05, "loss": 0.5316, "num_input_tokens_seen": 36802376, "step": 63400 }, { "epoch": 9.443699731903486, "grad_norm": 1.6659390926361084, "learning_rate": 3.170807171505434e-05, "loss": 0.7942, "num_input_tokens_seen": 36804968, "step": 63405 }, { "epoch": 9.444444444444445, "grad_norm": 1.945521593093872, "learning_rate": 3.1704941396351064e-05, "loss": 0.5029, "num_input_tokens_seen": 36808136, "step": 63410 }, { "epoch": 9.445189156985403, "grad_norm": 0.9378041625022888, "learning_rate": 3.1701810964374626e-05, "loss": 0.6824, "num_input_tokens_seen": 36810920, "step": 63415 }, { "epoch": 9.445933869526362, "grad_norm": 1.8709155321121216, "learning_rate": 3.169868041917789e-05, "loss": 0.644, "num_input_tokens_seen": 36813992, "step": 63420 }, { "epoch": 9.446678582067323, "grad_norm": 2.7219161987304688, "learning_rate": 3.1695549760813764e-05, "loss": 0.6632, "num_input_tokens_seen": 36816744, "step": 63425 }, { "epoch": 9.447423294608281, "grad_norm": 1.4563895463943481, "learning_rate": 3.169241898933514e-05, "loss": 0.6534, "num_input_tokens_seen": 36819880, "step": 63430 }, { "epoch": 9.44816800714924, "grad_norm": 0.8321465253829956, "learning_rate": 3.168928810479488e-05, "loss": 0.5056, "num_input_tokens_seen": 36822920, "step": 63435 }, { "epoch": 9.448912719690199, "grad_norm": 1.559900164604187, "learning_rate": 3.1686157107245915e-05, "loss": 0.6509, "num_input_tokens_seen": 36825960, "step": 63440 }, { "epoch": 9.44965743223116, "grad_norm": 3.629969358444214, "learning_rate": 3.1683025996741104e-05, "loss": 0.7057, "num_input_tokens_seen": 36829064, "step": 63445 }, { "epoch": 9.450402144772118, "grad_norm": 1.8735108375549316, "learning_rate": 3.167989477333337e-05, "loss": 0.7207, "num_input_tokens_seen": 36832040, "step": 63450 }, { "epoch": 9.451146857313077, "grad_norm": 1.098414421081543, "learning_rate": 3.16767634370756e-05, "loss": 0.8404, "num_input_tokens_seen": 36834952, "step": 63455 }, { "epoch": 9.451891569854036, "grad_norm": 1.332224726676941, "learning_rate": 3.16736319880207e-05, "loss": 0.4571, "num_input_tokens_seen": 36837832, "step": 63460 }, { "epoch": 9.452636282394996, "grad_norm": 1.2715057134628296, "learning_rate": 3.1670500426221566e-05, "loss": 0.4989, "num_input_tokens_seen": 36840904, "step": 63465 }, { "epoch": 9.453380994935955, "grad_norm": 2.4468095302581787, "learning_rate": 3.1667368751731116e-05, "loss": 0.6147, "num_input_tokens_seen": 36843688, "step": 63470 }, { "epoch": 9.454125707476914, "grad_norm": 1.7006462812423706, "learning_rate": 3.1664236964602244e-05, "loss": 0.4507, "num_input_tokens_seen": 36846440, "step": 63475 }, { "epoch": 9.454870420017873, "grad_norm": 0.840989887714386, "learning_rate": 3.166110506488786e-05, "loss": 0.5132, "num_input_tokens_seen": 36849416, "step": 63480 }, { "epoch": 9.455615132558833, "grad_norm": 0.7554431557655334, "learning_rate": 3.165797305264087e-05, "loss": 0.7484, "num_input_tokens_seen": 36852456, "step": 63485 }, { "epoch": 9.456359845099792, "grad_norm": 3.451772928237915, "learning_rate": 3.1654840927914196e-05, "loss": 0.5704, "num_input_tokens_seen": 36855368, "step": 63490 }, { "epoch": 9.45710455764075, "grad_norm": 2.1164627075195312, "learning_rate": 3.165170869076075e-05, "loss": 0.7145, "num_input_tokens_seen": 36858088, "step": 63495 }, { "epoch": 9.45784927018171, "grad_norm": 1.8791695833206177, "learning_rate": 3.164857634123345e-05, "loss": 0.5492, "num_input_tokens_seen": 36860808, "step": 63500 }, { "epoch": 9.458593982722668, "grad_norm": 1.2456042766571045, "learning_rate": 3.1645443879385206e-05, "loss": 0.4901, "num_input_tokens_seen": 36863624, "step": 63505 }, { "epoch": 9.459338695263629, "grad_norm": 1.6221644878387451, "learning_rate": 3.164231130526894e-05, "loss": 0.646, "num_input_tokens_seen": 36866248, "step": 63510 }, { "epoch": 9.460083407804587, "grad_norm": 1.4530160427093506, "learning_rate": 3.163917861893758e-05, "loss": 0.8398, "num_input_tokens_seen": 36868936, "step": 63515 }, { "epoch": 9.460828120345546, "grad_norm": 1.2619366645812988, "learning_rate": 3.1636045820444044e-05, "loss": 0.3991, "num_input_tokens_seen": 36871784, "step": 63520 }, { "epoch": 9.461572832886505, "grad_norm": 1.4433382749557495, "learning_rate": 3.163291290984125e-05, "loss": 0.652, "num_input_tokens_seen": 36874984, "step": 63525 }, { "epoch": 9.462317545427466, "grad_norm": 1.1655975580215454, "learning_rate": 3.162977988718214e-05, "loss": 0.6028, "num_input_tokens_seen": 36877736, "step": 63530 }, { "epoch": 9.463062257968424, "grad_norm": 1.0152806043624878, "learning_rate": 3.162664675251965e-05, "loss": 0.5087, "num_input_tokens_seen": 36880360, "step": 63535 }, { "epoch": 9.463806970509383, "grad_norm": 1.289224624633789, "learning_rate": 3.162351350590668e-05, "loss": 0.7209, "num_input_tokens_seen": 36883208, "step": 63540 }, { "epoch": 9.464551683050342, "grad_norm": 1.412196159362793, "learning_rate": 3.1620380147396186e-05, "loss": 0.4456, "num_input_tokens_seen": 36886184, "step": 63545 }, { "epoch": 9.465296395591302, "grad_norm": 2.696903705596924, "learning_rate": 3.1617246677041104e-05, "loss": 0.6703, "num_input_tokens_seen": 36889160, "step": 63550 }, { "epoch": 9.466041108132261, "grad_norm": 2.086404323577881, "learning_rate": 3.161411309489436e-05, "loss": 0.7228, "num_input_tokens_seen": 36892040, "step": 63555 }, { "epoch": 9.46678582067322, "grad_norm": 0.9307975172996521, "learning_rate": 3.161097940100889e-05, "loss": 0.7273, "num_input_tokens_seen": 36894856, "step": 63560 }, { "epoch": 9.467530533214179, "grad_norm": 0.9042707681655884, "learning_rate": 3.160784559543765e-05, "loss": 0.7372, "num_input_tokens_seen": 36897768, "step": 63565 }, { "epoch": 9.46827524575514, "grad_norm": 1.5999170541763306, "learning_rate": 3.160471167823358e-05, "loss": 0.7045, "num_input_tokens_seen": 36900616, "step": 63570 }, { "epoch": 9.469019958296098, "grad_norm": 1.5685958862304688, "learning_rate": 3.1601577649449606e-05, "loss": 0.5245, "num_input_tokens_seen": 36903592, "step": 63575 }, { "epoch": 9.469764670837057, "grad_norm": 1.700575828552246, "learning_rate": 3.15984435091387e-05, "loss": 1.076, "num_input_tokens_seen": 36906472, "step": 63580 }, { "epoch": 9.470509383378015, "grad_norm": 0.650607705116272, "learning_rate": 3.159530925735379e-05, "loss": 0.6406, "num_input_tokens_seen": 36909896, "step": 63585 }, { "epoch": 9.471254095918976, "grad_norm": 2.1968092918395996, "learning_rate": 3.1592174894147835e-05, "loss": 0.5282, "num_input_tokens_seen": 36912840, "step": 63590 }, { "epoch": 9.471998808459935, "grad_norm": 1.2802234888076782, "learning_rate": 3.158904041957379e-05, "loss": 0.789, "num_input_tokens_seen": 36916040, "step": 63595 }, { "epoch": 9.472743521000893, "grad_norm": 1.7594245672225952, "learning_rate": 3.1585905833684595e-05, "loss": 0.4612, "num_input_tokens_seen": 36918824, "step": 63600 }, { "epoch": 9.473488233541852, "grad_norm": 1.0037580728530884, "learning_rate": 3.158277113653322e-05, "loss": 0.4162, "num_input_tokens_seen": 36921864, "step": 63605 }, { "epoch": 9.474232946082813, "grad_norm": 1.3947917222976685, "learning_rate": 3.157963632817261e-05, "loss": 0.5527, "num_input_tokens_seen": 36924456, "step": 63610 }, { "epoch": 9.474977658623772, "grad_norm": 0.9501282572746277, "learning_rate": 3.157650140865574e-05, "loss": 0.467, "num_input_tokens_seen": 36927368, "step": 63615 }, { "epoch": 9.47572237116473, "grad_norm": 1.920210599899292, "learning_rate": 3.157336637803556e-05, "loss": 0.7155, "num_input_tokens_seen": 36930760, "step": 63620 }, { "epoch": 9.476467083705689, "grad_norm": 1.5268194675445557, "learning_rate": 3.1570231236365035e-05, "loss": 0.7353, "num_input_tokens_seen": 36933448, "step": 63625 }, { "epoch": 9.47721179624665, "grad_norm": 1.2531874179840088, "learning_rate": 3.156709598369713e-05, "loss": 0.6252, "num_input_tokens_seen": 36936232, "step": 63630 }, { "epoch": 9.477956508787608, "grad_norm": 1.220199704170227, "learning_rate": 3.1563960620084816e-05, "loss": 0.6742, "num_input_tokens_seen": 36939080, "step": 63635 }, { "epoch": 9.478701221328567, "grad_norm": 1.710710883140564, "learning_rate": 3.1560825145581056e-05, "loss": 0.5776, "num_input_tokens_seen": 36942184, "step": 63640 }, { "epoch": 9.479445933869526, "grad_norm": 1.1199995279312134, "learning_rate": 3.155768956023882e-05, "loss": 0.5838, "num_input_tokens_seen": 36945192, "step": 63645 }, { "epoch": 9.480190646410486, "grad_norm": 2.060537815093994, "learning_rate": 3.155455386411109e-05, "loss": 0.612, "num_input_tokens_seen": 36948264, "step": 63650 }, { "epoch": 9.480935358951445, "grad_norm": 1.227771520614624, "learning_rate": 3.1551418057250835e-05, "loss": 0.7424, "num_input_tokens_seen": 36951240, "step": 63655 }, { "epoch": 9.481680071492404, "grad_norm": 2.4503796100616455, "learning_rate": 3.1548282139711025e-05, "loss": 0.9035, "num_input_tokens_seen": 36954152, "step": 63660 }, { "epoch": 9.482424784033363, "grad_norm": 1.2180633544921875, "learning_rate": 3.154514611154464e-05, "loss": 0.5467, "num_input_tokens_seen": 36957096, "step": 63665 }, { "epoch": 9.483169496574323, "grad_norm": 1.0210812091827393, "learning_rate": 3.154200997280468e-05, "loss": 0.4889, "num_input_tokens_seen": 36959976, "step": 63670 }, { "epoch": 9.483914209115282, "grad_norm": 1.2997359037399292, "learning_rate": 3.1538873723544105e-05, "loss": 0.5159, "num_input_tokens_seen": 36962664, "step": 63675 }, { "epoch": 9.48465892165624, "grad_norm": 2.5079009532928467, "learning_rate": 3.1535737363815896e-05, "loss": 0.7322, "num_input_tokens_seen": 36965416, "step": 63680 }, { "epoch": 9.4854036341972, "grad_norm": 1.428756594657898, "learning_rate": 3.1532600893673045e-05, "loss": 0.7091, "num_input_tokens_seen": 36968232, "step": 63685 }, { "epoch": 9.486148346738158, "grad_norm": 0.929376482963562, "learning_rate": 3.152946431316855e-05, "loss": 0.63, "num_input_tokens_seen": 36970952, "step": 63690 }, { "epoch": 9.486893059279119, "grad_norm": 1.0592621564865112, "learning_rate": 3.152632762235539e-05, "loss": 0.5987, "num_input_tokens_seen": 36973768, "step": 63695 }, { "epoch": 9.487637771820078, "grad_norm": 1.1649572849273682, "learning_rate": 3.152319082128656e-05, "loss": 0.4994, "num_input_tokens_seen": 36976712, "step": 63700 }, { "epoch": 9.488382484361036, "grad_norm": 1.3160697221755981, "learning_rate": 3.1520053910015046e-05, "loss": 0.6443, "num_input_tokens_seen": 36979592, "step": 63705 }, { "epoch": 9.489127196901995, "grad_norm": 3.3915317058563232, "learning_rate": 3.151691688859385e-05, "loss": 0.6546, "num_input_tokens_seen": 36982184, "step": 63710 }, { "epoch": 9.489871909442956, "grad_norm": 1.234057903289795, "learning_rate": 3.151377975707597e-05, "loss": 0.7039, "num_input_tokens_seen": 36984840, "step": 63715 }, { "epoch": 9.490616621983914, "grad_norm": 1.38738214969635, "learning_rate": 3.1510642515514393e-05, "loss": 0.6758, "num_input_tokens_seen": 36987944, "step": 63720 }, { "epoch": 9.491361334524873, "grad_norm": 0.9898858070373535, "learning_rate": 3.150750516396213e-05, "loss": 0.6026, "num_input_tokens_seen": 36990728, "step": 63725 }, { "epoch": 9.492106047065832, "grad_norm": 1.026850700378418, "learning_rate": 3.1504367702472185e-05, "loss": 0.5885, "num_input_tokens_seen": 36993480, "step": 63730 }, { "epoch": 9.492850759606792, "grad_norm": 1.070182204246521, "learning_rate": 3.150123013109756e-05, "loss": 0.7339, "num_input_tokens_seen": 36996104, "step": 63735 }, { "epoch": 9.493595472147751, "grad_norm": 1.5011467933654785, "learning_rate": 3.149809244989125e-05, "loss": 0.8309, "num_input_tokens_seen": 36998856, "step": 63740 }, { "epoch": 9.49434018468871, "grad_norm": 0.9215620160102844, "learning_rate": 3.149495465890628e-05, "loss": 0.6366, "num_input_tokens_seen": 37001448, "step": 63745 }, { "epoch": 9.495084897229669, "grad_norm": 2.9091765880584717, "learning_rate": 3.149181675819565e-05, "loss": 0.674, "num_input_tokens_seen": 37004360, "step": 63750 }, { "epoch": 9.49582960977063, "grad_norm": 2.2717044353485107, "learning_rate": 3.148867874781238e-05, "loss": 0.6937, "num_input_tokens_seen": 37006984, "step": 63755 }, { "epoch": 9.496574322311588, "grad_norm": 1.2969719171524048, "learning_rate": 3.148554062780947e-05, "loss": 0.5709, "num_input_tokens_seen": 37009960, "step": 63760 }, { "epoch": 9.497319034852547, "grad_norm": 2.145092725753784, "learning_rate": 3.148240239823994e-05, "loss": 0.6867, "num_input_tokens_seen": 37013000, "step": 63765 }, { "epoch": 9.498063747393505, "grad_norm": 1.266493320465088, "learning_rate": 3.147926405915682e-05, "loss": 0.5645, "num_input_tokens_seen": 37015784, "step": 63770 }, { "epoch": 9.498808459934466, "grad_norm": 1.3465347290039062, "learning_rate": 3.147612561061312e-05, "loss": 0.7523, "num_input_tokens_seen": 37018344, "step": 63775 }, { "epoch": 9.499553172475425, "grad_norm": 2.335782289505005, "learning_rate": 3.147298705266185e-05, "loss": 0.6126, "num_input_tokens_seen": 37021416, "step": 63780 }, { "epoch": 9.500297885016384, "grad_norm": 1.4895390272140503, "learning_rate": 3.146984838535604e-05, "loss": 0.8117, "num_input_tokens_seen": 37024552, "step": 63785 }, { "epoch": 9.501042597557342, "grad_norm": 1.2156716585159302, "learning_rate": 3.146670960874872e-05, "loss": 0.506, "num_input_tokens_seen": 37027464, "step": 63790 }, { "epoch": 9.501787310098303, "grad_norm": 2.6700973510742188, "learning_rate": 3.146357072289292e-05, "loss": 0.6076, "num_input_tokens_seen": 37030312, "step": 63795 }, { "epoch": 9.502532022639262, "grad_norm": 1.2503466606140137, "learning_rate": 3.146043172784166e-05, "loss": 0.7278, "num_input_tokens_seen": 37033128, "step": 63800 }, { "epoch": 9.50327673518022, "grad_norm": 1.1752551794052124, "learning_rate": 3.1457292623647976e-05, "loss": 0.6733, "num_input_tokens_seen": 37036040, "step": 63805 }, { "epoch": 9.504021447721179, "grad_norm": 1.0477495193481445, "learning_rate": 3.145415341036489e-05, "loss": 0.6553, "num_input_tokens_seen": 37038952, "step": 63810 }, { "epoch": 9.50476616026214, "grad_norm": 0.9890164136886597, "learning_rate": 3.1451014088045435e-05, "loss": 0.4189, "num_input_tokens_seen": 37041928, "step": 63815 }, { "epoch": 9.505510872803098, "grad_norm": 1.636123776435852, "learning_rate": 3.144787465674266e-05, "loss": 0.5756, "num_input_tokens_seen": 37044680, "step": 63820 }, { "epoch": 9.506255585344057, "grad_norm": 1.5890830755233765, "learning_rate": 3.14447351165096e-05, "loss": 0.5921, "num_input_tokens_seen": 37047784, "step": 63825 }, { "epoch": 9.507000297885016, "grad_norm": 1.116087794303894, "learning_rate": 3.1441595467399286e-05, "loss": 0.5766, "num_input_tokens_seen": 37050664, "step": 63830 }, { "epoch": 9.507745010425975, "grad_norm": 1.1401063203811646, "learning_rate": 3.143845570946477e-05, "loss": 0.8471, "num_input_tokens_seen": 37053640, "step": 63835 }, { "epoch": 9.508489722966935, "grad_norm": 1.4370098114013672, "learning_rate": 3.143531584275909e-05, "loss": 0.5548, "num_input_tokens_seen": 37056648, "step": 63840 }, { "epoch": 9.509234435507894, "grad_norm": 1.8714534044265747, "learning_rate": 3.1432175867335275e-05, "loss": 0.817, "num_input_tokens_seen": 37059560, "step": 63845 }, { "epoch": 9.509979148048853, "grad_norm": 1.0032634735107422, "learning_rate": 3.1429035783246395e-05, "loss": 0.5941, "num_input_tokens_seen": 37062728, "step": 63850 }, { "epoch": 9.510723860589813, "grad_norm": 1.1579302549362183, "learning_rate": 3.142589559054549e-05, "loss": 0.574, "num_input_tokens_seen": 37065672, "step": 63855 }, { "epoch": 9.511468573130772, "grad_norm": 0.9072567224502563, "learning_rate": 3.142275528928561e-05, "loss": 0.7616, "num_input_tokens_seen": 37068488, "step": 63860 }, { "epoch": 9.51221328567173, "grad_norm": 2.2711002826690674, "learning_rate": 3.141961487951981e-05, "loss": 0.7181, "num_input_tokens_seen": 37071240, "step": 63865 }, { "epoch": 9.51295799821269, "grad_norm": 1.8291722536087036, "learning_rate": 3.141647436130113e-05, "loss": 0.7295, "num_input_tokens_seen": 37074472, "step": 63870 }, { "epoch": 9.513702710753648, "grad_norm": 1.0053434371948242, "learning_rate": 3.1413333734682656e-05, "loss": 0.5859, "num_input_tokens_seen": 37077128, "step": 63875 }, { "epoch": 9.514447423294609, "grad_norm": 0.9208019971847534, "learning_rate": 3.141019299971741e-05, "loss": 0.711, "num_input_tokens_seen": 37080072, "step": 63880 }, { "epoch": 9.515192135835568, "grad_norm": 2.074321985244751, "learning_rate": 3.140705215645847e-05, "loss": 0.6906, "num_input_tokens_seen": 37082856, "step": 63885 }, { "epoch": 9.515936848376526, "grad_norm": 1.166447401046753, "learning_rate": 3.14039112049589e-05, "loss": 0.6874, "num_input_tokens_seen": 37085992, "step": 63890 }, { "epoch": 9.516681560917485, "grad_norm": 2.251046895980835, "learning_rate": 3.140077014527176e-05, "loss": 0.5314, "num_input_tokens_seen": 37088968, "step": 63895 }, { "epoch": 9.517426273458446, "grad_norm": 1.0587201118469238, "learning_rate": 3.139762897745011e-05, "loss": 0.6991, "num_input_tokens_seen": 37091976, "step": 63900 }, { "epoch": 9.518170985999404, "grad_norm": 1.9159992933273315, "learning_rate": 3.139448770154702e-05, "loss": 0.5484, "num_input_tokens_seen": 37095240, "step": 63905 }, { "epoch": 9.518915698540363, "grad_norm": 1.97203528881073, "learning_rate": 3.139134631761557e-05, "loss": 0.724, "num_input_tokens_seen": 37098376, "step": 63910 }, { "epoch": 9.519660411081322, "grad_norm": 2.0144145488739014, "learning_rate": 3.1388204825708815e-05, "loss": 0.6607, "num_input_tokens_seen": 37101352, "step": 63915 }, { "epoch": 9.520405123622282, "grad_norm": 0.650497317314148, "learning_rate": 3.138506322587982e-05, "loss": 0.5639, "num_input_tokens_seen": 37104104, "step": 63920 }, { "epoch": 9.521149836163241, "grad_norm": 3.0365283489227295, "learning_rate": 3.138192151818168e-05, "loss": 0.5669, "num_input_tokens_seen": 37106984, "step": 63925 }, { "epoch": 9.5218945487042, "grad_norm": 1.1743031740188599, "learning_rate": 3.137877970266746e-05, "loss": 0.6592, "num_input_tokens_seen": 37110024, "step": 63930 }, { "epoch": 9.522639261245159, "grad_norm": 1.1783711910247803, "learning_rate": 3.1375637779390244e-05, "loss": 0.6474, "num_input_tokens_seen": 37112872, "step": 63935 }, { "epoch": 9.52338397378612, "grad_norm": 1.4240161180496216, "learning_rate": 3.137249574840311e-05, "loss": 0.651, "num_input_tokens_seen": 37115656, "step": 63940 }, { "epoch": 9.524128686327078, "grad_norm": 1.6647874116897583, "learning_rate": 3.136935360975913e-05, "loss": 0.7193, "num_input_tokens_seen": 37118568, "step": 63945 }, { "epoch": 9.524873398868037, "grad_norm": 1.0398041009902954, "learning_rate": 3.1366211363511394e-05, "loss": 0.6122, "num_input_tokens_seen": 37121416, "step": 63950 }, { "epoch": 9.525618111408996, "grad_norm": 1.1203556060791016, "learning_rate": 3.1363069009712994e-05, "loss": 0.5838, "num_input_tokens_seen": 37124488, "step": 63955 }, { "epoch": 9.526362823949956, "grad_norm": 1.0524260997772217, "learning_rate": 3.1359926548417007e-05, "loss": 0.6738, "num_input_tokens_seen": 37127400, "step": 63960 }, { "epoch": 9.527107536490915, "grad_norm": 2.3184115886688232, "learning_rate": 3.135678397967652e-05, "loss": 0.6783, "num_input_tokens_seen": 37130152, "step": 63965 }, { "epoch": 9.527852249031874, "grad_norm": 1.282025694847107, "learning_rate": 3.135364130354464e-05, "loss": 0.5875, "num_input_tokens_seen": 37133096, "step": 63970 }, { "epoch": 9.528596961572832, "grad_norm": 1.1468710899353027, "learning_rate": 3.135049852007444e-05, "loss": 0.5067, "num_input_tokens_seen": 37136072, "step": 63975 }, { "epoch": 9.529341674113793, "grad_norm": 1.75310218334198, "learning_rate": 3.134735562931902e-05, "loss": 0.6227, "num_input_tokens_seen": 37139240, "step": 63980 }, { "epoch": 9.530086386654752, "grad_norm": 0.6173503994941711, "learning_rate": 3.1344212631331484e-05, "loss": 0.4794, "num_input_tokens_seen": 37142152, "step": 63985 }, { "epoch": 9.53083109919571, "grad_norm": 0.9354318976402283, "learning_rate": 3.134106952616491e-05, "loss": 0.5148, "num_input_tokens_seen": 37145000, "step": 63990 }, { "epoch": 9.53157581173667, "grad_norm": 1.1426905393600464, "learning_rate": 3.133792631387243e-05, "loss": 0.5227, "num_input_tokens_seen": 37148008, "step": 63995 }, { "epoch": 9.53232052427763, "grad_norm": 0.8495125770568848, "learning_rate": 3.133478299450712e-05, "loss": 0.4901, "num_input_tokens_seen": 37150664, "step": 64000 }, { "epoch": 9.533065236818588, "grad_norm": 1.3091927766799927, "learning_rate": 3.1331639568122084e-05, "loss": 0.6751, "num_input_tokens_seen": 37153768, "step": 64005 }, { "epoch": 9.533809949359547, "grad_norm": 2.127655029296875, "learning_rate": 3.132849603477044e-05, "loss": 0.5417, "num_input_tokens_seen": 37157000, "step": 64010 }, { "epoch": 9.534554661900506, "grad_norm": 0.9145933985710144, "learning_rate": 3.132535239450528e-05, "loss": 0.6798, "num_input_tokens_seen": 37160072, "step": 64015 }, { "epoch": 9.535299374441465, "grad_norm": 1.5242749452590942, "learning_rate": 3.1322208647379724e-05, "loss": 0.5516, "num_input_tokens_seen": 37162856, "step": 64020 }, { "epoch": 9.536044086982425, "grad_norm": 1.4593523740768433, "learning_rate": 3.1319064793446876e-05, "loss": 0.5888, "num_input_tokens_seen": 37166056, "step": 64025 }, { "epoch": 9.536788799523384, "grad_norm": 0.9288761019706726, "learning_rate": 3.131592083275986e-05, "loss": 0.5592, "num_input_tokens_seen": 37169128, "step": 64030 }, { "epoch": 9.537533512064343, "grad_norm": 1.3417203426361084, "learning_rate": 3.1312776765371765e-05, "loss": 0.665, "num_input_tokens_seen": 37172136, "step": 64035 }, { "epoch": 9.538278224605303, "grad_norm": 1.2168818712234497, "learning_rate": 3.1309632591335734e-05, "loss": 0.5787, "num_input_tokens_seen": 37175272, "step": 64040 }, { "epoch": 9.539022937146262, "grad_norm": 1.8267927169799805, "learning_rate": 3.1306488310704875e-05, "loss": 0.6304, "num_input_tokens_seen": 37178216, "step": 64045 }, { "epoch": 9.53976764968722, "grad_norm": 1.2269526720046997, "learning_rate": 3.1303343923532294e-05, "loss": 0.5028, "num_input_tokens_seen": 37181064, "step": 64050 }, { "epoch": 9.54051236222818, "grad_norm": 2.1292037963867188, "learning_rate": 3.130019942987114e-05, "loss": 0.6757, "num_input_tokens_seen": 37184104, "step": 64055 }, { "epoch": 9.541257074769138, "grad_norm": 0.9938913583755493, "learning_rate": 3.1297054829774505e-05, "loss": 0.64, "num_input_tokens_seen": 37186792, "step": 64060 }, { "epoch": 9.542001787310099, "grad_norm": 2.61785888671875, "learning_rate": 3.1293910123295535e-05, "loss": 0.6142, "num_input_tokens_seen": 37189704, "step": 64065 }, { "epoch": 9.542746499851058, "grad_norm": 1.6717108488082886, "learning_rate": 3.1290765310487346e-05, "loss": 0.71, "num_input_tokens_seen": 37192712, "step": 64070 }, { "epoch": 9.543491212392016, "grad_norm": 1.124265193939209, "learning_rate": 3.1287620391403086e-05, "loss": 0.7083, "num_input_tokens_seen": 37195656, "step": 64075 }, { "epoch": 9.544235924932975, "grad_norm": 1.5799658298492432, "learning_rate": 3.128447536609585e-05, "loss": 0.5969, "num_input_tokens_seen": 37198376, "step": 64080 }, { "epoch": 9.544980637473936, "grad_norm": 1.1392971277236938, "learning_rate": 3.12813302346188e-05, "loss": 0.5634, "num_input_tokens_seen": 37201448, "step": 64085 }, { "epoch": 9.545725350014894, "grad_norm": 1.4081212282180786, "learning_rate": 3.127818499702506e-05, "loss": 0.6335, "num_input_tokens_seen": 37204488, "step": 64090 }, { "epoch": 9.546470062555853, "grad_norm": 2.5829102993011475, "learning_rate": 3.127503965336776e-05, "loss": 0.646, "num_input_tokens_seen": 37207624, "step": 64095 }, { "epoch": 9.547214775096812, "grad_norm": 1.10471510887146, "learning_rate": 3.1271894203700045e-05, "loss": 0.7634, "num_input_tokens_seen": 37210664, "step": 64100 }, { "epoch": 9.547959487637772, "grad_norm": 1.5332034826278687, "learning_rate": 3.126874864807505e-05, "loss": 0.5705, "num_input_tokens_seen": 37213608, "step": 64105 }, { "epoch": 9.548704200178731, "grad_norm": 1.2315034866333008, "learning_rate": 3.126560298654593e-05, "loss": 0.6786, "num_input_tokens_seen": 37216584, "step": 64110 }, { "epoch": 9.54944891271969, "grad_norm": 3.1325955390930176, "learning_rate": 3.126245721916581e-05, "loss": 0.8221, "num_input_tokens_seen": 37219368, "step": 64115 }, { "epoch": 9.550193625260649, "grad_norm": 1.4150495529174805, "learning_rate": 3.125931134598783e-05, "loss": 0.5093, "num_input_tokens_seen": 37222280, "step": 64120 }, { "epoch": 9.55093833780161, "grad_norm": 1.4857124090194702, "learning_rate": 3.1256165367065155e-05, "loss": 0.4659, "num_input_tokens_seen": 37224968, "step": 64125 }, { "epoch": 9.551683050342568, "grad_norm": 3.244326591491699, "learning_rate": 3.125301928245092e-05, "loss": 0.4983, "num_input_tokens_seen": 37227848, "step": 64130 }, { "epoch": 9.552427762883527, "grad_norm": 1.3430322408676147, "learning_rate": 3.124987309219828e-05, "loss": 0.6683, "num_input_tokens_seen": 37230568, "step": 64135 }, { "epoch": 9.553172475424486, "grad_norm": 1.2378389835357666, "learning_rate": 3.124672679636039e-05, "loss": 0.5938, "num_input_tokens_seen": 37233512, "step": 64140 }, { "epoch": 9.553917187965446, "grad_norm": 1.1554678678512573, "learning_rate": 3.12435803949904e-05, "loss": 0.4352, "num_input_tokens_seen": 37236296, "step": 64145 }, { "epoch": 9.554661900506405, "grad_norm": 1.139890432357788, "learning_rate": 3.1240433888141466e-05, "loss": 0.6316, "num_input_tokens_seen": 37239336, "step": 64150 }, { "epoch": 9.555406613047364, "grad_norm": 1.4214028120040894, "learning_rate": 3.1237287275866736e-05, "loss": 0.6518, "num_input_tokens_seen": 37242344, "step": 64155 }, { "epoch": 9.556151325588322, "grad_norm": 1.4356194734573364, "learning_rate": 3.123414055821938e-05, "loss": 0.7033, "num_input_tokens_seen": 37244904, "step": 64160 }, { "epoch": 9.556896038129283, "grad_norm": 1.9511903524398804, "learning_rate": 3.1230993735252564e-05, "loss": 0.7405, "num_input_tokens_seen": 37247816, "step": 64165 }, { "epoch": 9.557640750670242, "grad_norm": 1.1208827495574951, "learning_rate": 3.1227846807019435e-05, "loss": 0.6965, "num_input_tokens_seen": 37250568, "step": 64170 }, { "epoch": 9.5583854632112, "grad_norm": 0.7921313047409058, "learning_rate": 3.1224699773573164e-05, "loss": 0.4753, "num_input_tokens_seen": 37253512, "step": 64175 }, { "epoch": 9.55913017575216, "grad_norm": 1.69418203830719, "learning_rate": 3.1221552634966914e-05, "loss": 0.5798, "num_input_tokens_seen": 37256616, "step": 64180 }, { "epoch": 9.55987488829312, "grad_norm": 2.595029830932617, "learning_rate": 3.1218405391253856e-05, "loss": 0.6715, "num_input_tokens_seen": 37259752, "step": 64185 }, { "epoch": 9.560619600834078, "grad_norm": 1.6693212985992432, "learning_rate": 3.121525804248716e-05, "loss": 0.4694, "num_input_tokens_seen": 37262824, "step": 64190 }, { "epoch": 9.561364313375037, "grad_norm": 2.5357553958892822, "learning_rate": 3.1212110588720004e-05, "loss": 0.4817, "num_input_tokens_seen": 37265544, "step": 64195 }, { "epoch": 9.562109025915996, "grad_norm": 1.0953724384307861, "learning_rate": 3.1208963030005543e-05, "loss": 0.4876, "num_input_tokens_seen": 37268520, "step": 64200 }, { "epoch": 9.562853738456955, "grad_norm": 1.0336050987243652, "learning_rate": 3.120581536639697e-05, "loss": 0.5354, "num_input_tokens_seen": 37271368, "step": 64205 }, { "epoch": 9.563598450997915, "grad_norm": 1.3594859838485718, "learning_rate": 3.120266759794745e-05, "loss": 0.6321, "num_input_tokens_seen": 37274216, "step": 64210 }, { "epoch": 9.564343163538874, "grad_norm": 1.5425782203674316, "learning_rate": 3.119951972471016e-05, "loss": 0.5816, "num_input_tokens_seen": 37277128, "step": 64215 }, { "epoch": 9.565087876079833, "grad_norm": 1.0717254877090454, "learning_rate": 3.119637174673829e-05, "loss": 0.5884, "num_input_tokens_seen": 37279912, "step": 64220 }, { "epoch": 9.565832588620792, "grad_norm": 1.751747727394104, "learning_rate": 3.119322366408501e-05, "loss": 0.4845, "num_input_tokens_seen": 37282632, "step": 64225 }, { "epoch": 9.566577301161752, "grad_norm": 1.0625587701797485, "learning_rate": 3.119007547680353e-05, "loss": 0.5865, "num_input_tokens_seen": 37285416, "step": 64230 }, { "epoch": 9.56732201370271, "grad_norm": 1.1526719331741333, "learning_rate": 3.1186927184947e-05, "loss": 0.5412, "num_input_tokens_seen": 37288200, "step": 64235 }, { "epoch": 9.56806672624367, "grad_norm": 2.241584300994873, "learning_rate": 3.118377878856863e-05, "loss": 0.6164, "num_input_tokens_seen": 37291048, "step": 64240 }, { "epoch": 9.568811438784628, "grad_norm": 0.770027756690979, "learning_rate": 3.1180630287721595e-05, "loss": 0.8338, "num_input_tokens_seen": 37294152, "step": 64245 }, { "epoch": 9.569556151325589, "grad_norm": 1.2813661098480225, "learning_rate": 3.11774816824591e-05, "loss": 0.6919, "num_input_tokens_seen": 37297320, "step": 64250 }, { "epoch": 9.570300863866548, "grad_norm": 1.592895746231079, "learning_rate": 3.1174332972834326e-05, "loss": 0.8174, "num_input_tokens_seen": 37300424, "step": 64255 }, { "epoch": 9.571045576407506, "grad_norm": 0.9147464632987976, "learning_rate": 3.117118415890047e-05, "loss": 0.6684, "num_input_tokens_seen": 37303144, "step": 64260 }, { "epoch": 9.571790288948465, "grad_norm": 1.307836651802063, "learning_rate": 3.116803524071074e-05, "loss": 0.5899, "num_input_tokens_seen": 37305928, "step": 64265 }, { "epoch": 9.572535001489426, "grad_norm": 1.4280624389648438, "learning_rate": 3.116488621831831e-05, "loss": 0.6386, "num_input_tokens_seen": 37308904, "step": 64270 }, { "epoch": 9.573279714030384, "grad_norm": 1.4388401508331299, "learning_rate": 3.1161737091776404e-05, "loss": 0.3901, "num_input_tokens_seen": 37311688, "step": 64275 }, { "epoch": 9.574024426571343, "grad_norm": 1.1751987934112549, "learning_rate": 3.115858786113821e-05, "loss": 0.6103, "num_input_tokens_seen": 37314984, "step": 64280 }, { "epoch": 9.574769139112302, "grad_norm": 2.098362445831299, "learning_rate": 3.115543852645693e-05, "loss": 0.6984, "num_input_tokens_seen": 37317864, "step": 64285 }, { "epoch": 9.575513851653263, "grad_norm": 2.1553092002868652, "learning_rate": 3.1152289087785776e-05, "loss": 0.8557, "num_input_tokens_seen": 37321096, "step": 64290 }, { "epoch": 9.576258564194221, "grad_norm": 1.072889804840088, "learning_rate": 3.114913954517794e-05, "loss": 0.6792, "num_input_tokens_seen": 37323784, "step": 64295 }, { "epoch": 9.57700327673518, "grad_norm": 2.0614161491394043, "learning_rate": 3.1145989898686656e-05, "loss": 0.7168, "num_input_tokens_seen": 37326632, "step": 64300 }, { "epoch": 9.577747989276139, "grad_norm": 1.148744821548462, "learning_rate": 3.114284014836512e-05, "loss": 0.5587, "num_input_tokens_seen": 37329768, "step": 64305 }, { "epoch": 9.5784927018171, "grad_norm": 1.114676594734192, "learning_rate": 3.1139690294266526e-05, "loss": 0.5843, "num_input_tokens_seen": 37332488, "step": 64310 }, { "epoch": 9.579237414358058, "grad_norm": 2.2160754203796387, "learning_rate": 3.1136540336444114e-05, "loss": 0.6181, "num_input_tokens_seen": 37335304, "step": 64315 }, { "epoch": 9.579982126899017, "grad_norm": 1.6198325157165527, "learning_rate": 3.11333902749511e-05, "loss": 0.7364, "num_input_tokens_seen": 37338312, "step": 64320 }, { "epoch": 9.580726839439976, "grad_norm": 1.2224161624908447, "learning_rate": 3.1130240109840676e-05, "loss": 0.6189, "num_input_tokens_seen": 37341256, "step": 64325 }, { "epoch": 9.581471551980936, "grad_norm": 1.903407335281372, "learning_rate": 3.112708984116608e-05, "loss": 0.6695, "num_input_tokens_seen": 37343816, "step": 64330 }, { "epoch": 9.582216264521895, "grad_norm": 1.134731650352478, "learning_rate": 3.1123939468980535e-05, "loss": 0.6312, "num_input_tokens_seen": 37346888, "step": 64335 }, { "epoch": 9.582960977062854, "grad_norm": 2.0860989093780518, "learning_rate": 3.1120788993337244e-05, "loss": 0.7101, "num_input_tokens_seen": 37349704, "step": 64340 }, { "epoch": 9.583705689603812, "grad_norm": 1.662068486213684, "learning_rate": 3.111763841428945e-05, "loss": 0.5038, "num_input_tokens_seen": 37353000, "step": 64345 }, { "epoch": 9.584450402144771, "grad_norm": 2.082254648208618, "learning_rate": 3.111448773189037e-05, "loss": 0.5715, "num_input_tokens_seen": 37355912, "step": 64350 }, { "epoch": 9.585195114685732, "grad_norm": 1.4242918491363525, "learning_rate": 3.1111336946193244e-05, "loss": 0.6013, "num_input_tokens_seen": 37359016, "step": 64355 }, { "epoch": 9.58593982722669, "grad_norm": 1.4052823781967163, "learning_rate": 3.1108186057251285e-05, "loss": 0.591, "num_input_tokens_seen": 37361640, "step": 64360 }, { "epoch": 9.58668453976765, "grad_norm": 0.9699105620384216, "learning_rate": 3.1105035065117735e-05, "loss": 0.6216, "num_input_tokens_seen": 37364840, "step": 64365 }, { "epoch": 9.58742925230861, "grad_norm": 1.725191593170166, "learning_rate": 3.110188396984582e-05, "loss": 0.5502, "num_input_tokens_seen": 37367656, "step": 64370 }, { "epoch": 9.588173964849569, "grad_norm": 1.4678586721420288, "learning_rate": 3.109873277148878e-05, "loss": 0.6635, "num_input_tokens_seen": 37370824, "step": 64375 }, { "epoch": 9.588918677390527, "grad_norm": 1.1838188171386719, "learning_rate": 3.109558147009984e-05, "loss": 0.6595, "num_input_tokens_seen": 37373960, "step": 64380 }, { "epoch": 9.589663389931486, "grad_norm": 2.0044453144073486, "learning_rate": 3.1092430065732246e-05, "loss": 0.7169, "num_input_tokens_seen": 37376584, "step": 64385 }, { "epoch": 9.590408102472445, "grad_norm": 1.174136996269226, "learning_rate": 3.108927855843924e-05, "loss": 0.5968, "num_input_tokens_seen": 37379304, "step": 64390 }, { "epoch": 9.591152815013405, "grad_norm": 1.2246896028518677, "learning_rate": 3.108612694827407e-05, "loss": 0.596, "num_input_tokens_seen": 37382280, "step": 64395 }, { "epoch": 9.591897527554364, "grad_norm": 1.8664219379425049, "learning_rate": 3.108297523528997e-05, "loss": 0.543, "num_input_tokens_seen": 37384904, "step": 64400 }, { "epoch": 9.592642240095323, "grad_norm": 1.1298840045928955, "learning_rate": 3.107982341954018e-05, "loss": 0.4763, "num_input_tokens_seen": 37387816, "step": 64405 }, { "epoch": 9.593386952636282, "grad_norm": 1.0629938840866089, "learning_rate": 3.1076671501077946e-05, "loss": 0.4972, "num_input_tokens_seen": 37390696, "step": 64410 }, { "epoch": 9.594131665177242, "grad_norm": 1.2394379377365112, "learning_rate": 3.1073519479956534e-05, "loss": 0.5704, "num_input_tokens_seen": 37393576, "step": 64415 }, { "epoch": 9.594876377718201, "grad_norm": 1.6527773141860962, "learning_rate": 3.107036735622918e-05, "loss": 0.7124, "num_input_tokens_seen": 37396360, "step": 64420 }, { "epoch": 9.59562109025916, "grad_norm": 1.7234928607940674, "learning_rate": 3.106721512994913e-05, "loss": 0.6754, "num_input_tokens_seen": 37399240, "step": 64425 }, { "epoch": 9.596365802800118, "grad_norm": 0.9429468512535095, "learning_rate": 3.1064062801169666e-05, "loss": 0.6373, "num_input_tokens_seen": 37402312, "step": 64430 }, { "epoch": 9.597110515341079, "grad_norm": 0.9441685080528259, "learning_rate": 3.106091036994401e-05, "loss": 0.526, "num_input_tokens_seen": 37405032, "step": 64435 }, { "epoch": 9.597855227882038, "grad_norm": 1.1106233596801758, "learning_rate": 3.105775783632544e-05, "loss": 0.5527, "num_input_tokens_seen": 37407912, "step": 64440 }, { "epoch": 9.598599940422996, "grad_norm": 0.8978886604309082, "learning_rate": 3.10546052003672e-05, "loss": 0.5311, "num_input_tokens_seen": 37410920, "step": 64445 }, { "epoch": 9.599344652963955, "grad_norm": 1.507659912109375, "learning_rate": 3.105145246212257e-05, "loss": 0.6494, "num_input_tokens_seen": 37413704, "step": 64450 }, { "epoch": 9.600089365504916, "grad_norm": 1.285992980003357, "learning_rate": 3.1048299621644794e-05, "loss": 0.6767, "num_input_tokens_seen": 37416808, "step": 64455 }, { "epoch": 9.600834078045875, "grad_norm": 1.144917607307434, "learning_rate": 3.1045146678987144e-05, "loss": 0.699, "num_input_tokens_seen": 37419656, "step": 64460 }, { "epoch": 9.601578790586833, "grad_norm": 1.3477684259414673, "learning_rate": 3.104199363420289e-05, "loss": 0.5366, "num_input_tokens_seen": 37422728, "step": 64465 }, { "epoch": 9.602323503127792, "grad_norm": 1.1845017671585083, "learning_rate": 3.1038840487345286e-05, "loss": 0.4549, "num_input_tokens_seen": 37425384, "step": 64470 }, { "epoch": 9.603068215668753, "grad_norm": 1.5185238122940063, "learning_rate": 3.103568723846761e-05, "loss": 0.5822, "num_input_tokens_seen": 37428232, "step": 64475 }, { "epoch": 9.603812928209711, "grad_norm": 0.8872346878051758, "learning_rate": 3.103253388762314e-05, "loss": 0.5411, "num_input_tokens_seen": 37430952, "step": 64480 }, { "epoch": 9.60455764075067, "grad_norm": 2.4600603580474854, "learning_rate": 3.102938043486513e-05, "loss": 0.5548, "num_input_tokens_seen": 37433768, "step": 64485 }, { "epoch": 9.605302353291629, "grad_norm": 2.9468986988067627, "learning_rate": 3.102622688024688e-05, "loss": 0.546, "num_input_tokens_seen": 37436520, "step": 64490 }, { "epoch": 9.60604706583259, "grad_norm": 1.4564955234527588, "learning_rate": 3.1023073223821643e-05, "loss": 0.7009, "num_input_tokens_seen": 37439208, "step": 64495 }, { "epoch": 9.606791778373548, "grad_norm": 1.310738205909729, "learning_rate": 3.101991946564271e-05, "loss": 0.4251, "num_input_tokens_seen": 37441960, "step": 64500 }, { "epoch": 9.607536490914507, "grad_norm": 2.2004199028015137, "learning_rate": 3.1016765605763346e-05, "loss": 0.5943, "num_input_tokens_seen": 37445096, "step": 64505 }, { "epoch": 9.608281203455466, "grad_norm": 1.2181397676467896, "learning_rate": 3.101361164423685e-05, "loss": 0.5803, "num_input_tokens_seen": 37448008, "step": 64510 }, { "epoch": 9.609025915996426, "grad_norm": 0.9592545032501221, "learning_rate": 3.1010457581116494e-05, "loss": 0.5676, "num_input_tokens_seen": 37451016, "step": 64515 }, { "epoch": 9.609770628537385, "grad_norm": 1.4898134469985962, "learning_rate": 3.100730341645557e-05, "loss": 0.6219, "num_input_tokens_seen": 37453864, "step": 64520 }, { "epoch": 9.610515341078344, "grad_norm": 1.3849717378616333, "learning_rate": 3.100414915030736e-05, "loss": 0.827, "num_input_tokens_seen": 37456456, "step": 64525 }, { "epoch": 9.611260053619302, "grad_norm": 1.5729238986968994, "learning_rate": 3.100099478272515e-05, "loss": 0.6155, "num_input_tokens_seen": 37459208, "step": 64530 }, { "epoch": 9.612004766160261, "grad_norm": 0.7384198307991028, "learning_rate": 3.099784031376224e-05, "loss": 0.5177, "num_input_tokens_seen": 37462024, "step": 64535 }, { "epoch": 9.612749478701222, "grad_norm": 1.448370337486267, "learning_rate": 3.09946857434719e-05, "loss": 0.7943, "num_input_tokens_seen": 37464936, "step": 64540 }, { "epoch": 9.61349419124218, "grad_norm": 0.6875277161598206, "learning_rate": 3.099153107190744e-05, "loss": 0.5102, "num_input_tokens_seen": 37467816, "step": 64545 }, { "epoch": 9.61423890378314, "grad_norm": 1.1549606323242188, "learning_rate": 3.0988376299122154e-05, "loss": 0.5906, "num_input_tokens_seen": 37470792, "step": 64550 }, { "epoch": 9.6149836163241, "grad_norm": 1.552667260169983, "learning_rate": 3.098522142516934e-05, "loss": 0.6623, "num_input_tokens_seen": 37473704, "step": 64555 }, { "epoch": 9.615728328865059, "grad_norm": 1.315064787864685, "learning_rate": 3.09820664501023e-05, "loss": 0.6836, "num_input_tokens_seen": 37476392, "step": 64560 }, { "epoch": 9.616473041406017, "grad_norm": 2.108584403991699, "learning_rate": 3.097891137397432e-05, "loss": 0.7356, "num_input_tokens_seen": 37479496, "step": 64565 }, { "epoch": 9.617217753946976, "grad_norm": 1.431195616722107, "learning_rate": 3.097575619683871e-05, "loss": 0.6168, "num_input_tokens_seen": 37482248, "step": 64570 }, { "epoch": 9.617962466487935, "grad_norm": 1.2302649021148682, "learning_rate": 3.097260091874877e-05, "loss": 0.5967, "num_input_tokens_seen": 37485064, "step": 64575 }, { "epoch": 9.618707179028895, "grad_norm": 1.2844319343566895, "learning_rate": 3.0969445539757805e-05, "loss": 0.6252, "num_input_tokens_seen": 37488136, "step": 64580 }, { "epoch": 9.619451891569854, "grad_norm": 1.1269316673278809, "learning_rate": 3.0966290059919126e-05, "loss": 0.6593, "num_input_tokens_seen": 37491144, "step": 64585 }, { "epoch": 9.620196604110813, "grad_norm": 0.9757700562477112, "learning_rate": 3.096313447928604e-05, "loss": 0.6218, "num_input_tokens_seen": 37493864, "step": 64590 }, { "epoch": 9.620941316651772, "grad_norm": 1.6500389575958252, "learning_rate": 3.095997879791187e-05, "loss": 0.6789, "num_input_tokens_seen": 37496648, "step": 64595 }, { "epoch": 9.621686029192732, "grad_norm": 1.812293291091919, "learning_rate": 3.095682301584991e-05, "loss": 0.8052, "num_input_tokens_seen": 37499528, "step": 64600 }, { "epoch": 9.622430741733691, "grad_norm": 1.2943941354751587, "learning_rate": 3.095366713315347e-05, "loss": 0.6513, "num_input_tokens_seen": 37502152, "step": 64605 }, { "epoch": 9.62317545427465, "grad_norm": 2.7539079189300537, "learning_rate": 3.095051114987588e-05, "loss": 0.5029, "num_input_tokens_seen": 37504968, "step": 64610 }, { "epoch": 9.623920166815608, "grad_norm": 1.7655967473983765, "learning_rate": 3.094735506607045e-05, "loss": 0.7387, "num_input_tokens_seen": 37507848, "step": 64615 }, { "epoch": 9.624664879356569, "grad_norm": 0.8456203937530518, "learning_rate": 3.09441988817905e-05, "loss": 0.5733, "num_input_tokens_seen": 37510888, "step": 64620 }, { "epoch": 9.625409591897528, "grad_norm": 1.2488858699798584, "learning_rate": 3.0941042597089356e-05, "loss": 0.6915, "num_input_tokens_seen": 37513768, "step": 64625 }, { "epoch": 9.626154304438487, "grad_norm": 0.9176185131072998, "learning_rate": 3.093788621202033e-05, "loss": 0.6623, "num_input_tokens_seen": 37516552, "step": 64630 }, { "epoch": 9.626899016979445, "grad_norm": 1.1362074613571167, "learning_rate": 3.0934729726636755e-05, "loss": 0.7279, "num_input_tokens_seen": 37519528, "step": 64635 }, { "epoch": 9.627643729520406, "grad_norm": 1.5797080993652344, "learning_rate": 3.093157314099196e-05, "loss": 0.4673, "num_input_tokens_seen": 37522440, "step": 64640 }, { "epoch": 9.628388442061365, "grad_norm": 0.7979972958564758, "learning_rate": 3.092841645513925e-05, "loss": 0.4474, "num_input_tokens_seen": 37525128, "step": 64645 }, { "epoch": 9.629133154602323, "grad_norm": 1.023781180381775, "learning_rate": 3.092525966913198e-05, "loss": 0.6755, "num_input_tokens_seen": 37527848, "step": 64650 }, { "epoch": 9.629877867143282, "grad_norm": 2.328941822052002, "learning_rate": 3.0922102783023466e-05, "loss": 0.5692, "num_input_tokens_seen": 37531112, "step": 64655 }, { "epoch": 9.630622579684243, "grad_norm": 1.3349460363388062, "learning_rate": 3.0918945796867044e-05, "loss": 0.6212, "num_input_tokens_seen": 37533768, "step": 64660 }, { "epoch": 9.631367292225201, "grad_norm": 1.9774000644683838, "learning_rate": 3.091578871071605e-05, "loss": 0.6756, "num_input_tokens_seen": 37536584, "step": 64665 }, { "epoch": 9.63211200476616, "grad_norm": 1.590657353401184, "learning_rate": 3.0912631524623826e-05, "loss": 0.6337, "num_input_tokens_seen": 37539592, "step": 64670 }, { "epoch": 9.632856717307119, "grad_norm": 0.9520993232727051, "learning_rate": 3.0909474238643694e-05, "loss": 0.678, "num_input_tokens_seen": 37542440, "step": 64675 }, { "epoch": 9.63360142984808, "grad_norm": 1.5900205373764038, "learning_rate": 3.090631685282901e-05, "loss": 0.6623, "num_input_tokens_seen": 37545000, "step": 64680 }, { "epoch": 9.634346142389038, "grad_norm": 1.2166599035263062, "learning_rate": 3.0903159367233086e-05, "loss": 0.5971, "num_input_tokens_seen": 37547752, "step": 64685 }, { "epoch": 9.635090854929997, "grad_norm": 0.9194265604019165, "learning_rate": 3.09000017819093e-05, "loss": 0.6973, "num_input_tokens_seen": 37550504, "step": 64690 }, { "epoch": 9.635835567470956, "grad_norm": 1.4577968120574951, "learning_rate": 3.089684409691097e-05, "loss": 0.5182, "num_input_tokens_seen": 37553576, "step": 64695 }, { "epoch": 9.636580280011916, "grad_norm": 1.7013969421386719, "learning_rate": 3.0893686312291466e-05, "loss": 0.7375, "num_input_tokens_seen": 37556584, "step": 64700 }, { "epoch": 9.637324992552875, "grad_norm": 1.037913203239441, "learning_rate": 3.089052842810411e-05, "loss": 0.6558, "num_input_tokens_seen": 37559496, "step": 64705 }, { "epoch": 9.638069705093834, "grad_norm": 1.160749912261963, "learning_rate": 3.088737044440226e-05, "loss": 0.6496, "num_input_tokens_seen": 37562696, "step": 64710 }, { "epoch": 9.638814417634793, "grad_norm": 1.6508139371871948, "learning_rate": 3.088421236123928e-05, "loss": 0.5198, "num_input_tokens_seen": 37565640, "step": 64715 }, { "epoch": 9.639559130175751, "grad_norm": 1.4670549631118774, "learning_rate": 3.0881054178668514e-05, "loss": 0.5845, "num_input_tokens_seen": 37568552, "step": 64720 }, { "epoch": 9.640303842716712, "grad_norm": 1.7075105905532837, "learning_rate": 3.087789589674331e-05, "loss": 0.5841, "num_input_tokens_seen": 37571688, "step": 64725 }, { "epoch": 9.64104855525767, "grad_norm": 1.3743764162063599, "learning_rate": 3.087473751551703e-05, "loss": 0.6827, "num_input_tokens_seen": 37574408, "step": 64730 }, { "epoch": 9.64179326779863, "grad_norm": 2.0356295108795166, "learning_rate": 3.087157903504303e-05, "loss": 0.548, "num_input_tokens_seen": 37577384, "step": 64735 }, { "epoch": 9.642537980339588, "grad_norm": 1.1837726831436157, "learning_rate": 3.086842045537467e-05, "loss": 0.604, "num_input_tokens_seen": 37580360, "step": 64740 }, { "epoch": 9.643282692880549, "grad_norm": 1.6174198389053345, "learning_rate": 3.0865261776565306e-05, "loss": 0.5899, "num_input_tokens_seen": 37583080, "step": 64745 }, { "epoch": 9.644027405421507, "grad_norm": 1.2881790399551392, "learning_rate": 3.0862102998668314e-05, "loss": 0.5914, "num_input_tokens_seen": 37585896, "step": 64750 }, { "epoch": 9.644772117962466, "grad_norm": 1.067039966583252, "learning_rate": 3.085894412173704e-05, "loss": 0.7268, "num_input_tokens_seen": 37588712, "step": 64755 }, { "epoch": 9.645516830503425, "grad_norm": 1.5694785118103027, "learning_rate": 3.085578514582487e-05, "loss": 0.6614, "num_input_tokens_seen": 37591752, "step": 64760 }, { "epoch": 9.646261543044385, "grad_norm": 0.8259007930755615, "learning_rate": 3.0852626070985164e-05, "loss": 0.5023, "num_input_tokens_seen": 37594440, "step": 64765 }, { "epoch": 9.647006255585344, "grad_norm": 1.3908137083053589, "learning_rate": 3.084946689727128e-05, "loss": 0.4626, "num_input_tokens_seen": 37597576, "step": 64770 }, { "epoch": 9.647750968126303, "grad_norm": 1.5050580501556396, "learning_rate": 3.08463076247366e-05, "loss": 0.5263, "num_input_tokens_seen": 37600392, "step": 64775 }, { "epoch": 9.648495680667262, "grad_norm": 1.6791492700576782, "learning_rate": 3.084314825343449e-05, "loss": 0.594, "num_input_tokens_seen": 37603528, "step": 64780 }, { "epoch": 9.649240393208222, "grad_norm": 1.2625770568847656, "learning_rate": 3.083998878341833e-05, "loss": 0.6881, "num_input_tokens_seen": 37606280, "step": 64785 }, { "epoch": 9.649985105749181, "grad_norm": 1.9540852308273315, "learning_rate": 3.0836829214741496e-05, "loss": 0.6046, "num_input_tokens_seen": 37608904, "step": 64790 }, { "epoch": 9.65072981829014, "grad_norm": 1.1986467838287354, "learning_rate": 3.0833669547457375e-05, "loss": 0.6211, "num_input_tokens_seen": 37611944, "step": 64795 }, { "epoch": 9.651474530831099, "grad_norm": 1.2389333248138428, "learning_rate": 3.083050978161933e-05, "loss": 0.6423, "num_input_tokens_seen": 37614792, "step": 64800 }, { "epoch": 9.652219243372059, "grad_norm": 1.1821074485778809, "learning_rate": 3.082734991728075e-05, "loss": 0.5765, "num_input_tokens_seen": 37617512, "step": 64805 }, { "epoch": 9.652963955913018, "grad_norm": 1.5423575639724731, "learning_rate": 3.0824189954495006e-05, "loss": 0.7344, "num_input_tokens_seen": 37620200, "step": 64810 }, { "epoch": 9.653708668453977, "grad_norm": 1.7862366437911987, "learning_rate": 3.08210298933155e-05, "loss": 0.5926, "num_input_tokens_seen": 37622984, "step": 64815 }, { "epoch": 9.654453380994935, "grad_norm": 0.6439999938011169, "learning_rate": 3.081786973379561e-05, "loss": 0.3645, "num_input_tokens_seen": 37625608, "step": 64820 }, { "epoch": 9.655198093535896, "grad_norm": 2.2944953441619873, "learning_rate": 3.081470947598872e-05, "loss": 0.6883, "num_input_tokens_seen": 37628328, "step": 64825 }, { "epoch": 9.655942806076855, "grad_norm": 1.2178090810775757, "learning_rate": 3.081154911994822e-05, "loss": 0.6495, "num_input_tokens_seen": 37631176, "step": 64830 }, { "epoch": 9.656687518617813, "grad_norm": 0.7503260970115662, "learning_rate": 3.080838866572752e-05, "loss": 0.654, "num_input_tokens_seen": 37633864, "step": 64835 }, { "epoch": 9.657432231158772, "grad_norm": 1.3717715740203857, "learning_rate": 3.0805228113379986e-05, "loss": 0.5957, "num_input_tokens_seen": 37636584, "step": 64840 }, { "epoch": 9.658176943699733, "grad_norm": 0.9144487977027893, "learning_rate": 3.080206746295902e-05, "loss": 0.4853, "num_input_tokens_seen": 37640840, "step": 64845 }, { "epoch": 9.658921656240691, "grad_norm": 2.1796133518218994, "learning_rate": 3.079890671451802e-05, "loss": 0.5906, "num_input_tokens_seen": 37643848, "step": 64850 }, { "epoch": 9.65966636878165, "grad_norm": 1.1898099184036255, "learning_rate": 3.079574586811039e-05, "loss": 0.6313, "num_input_tokens_seen": 37646600, "step": 64855 }, { "epoch": 9.660411081322609, "grad_norm": 0.901850700378418, "learning_rate": 3.0792584923789525e-05, "loss": 0.3654, "num_input_tokens_seen": 37649256, "step": 64860 }, { "epoch": 9.66115579386357, "grad_norm": 1.1803827285766602, "learning_rate": 3.078942388160883e-05, "loss": 0.7058, "num_input_tokens_seen": 37652264, "step": 64865 }, { "epoch": 9.661900506404528, "grad_norm": 1.190362811088562, "learning_rate": 3.07862627416217e-05, "loss": 0.6248, "num_input_tokens_seen": 37655400, "step": 64870 }, { "epoch": 9.662645218945487, "grad_norm": 1.1310837268829346, "learning_rate": 3.0783101503881526e-05, "loss": 0.4841, "num_input_tokens_seen": 37658216, "step": 64875 }, { "epoch": 9.663389931486446, "grad_norm": 1.1241575479507446, "learning_rate": 3.0779940168441754e-05, "loss": 0.7186, "num_input_tokens_seen": 37661352, "step": 64880 }, { "epoch": 9.664134644027406, "grad_norm": 1.6709762811660767, "learning_rate": 3.077677873535575e-05, "loss": 0.6638, "num_input_tokens_seen": 37664200, "step": 64885 }, { "epoch": 9.664879356568365, "grad_norm": 1.7454267740249634, "learning_rate": 3.0773617204676946e-05, "loss": 0.7342, "num_input_tokens_seen": 37667176, "step": 64890 }, { "epoch": 9.665624069109324, "grad_norm": 1.3817626237869263, "learning_rate": 3.077045557645875e-05, "loss": 0.748, "num_input_tokens_seen": 37669800, "step": 64895 }, { "epoch": 9.666368781650283, "grad_norm": 1.1210139989852905, "learning_rate": 3.0767293850754566e-05, "loss": 0.6819, "num_input_tokens_seen": 37673128, "step": 64900 }, { "epoch": 9.667113494191241, "grad_norm": 1.0793155431747437, "learning_rate": 3.076413202761782e-05, "loss": 0.5521, "num_input_tokens_seen": 37676136, "step": 64905 }, { "epoch": 9.667858206732202, "grad_norm": 1.4395527839660645, "learning_rate": 3.076097010710192e-05, "loss": 0.7077, "num_input_tokens_seen": 37679016, "step": 64910 }, { "epoch": 9.66860291927316, "grad_norm": 1.2936145067214966, "learning_rate": 3.075780808926028e-05, "loss": 0.5872, "num_input_tokens_seen": 37681992, "step": 64915 }, { "epoch": 9.66934763181412, "grad_norm": 1.5022931098937988, "learning_rate": 3.075464597414632e-05, "loss": 0.5642, "num_input_tokens_seen": 37685192, "step": 64920 }, { "epoch": 9.670092344355078, "grad_norm": 1.7129147052764893, "learning_rate": 3.075148376181348e-05, "loss": 0.6028, "num_input_tokens_seen": 37688040, "step": 64925 }, { "epoch": 9.670837056896039, "grad_norm": 1.206064224243164, "learning_rate": 3.074832145231517e-05, "loss": 0.3941, "num_input_tokens_seen": 37691016, "step": 64930 }, { "epoch": 9.671581769436997, "grad_norm": 1.8083769083023071, "learning_rate": 3.07451590457048e-05, "loss": 0.6177, "num_input_tokens_seen": 37693736, "step": 64935 }, { "epoch": 9.672326481977956, "grad_norm": 1.8597257137298584, "learning_rate": 3.0741996542035804e-05, "loss": 0.7057, "num_input_tokens_seen": 37696808, "step": 64940 }, { "epoch": 9.673071194518915, "grad_norm": 2.6833367347717285, "learning_rate": 3.073883394136162e-05, "loss": 0.5979, "num_input_tokens_seen": 37699944, "step": 64945 }, { "epoch": 9.673815907059875, "grad_norm": 2.329082727432251, "learning_rate": 3.073567124373567e-05, "loss": 0.7916, "num_input_tokens_seen": 37702760, "step": 64950 }, { "epoch": 9.674560619600834, "grad_norm": 0.904526948928833, "learning_rate": 3.0732508449211373e-05, "loss": 0.6613, "num_input_tokens_seen": 37705640, "step": 64955 }, { "epoch": 9.675305332141793, "grad_norm": 1.513417363166809, "learning_rate": 3.0729345557842184e-05, "loss": 0.8378, "num_input_tokens_seen": 37708648, "step": 64960 }, { "epoch": 9.676050044682752, "grad_norm": 1.3269940614700317, "learning_rate": 3.072618256968153e-05, "loss": 0.7345, "num_input_tokens_seen": 37711656, "step": 64965 }, { "epoch": 9.676794757223712, "grad_norm": 1.215582251548767, "learning_rate": 3.072301948478283e-05, "loss": 0.6531, "num_input_tokens_seen": 37714408, "step": 64970 }, { "epoch": 9.677539469764671, "grad_norm": 1.1375423669815063, "learning_rate": 3.0719856303199526e-05, "loss": 0.5147, "num_input_tokens_seen": 37717224, "step": 64975 }, { "epoch": 9.67828418230563, "grad_norm": 0.8905870914459229, "learning_rate": 3.071669302498508e-05, "loss": 0.6831, "num_input_tokens_seen": 37720040, "step": 64980 }, { "epoch": 9.679028894846589, "grad_norm": 1.3833963871002197, "learning_rate": 3.07135296501929e-05, "loss": 0.6522, "num_input_tokens_seen": 37723176, "step": 64985 }, { "epoch": 9.679773607387549, "grad_norm": 1.8093796968460083, "learning_rate": 3.071036617887645e-05, "loss": 0.7437, "num_input_tokens_seen": 37725928, "step": 64990 }, { "epoch": 9.680518319928508, "grad_norm": 1.601519227027893, "learning_rate": 3.070720261108917e-05, "loss": 0.5814, "num_input_tokens_seen": 37728680, "step": 64995 }, { "epoch": 9.681263032469467, "grad_norm": 1.875377893447876, "learning_rate": 3.07040389468845e-05, "loss": 0.5868, "num_input_tokens_seen": 37731912, "step": 65000 }, { "epoch": 9.682007745010425, "grad_norm": 2.8096864223480225, "learning_rate": 3.070087518631589e-05, "loss": 0.6784, "num_input_tokens_seen": 37734728, "step": 65005 }, { "epoch": 9.682752457551386, "grad_norm": 1.273510456085205, "learning_rate": 3.069771132943679e-05, "loss": 0.597, "num_input_tokens_seen": 37737576, "step": 65010 }, { "epoch": 9.683497170092345, "grad_norm": 1.2664566040039062, "learning_rate": 3.069454737630064e-05, "loss": 0.4773, "num_input_tokens_seen": 37740136, "step": 65015 }, { "epoch": 9.684241882633303, "grad_norm": 2.559032917022705, "learning_rate": 3.069138332696091e-05, "loss": 0.6938, "num_input_tokens_seen": 37743048, "step": 65020 }, { "epoch": 9.684986595174262, "grad_norm": 1.614657998085022, "learning_rate": 3.0688219181471036e-05, "loss": 0.7069, "num_input_tokens_seen": 37745864, "step": 65025 }, { "epoch": 9.685731307715223, "grad_norm": 1.4225794076919556, "learning_rate": 3.0685054939884485e-05, "loss": 0.5622, "num_input_tokens_seen": 37748712, "step": 65030 }, { "epoch": 9.686476020256181, "grad_norm": 1.3541240692138672, "learning_rate": 3.0681890602254704e-05, "loss": 0.5037, "num_input_tokens_seen": 37751720, "step": 65035 }, { "epoch": 9.68722073279714, "grad_norm": 1.337674856185913, "learning_rate": 3.067872616863516e-05, "loss": 0.3802, "num_input_tokens_seen": 37754472, "step": 65040 }, { "epoch": 9.687965445338099, "grad_norm": 1.5929052829742432, "learning_rate": 3.0675561639079306e-05, "loss": 0.5658, "num_input_tokens_seen": 37757288, "step": 65045 }, { "epoch": 9.688710157879058, "grad_norm": 1.264399766921997, "learning_rate": 3.0672397013640605e-05, "loss": 0.5512, "num_input_tokens_seen": 37760456, "step": 65050 }, { "epoch": 9.689454870420018, "grad_norm": 2.7243118286132812, "learning_rate": 3.066923229237253e-05, "loss": 0.7571, "num_input_tokens_seen": 37763208, "step": 65055 }, { "epoch": 9.690199582960977, "grad_norm": 1.7508857250213623, "learning_rate": 3.0666067475328534e-05, "loss": 0.6741, "num_input_tokens_seen": 37765896, "step": 65060 }, { "epoch": 9.690944295501936, "grad_norm": 1.2567082643508911, "learning_rate": 3.066290256256208e-05, "loss": 0.5308, "num_input_tokens_seen": 37768840, "step": 65065 }, { "epoch": 9.691689008042896, "grad_norm": 1.4877203702926636, "learning_rate": 3.065973755412665e-05, "loss": 0.698, "num_input_tokens_seen": 37771912, "step": 65070 }, { "epoch": 9.692433720583855, "grad_norm": 1.0993067026138306, "learning_rate": 3.06565724500757e-05, "loss": 0.6939, "num_input_tokens_seen": 37774696, "step": 65075 }, { "epoch": 9.693178433124814, "grad_norm": 1.5161607265472412, "learning_rate": 3.0653407250462716e-05, "loss": 0.4166, "num_input_tokens_seen": 37778088, "step": 65080 }, { "epoch": 9.693923145665773, "grad_norm": 2.760091543197632, "learning_rate": 3.065024195534116e-05, "loss": 0.5652, "num_input_tokens_seen": 37780872, "step": 65085 }, { "epoch": 9.694667858206731, "grad_norm": 1.2902966737747192, "learning_rate": 3.06470765647645e-05, "loss": 0.5442, "num_input_tokens_seen": 37784008, "step": 65090 }, { "epoch": 9.695412570747692, "grad_norm": 1.2406630516052246, "learning_rate": 3.064391107878623e-05, "loss": 0.5525, "num_input_tokens_seen": 37787048, "step": 65095 }, { "epoch": 9.69615728328865, "grad_norm": 2.337066650390625, "learning_rate": 3.064074549745982e-05, "loss": 0.7022, "num_input_tokens_seen": 37790024, "step": 65100 }, { "epoch": 9.69690199582961, "grad_norm": 1.116720199584961, "learning_rate": 3.063757982083874e-05, "loss": 0.5718, "num_input_tokens_seen": 37793224, "step": 65105 }, { "epoch": 9.697646708370568, "grad_norm": 0.8802424073219299, "learning_rate": 3.063441404897648e-05, "loss": 0.6239, "num_input_tokens_seen": 37796168, "step": 65110 }, { "epoch": 9.698391420911529, "grad_norm": 0.6637804508209229, "learning_rate": 3.063124818192652e-05, "loss": 0.6192, "num_input_tokens_seen": 37799080, "step": 65115 }, { "epoch": 9.699136133452487, "grad_norm": 2.2955923080444336, "learning_rate": 3.062808221974235e-05, "loss": 0.6119, "num_input_tokens_seen": 37801704, "step": 65120 }, { "epoch": 9.699880845993446, "grad_norm": 2.299567461013794, "learning_rate": 3.062491616247745e-05, "loss": 0.7788, "num_input_tokens_seen": 37804968, "step": 65125 }, { "epoch": 9.700625558534405, "grad_norm": 1.2395192384719849, "learning_rate": 3.0621750010185316e-05, "loss": 0.6849, "num_input_tokens_seen": 37807880, "step": 65130 }, { "epoch": 9.701370271075366, "grad_norm": 1.1346337795257568, "learning_rate": 3.0618583762919417e-05, "loss": 0.6462, "num_input_tokens_seen": 37811240, "step": 65135 }, { "epoch": 9.702114983616324, "grad_norm": 1.9380247592926025, "learning_rate": 3.0615417420733264e-05, "loss": 0.5078, "num_input_tokens_seen": 37814120, "step": 65140 }, { "epoch": 9.702859696157283, "grad_norm": 1.3407015800476074, "learning_rate": 3.0612250983680336e-05, "loss": 0.6164, "num_input_tokens_seen": 37816968, "step": 65145 }, { "epoch": 9.703604408698242, "grad_norm": 2.3477604389190674, "learning_rate": 3.060908445181413e-05, "loss": 0.639, "num_input_tokens_seen": 37819528, "step": 65150 }, { "epoch": 9.704349121239202, "grad_norm": 1.310502052307129, "learning_rate": 3.060591782518815e-05, "loss": 0.7109, "num_input_tokens_seen": 37822056, "step": 65155 }, { "epoch": 9.705093833780161, "grad_norm": 1.0626775026321411, "learning_rate": 3.060275110385588e-05, "loss": 0.5591, "num_input_tokens_seen": 37825128, "step": 65160 }, { "epoch": 9.70583854632112, "grad_norm": 1.063399314880371, "learning_rate": 3.059958428787083e-05, "loss": 0.5181, "num_input_tokens_seen": 37828200, "step": 65165 }, { "epoch": 9.706583258862079, "grad_norm": 1.1336573362350464, "learning_rate": 3.059641737728649e-05, "loss": 0.5589, "num_input_tokens_seen": 37830728, "step": 65170 }, { "epoch": 9.70732797140304, "grad_norm": 1.1965644359588623, "learning_rate": 3.059325037215637e-05, "loss": 0.6051, "num_input_tokens_seen": 37833864, "step": 65175 }, { "epoch": 9.708072683943998, "grad_norm": 1.6072704792022705, "learning_rate": 3.059008327253396e-05, "loss": 0.6516, "num_input_tokens_seen": 37836776, "step": 65180 }, { "epoch": 9.708817396484957, "grad_norm": 1.04733407497406, "learning_rate": 3.0586916078472785e-05, "loss": 0.5986, "num_input_tokens_seen": 37839560, "step": 65185 }, { "epoch": 9.709562109025915, "grad_norm": 1.2529608011245728, "learning_rate": 3.058374879002634e-05, "loss": 0.5238, "num_input_tokens_seen": 37842600, "step": 65190 }, { "epoch": 9.710306821566876, "grad_norm": 1.0124859809875488, "learning_rate": 3.0580581407248126e-05, "loss": 0.5212, "num_input_tokens_seen": 37845288, "step": 65195 }, { "epoch": 9.711051534107835, "grad_norm": 1.328338623046875, "learning_rate": 3.0577413930191666e-05, "loss": 0.6586, "num_input_tokens_seen": 37848232, "step": 65200 }, { "epoch": 9.711796246648793, "grad_norm": 1.2526127099990845, "learning_rate": 3.0574246358910474e-05, "loss": 0.5573, "num_input_tokens_seen": 37851048, "step": 65205 }, { "epoch": 9.712540959189752, "grad_norm": 1.131838321685791, "learning_rate": 3.057107869345804e-05, "loss": 0.6282, "num_input_tokens_seen": 37853928, "step": 65210 }, { "epoch": 9.713285671730713, "grad_norm": 1.1761103868484497, "learning_rate": 3.0567910933887905e-05, "loss": 0.6951, "num_input_tokens_seen": 37856840, "step": 65215 }, { "epoch": 9.714030384271672, "grad_norm": 2.7020175457000732, "learning_rate": 3.056474308025357e-05, "loss": 0.7509, "num_input_tokens_seen": 37859752, "step": 65220 }, { "epoch": 9.71477509681263, "grad_norm": 1.1301225423812866, "learning_rate": 3.056157513260856e-05, "loss": 0.4957, "num_input_tokens_seen": 37862440, "step": 65225 }, { "epoch": 9.715519809353589, "grad_norm": 0.9571013450622559, "learning_rate": 3.055840709100639e-05, "loss": 0.5369, "num_input_tokens_seen": 37865384, "step": 65230 }, { "epoch": 9.716264521894548, "grad_norm": 1.5788743495941162, "learning_rate": 3.055523895550058e-05, "loss": 0.6079, "num_input_tokens_seen": 37868168, "step": 65235 }, { "epoch": 9.717009234435508, "grad_norm": 0.8723615407943726, "learning_rate": 3.055207072614465e-05, "loss": 0.6189, "num_input_tokens_seen": 37871112, "step": 65240 }, { "epoch": 9.717753946976467, "grad_norm": 1.9199756383895874, "learning_rate": 3.0548902402992134e-05, "loss": 0.5147, "num_input_tokens_seen": 37874024, "step": 65245 }, { "epoch": 9.718498659517426, "grad_norm": 2.5695295333862305, "learning_rate": 3.0545733986096545e-05, "loss": 0.6379, "num_input_tokens_seen": 37877096, "step": 65250 }, { "epoch": 9.719243372058386, "grad_norm": 1.9570424556732178, "learning_rate": 3.054256547551142e-05, "loss": 0.6908, "num_input_tokens_seen": 37880104, "step": 65255 }, { "epoch": 9.719988084599345, "grad_norm": 1.0075486898422241, "learning_rate": 3.0539396871290294e-05, "loss": 0.6373, "num_input_tokens_seen": 37883144, "step": 65260 }, { "epoch": 9.720732797140304, "grad_norm": 0.8581801056861877, "learning_rate": 3.053622817348668e-05, "loss": 0.4782, "num_input_tokens_seen": 37886056, "step": 65265 }, { "epoch": 9.721477509681263, "grad_norm": 1.4873641729354858, "learning_rate": 3.053305938215411e-05, "loss": 0.6954, "num_input_tokens_seen": 37889096, "step": 65270 }, { "epoch": 9.722222222222221, "grad_norm": 2.4591662883758545, "learning_rate": 3.052989049734613e-05, "loss": 0.673, "num_input_tokens_seen": 37891912, "step": 65275 }, { "epoch": 9.722966934763182, "grad_norm": 1.5951285362243652, "learning_rate": 3.052672151911627e-05, "loss": 0.592, "num_input_tokens_seen": 37894504, "step": 65280 }, { "epoch": 9.72371164730414, "grad_norm": 1.2585827112197876, "learning_rate": 3.052355244751807e-05, "loss": 0.5769, "num_input_tokens_seen": 37897416, "step": 65285 }, { "epoch": 9.7244563598451, "grad_norm": 1.3386796712875366, "learning_rate": 3.052038328260507e-05, "loss": 0.6617, "num_input_tokens_seen": 37900552, "step": 65290 }, { "epoch": 9.725201072386058, "grad_norm": 1.2916518449783325, "learning_rate": 3.05172140244308e-05, "loss": 0.591, "num_input_tokens_seen": 37903624, "step": 65295 }, { "epoch": 9.725945784927019, "grad_norm": 1.820996880531311, "learning_rate": 3.051404467304881e-05, "loss": 0.7038, "num_input_tokens_seen": 37906664, "step": 65300 }, { "epoch": 9.726690497467978, "grad_norm": 1.2110445499420166, "learning_rate": 3.051087522851263e-05, "loss": 0.4873, "num_input_tokens_seen": 37909384, "step": 65305 }, { "epoch": 9.727435210008936, "grad_norm": 3.1477291584014893, "learning_rate": 3.050770569087582e-05, "loss": 0.7247, "num_input_tokens_seen": 37912616, "step": 65310 }, { "epoch": 9.728179922549895, "grad_norm": 1.548972487449646, "learning_rate": 3.0504536060191917e-05, "loss": 0.7865, "num_input_tokens_seen": 37915432, "step": 65315 }, { "epoch": 9.728924635090856, "grad_norm": 1.3865087032318115, "learning_rate": 3.0501366336514477e-05, "loss": 0.5814, "num_input_tokens_seen": 37918312, "step": 65320 }, { "epoch": 9.729669347631814, "grad_norm": 0.9975741505622864, "learning_rate": 3.0498196519897044e-05, "loss": 0.4538, "num_input_tokens_seen": 37921224, "step": 65325 }, { "epoch": 9.730414060172773, "grad_norm": 1.70147705078125, "learning_rate": 3.0495026610393168e-05, "loss": 0.5959, "num_input_tokens_seen": 37923944, "step": 65330 }, { "epoch": 9.731158772713732, "grad_norm": 1.8797162771224976, "learning_rate": 3.0491856608056403e-05, "loss": 0.7916, "num_input_tokens_seen": 37926824, "step": 65335 }, { "epoch": 9.731903485254692, "grad_norm": 1.3049426078796387, "learning_rate": 3.0488686512940297e-05, "loss": 0.6171, "num_input_tokens_seen": 37929640, "step": 65340 }, { "epoch": 9.732648197795651, "grad_norm": 1.3659932613372803, "learning_rate": 3.0485516325098413e-05, "loss": 0.6273, "num_input_tokens_seen": 37932328, "step": 65345 }, { "epoch": 9.73339291033661, "grad_norm": 1.3684879541397095, "learning_rate": 3.0482346044584305e-05, "loss": 0.8008, "num_input_tokens_seen": 37935432, "step": 65350 }, { "epoch": 9.734137622877569, "grad_norm": 2.5540964603424072, "learning_rate": 3.047917567145153e-05, "loss": 0.7466, "num_input_tokens_seen": 37938312, "step": 65355 }, { "epoch": 9.73488233541853, "grad_norm": 1.1570309400558472, "learning_rate": 3.0476005205753666e-05, "loss": 0.6869, "num_input_tokens_seen": 37940904, "step": 65360 }, { "epoch": 9.735627047959488, "grad_norm": 1.5794987678527832, "learning_rate": 3.047283464754425e-05, "loss": 0.6131, "num_input_tokens_seen": 37943560, "step": 65365 }, { "epoch": 9.736371760500447, "grad_norm": 2.185128688812256, "learning_rate": 3.0469663996876853e-05, "loss": 0.6234, "num_input_tokens_seen": 37946184, "step": 65370 }, { "epoch": 9.737116473041405, "grad_norm": 3.4900567531585693, "learning_rate": 3.046649325380504e-05, "loss": 0.6707, "num_input_tokens_seen": 37949032, "step": 65375 }, { "epoch": 9.737861185582366, "grad_norm": 1.2200400829315186, "learning_rate": 3.0463322418382384e-05, "loss": 0.6543, "num_input_tokens_seen": 37952072, "step": 65380 }, { "epoch": 9.738605898123325, "grad_norm": 1.1034797430038452, "learning_rate": 3.0460151490662442e-05, "loss": 0.6383, "num_input_tokens_seen": 37954920, "step": 65385 }, { "epoch": 9.739350610664284, "grad_norm": 1.0631932020187378, "learning_rate": 3.0456980470698803e-05, "loss": 0.576, "num_input_tokens_seen": 37957768, "step": 65390 }, { "epoch": 9.740095323205242, "grad_norm": 1.5770814418792725, "learning_rate": 3.0453809358545016e-05, "loss": 0.6792, "num_input_tokens_seen": 37960360, "step": 65395 }, { "epoch": 9.740840035746203, "grad_norm": 1.4687350988388062, "learning_rate": 3.0450638154254664e-05, "loss": 0.6921, "num_input_tokens_seen": 37962984, "step": 65400 }, { "epoch": 9.741584748287162, "grad_norm": 2.808259963989258, "learning_rate": 3.0447466857881318e-05, "loss": 0.6109, "num_input_tokens_seen": 37965672, "step": 65405 }, { "epoch": 9.74232946082812, "grad_norm": 1.8596309423446655, "learning_rate": 3.0444295469478557e-05, "loss": 0.7648, "num_input_tokens_seen": 37968584, "step": 65410 }, { "epoch": 9.743074173369079, "grad_norm": 1.4820895195007324, "learning_rate": 3.0441123989099958e-05, "loss": 0.6057, "num_input_tokens_seen": 37971464, "step": 65415 }, { "epoch": 9.743818885910038, "grad_norm": 0.8711822032928467, "learning_rate": 3.0437952416799097e-05, "loss": 0.5683, "num_input_tokens_seen": 37974632, "step": 65420 }, { "epoch": 9.744563598450998, "grad_norm": 1.0345454216003418, "learning_rate": 3.0434780752629567e-05, "loss": 0.692, "num_input_tokens_seen": 37977512, "step": 65425 }, { "epoch": 9.745308310991957, "grad_norm": 1.8189771175384521, "learning_rate": 3.043160899664493e-05, "loss": 0.53, "num_input_tokens_seen": 37980232, "step": 65430 }, { "epoch": 9.746053023532916, "grad_norm": 1.0973386764526367, "learning_rate": 3.042843714889878e-05, "loss": 0.6735, "num_input_tokens_seen": 37983208, "step": 65435 }, { "epoch": 9.746797736073875, "grad_norm": 1.2065222263336182, "learning_rate": 3.0425265209444704e-05, "loss": 0.5389, "num_input_tokens_seen": 37986056, "step": 65440 }, { "epoch": 9.747542448614835, "grad_norm": 1.4161790609359741, "learning_rate": 3.0422093178336287e-05, "loss": 0.6361, "num_input_tokens_seen": 37989256, "step": 65445 }, { "epoch": 9.748287161155794, "grad_norm": 1.4686908721923828, "learning_rate": 3.0418921055627115e-05, "loss": 0.6104, "num_input_tokens_seen": 37992168, "step": 65450 }, { "epoch": 9.749031873696753, "grad_norm": 1.6092842817306519, "learning_rate": 3.0415748841370778e-05, "loss": 0.5668, "num_input_tokens_seen": 37995272, "step": 65455 }, { "epoch": 9.749776586237711, "grad_norm": 1.2806305885314941, "learning_rate": 3.0412576535620873e-05, "loss": 0.7825, "num_input_tokens_seen": 37998184, "step": 65460 }, { "epoch": 9.750521298778672, "grad_norm": 1.9950580596923828, "learning_rate": 3.0409404138430986e-05, "loss": 0.4225, "num_input_tokens_seen": 38001192, "step": 65465 }, { "epoch": 9.75126601131963, "grad_norm": 0.6569063663482666, "learning_rate": 3.040623164985471e-05, "loss": 0.529, "num_input_tokens_seen": 38003912, "step": 65470 }, { "epoch": 9.75201072386059, "grad_norm": 2.017909049987793, "learning_rate": 3.040305906994565e-05, "loss": 0.758, "num_input_tokens_seen": 38006920, "step": 65475 }, { "epoch": 9.752755436401548, "grad_norm": 1.661419153213501, "learning_rate": 3.0399886398757394e-05, "loss": 0.5575, "num_input_tokens_seen": 38009576, "step": 65480 }, { "epoch": 9.753500148942509, "grad_norm": 1.4156817197799683, "learning_rate": 3.0396713636343553e-05, "loss": 0.7695, "num_input_tokens_seen": 38012360, "step": 65485 }, { "epoch": 9.754244861483468, "grad_norm": 0.7809630632400513, "learning_rate": 3.039354078275771e-05, "loss": 0.5102, "num_input_tokens_seen": 38015304, "step": 65490 }, { "epoch": 9.754989574024426, "grad_norm": 1.4212723970413208, "learning_rate": 3.039036783805349e-05, "loss": 0.6007, "num_input_tokens_seen": 38018280, "step": 65495 }, { "epoch": 9.755734286565385, "grad_norm": 0.9398754835128784, "learning_rate": 3.0387194802284474e-05, "loss": 0.5365, "num_input_tokens_seen": 38021096, "step": 65500 }, { "epoch": 9.756478999106346, "grad_norm": 1.4931073188781738, "learning_rate": 3.0384021675504283e-05, "loss": 0.6671, "num_input_tokens_seen": 38023976, "step": 65505 }, { "epoch": 9.757223711647304, "grad_norm": 1.4606406688690186, "learning_rate": 3.038084845776651e-05, "loss": 0.5601, "num_input_tokens_seen": 38026952, "step": 65510 }, { "epoch": 9.757968424188263, "grad_norm": 1.2954072952270508, "learning_rate": 3.0377675149124772e-05, "loss": 0.6315, "num_input_tokens_seen": 38029832, "step": 65515 }, { "epoch": 9.758713136729222, "grad_norm": 1.5002597570419312, "learning_rate": 3.0374501749632684e-05, "loss": 0.7522, "num_input_tokens_seen": 38032840, "step": 65520 }, { "epoch": 9.759457849270182, "grad_norm": 1.476449966430664, "learning_rate": 3.037132825934385e-05, "loss": 0.6178, "num_input_tokens_seen": 38036168, "step": 65525 }, { "epoch": 9.760202561811141, "grad_norm": 0.808265745639801, "learning_rate": 3.0368154678311888e-05, "loss": 0.5335, "num_input_tokens_seen": 38038920, "step": 65530 }, { "epoch": 9.7609472743521, "grad_norm": 1.163309097290039, "learning_rate": 3.0364981006590404e-05, "loss": 0.5477, "num_input_tokens_seen": 38042152, "step": 65535 }, { "epoch": 9.761691986893059, "grad_norm": 1.1465563774108887, "learning_rate": 3.0361807244233016e-05, "loss": 0.576, "num_input_tokens_seen": 38045320, "step": 65540 }, { "epoch": 9.76243669943402, "grad_norm": 2.118908405303955, "learning_rate": 3.0358633391293346e-05, "loss": 0.5816, "num_input_tokens_seen": 38048072, "step": 65545 }, { "epoch": 9.763181411974978, "grad_norm": 1.0274823904037476, "learning_rate": 3.0355459447825014e-05, "loss": 0.5837, "num_input_tokens_seen": 38050856, "step": 65550 }, { "epoch": 9.763926124515937, "grad_norm": 1.0415513515472412, "learning_rate": 3.0352285413881636e-05, "loss": 0.5795, "num_input_tokens_seen": 38053608, "step": 65555 }, { "epoch": 9.764670837056896, "grad_norm": 1.0439428091049194, "learning_rate": 3.0349111289516834e-05, "loss": 0.5391, "num_input_tokens_seen": 38056168, "step": 65560 }, { "epoch": 9.765415549597854, "grad_norm": 1.3969602584838867, "learning_rate": 3.0345937074784235e-05, "loss": 0.6949, "num_input_tokens_seen": 38059048, "step": 65565 }, { "epoch": 9.766160262138815, "grad_norm": 2.495058059692383, "learning_rate": 3.0342762769737464e-05, "loss": 0.5747, "num_input_tokens_seen": 38061928, "step": 65570 }, { "epoch": 9.766904974679774, "grad_norm": 1.2864747047424316, "learning_rate": 3.0339588374430146e-05, "loss": 0.5657, "num_input_tokens_seen": 38065032, "step": 65575 }, { "epoch": 9.767649687220732, "grad_norm": 1.0932071208953857, "learning_rate": 3.0336413888915903e-05, "loss": 0.6451, "num_input_tokens_seen": 38067944, "step": 65580 }, { "epoch": 9.768394399761693, "grad_norm": 2.365718126296997, "learning_rate": 3.0333239313248372e-05, "loss": 0.7181, "num_input_tokens_seen": 38070536, "step": 65585 }, { "epoch": 9.769139112302652, "grad_norm": 1.324175238609314, "learning_rate": 3.0330064647481195e-05, "loss": 0.6682, "num_input_tokens_seen": 38073480, "step": 65590 }, { "epoch": 9.76988382484361, "grad_norm": 1.9844732284545898, "learning_rate": 3.032688989166798e-05, "loss": 0.7466, "num_input_tokens_seen": 38076648, "step": 65595 }, { "epoch": 9.77062853738457, "grad_norm": 0.7062230110168457, "learning_rate": 3.0323715045862382e-05, "loss": 0.5773, "num_input_tokens_seen": 38080008, "step": 65600 }, { "epoch": 9.771373249925528, "grad_norm": 1.638135313987732, "learning_rate": 3.0320540110118022e-05, "loss": 0.5808, "num_input_tokens_seen": 38082696, "step": 65605 }, { "epoch": 9.772117962466488, "grad_norm": 2.2428488731384277, "learning_rate": 3.031736508448855e-05, "loss": 0.7064, "num_input_tokens_seen": 38085608, "step": 65610 }, { "epoch": 9.772862675007447, "grad_norm": 1.1927101612091064, "learning_rate": 3.03141899690276e-05, "loss": 0.5265, "num_input_tokens_seen": 38088200, "step": 65615 }, { "epoch": 9.773607387548406, "grad_norm": 1.8523231744766235, "learning_rate": 3.031101476378881e-05, "loss": 0.6799, "num_input_tokens_seen": 38090760, "step": 65620 }, { "epoch": 9.774352100089365, "grad_norm": 1.2439301013946533, "learning_rate": 3.0307839468825826e-05, "loss": 0.6091, "num_input_tokens_seen": 38093544, "step": 65625 }, { "epoch": 9.775096812630325, "grad_norm": 0.9486682415008545, "learning_rate": 3.0304664084192286e-05, "loss": 0.8385, "num_input_tokens_seen": 38096392, "step": 65630 }, { "epoch": 9.775841525171284, "grad_norm": 0.9310079216957092, "learning_rate": 3.0301488609941837e-05, "loss": 0.6613, "num_input_tokens_seen": 38099368, "step": 65635 }, { "epoch": 9.776586237712243, "grad_norm": 1.5742065906524658, "learning_rate": 3.0298313046128123e-05, "loss": 0.5327, "num_input_tokens_seen": 38102792, "step": 65640 }, { "epoch": 9.777330950253202, "grad_norm": 1.4348901510238647, "learning_rate": 3.0295137392804796e-05, "loss": 0.5268, "num_input_tokens_seen": 38105544, "step": 65645 }, { "epoch": 9.778075662794162, "grad_norm": 2.198096990585327, "learning_rate": 3.0291961650025512e-05, "loss": 0.6753, "num_input_tokens_seen": 38108680, "step": 65650 }, { "epoch": 9.77882037533512, "grad_norm": 1.138114333152771, "learning_rate": 3.0288785817843907e-05, "loss": 0.4646, "num_input_tokens_seen": 38111656, "step": 65655 }, { "epoch": 9.77956508787608, "grad_norm": 1.159349799156189, "learning_rate": 3.028560989631365e-05, "loss": 0.6255, "num_input_tokens_seen": 38114696, "step": 65660 }, { "epoch": 9.780309800417038, "grad_norm": 1.4727773666381836, "learning_rate": 3.0282433885488375e-05, "loss": 0.7379, "num_input_tokens_seen": 38117640, "step": 65665 }, { "epoch": 9.781054512957999, "grad_norm": 1.5784841775894165, "learning_rate": 3.0279257785421755e-05, "loss": 0.678, "num_input_tokens_seen": 38120424, "step": 65670 }, { "epoch": 9.781799225498958, "grad_norm": 1.172081708908081, "learning_rate": 3.0276081596167434e-05, "loss": 0.4736, "num_input_tokens_seen": 38123176, "step": 65675 }, { "epoch": 9.782543938039916, "grad_norm": 2.165588140487671, "learning_rate": 3.027290531777908e-05, "loss": 0.6844, "num_input_tokens_seen": 38126056, "step": 65680 }, { "epoch": 9.783288650580875, "grad_norm": 1.011857032775879, "learning_rate": 3.026972895031035e-05, "loss": 0.8558, "num_input_tokens_seen": 38129064, "step": 65685 }, { "epoch": 9.784033363121836, "grad_norm": 1.7346833944320679, "learning_rate": 3.026655249381491e-05, "loss": 0.6217, "num_input_tokens_seen": 38132008, "step": 65690 }, { "epoch": 9.784778075662794, "grad_norm": 1.4940190315246582, "learning_rate": 3.0263375948346416e-05, "loss": 0.5601, "num_input_tokens_seen": 38134760, "step": 65695 }, { "epoch": 9.785522788203753, "grad_norm": 2.0793004035949707, "learning_rate": 3.026019931395853e-05, "loss": 0.5633, "num_input_tokens_seen": 38137352, "step": 65700 }, { "epoch": 9.786267500744712, "grad_norm": 1.2736430168151855, "learning_rate": 3.0257022590704926e-05, "loss": 0.3821, "num_input_tokens_seen": 38140200, "step": 65705 }, { "epoch": 9.787012213285673, "grad_norm": 1.4224238395690918, "learning_rate": 3.0253845778639267e-05, "loss": 0.6691, "num_input_tokens_seen": 38143144, "step": 65710 }, { "epoch": 9.787756925826631, "grad_norm": 0.8166702389717102, "learning_rate": 3.0250668877815226e-05, "loss": 0.6339, "num_input_tokens_seen": 38146216, "step": 65715 }, { "epoch": 9.78850163836759, "grad_norm": 1.2834625244140625, "learning_rate": 3.024749188828647e-05, "loss": 0.5605, "num_input_tokens_seen": 38149064, "step": 65720 }, { "epoch": 9.789246350908549, "grad_norm": 4.375669956207275, "learning_rate": 3.024431481010667e-05, "loss": 0.5508, "num_input_tokens_seen": 38151944, "step": 65725 }, { "epoch": 9.78999106344951, "grad_norm": 2.3601791858673096, "learning_rate": 3.0241137643329508e-05, "loss": 0.6971, "num_input_tokens_seen": 38154568, "step": 65730 }, { "epoch": 9.790735775990468, "grad_norm": 0.9673590660095215, "learning_rate": 3.0237960388008647e-05, "loss": 0.47, "num_input_tokens_seen": 38157320, "step": 65735 }, { "epoch": 9.791480488531427, "grad_norm": 1.9931336641311646, "learning_rate": 3.0234783044197767e-05, "loss": 0.7119, "num_input_tokens_seen": 38161096, "step": 65740 }, { "epoch": 9.792225201072386, "grad_norm": 1.7219336032867432, "learning_rate": 3.0231605611950548e-05, "loss": 0.5098, "num_input_tokens_seen": 38163848, "step": 65745 }, { "epoch": 9.792969913613344, "grad_norm": 1.0532004833221436, "learning_rate": 3.0228428091320672e-05, "loss": 0.5057, "num_input_tokens_seen": 38166568, "step": 65750 }, { "epoch": 9.793714626154305, "grad_norm": 1.2630237340927124, "learning_rate": 3.0225250482361818e-05, "loss": 0.6246, "num_input_tokens_seen": 38169224, "step": 65755 }, { "epoch": 9.794459338695264, "grad_norm": 2.9824132919311523, "learning_rate": 3.0222072785127663e-05, "loss": 0.7215, "num_input_tokens_seen": 38172072, "step": 65760 }, { "epoch": 9.795204051236222, "grad_norm": 1.6305798292160034, "learning_rate": 3.02188949996719e-05, "loss": 0.5269, "num_input_tokens_seen": 38174888, "step": 65765 }, { "epoch": 9.795948763777183, "grad_norm": 2.0620477199554443, "learning_rate": 3.021571712604821e-05, "loss": 0.6989, "num_input_tokens_seen": 38177928, "step": 65770 }, { "epoch": 9.796693476318142, "grad_norm": 1.2498985528945923, "learning_rate": 3.0212539164310276e-05, "loss": 0.6655, "num_input_tokens_seen": 38180904, "step": 65775 }, { "epoch": 9.7974381888591, "grad_norm": 1.1255289316177368, "learning_rate": 3.0209361114511796e-05, "loss": 0.6991, "num_input_tokens_seen": 38183816, "step": 65780 }, { "epoch": 9.79818290140006, "grad_norm": 1.3796625137329102, "learning_rate": 3.0206182976706447e-05, "loss": 0.6084, "num_input_tokens_seen": 38186824, "step": 65785 }, { "epoch": 9.798927613941018, "grad_norm": 0.6538843512535095, "learning_rate": 3.0203004750947938e-05, "loss": 0.5307, "num_input_tokens_seen": 38189992, "step": 65790 }, { "epoch": 9.799672326481979, "grad_norm": 1.1898974180221558, "learning_rate": 3.0199826437289947e-05, "loss": 0.5962, "num_input_tokens_seen": 38193064, "step": 65795 }, { "epoch": 9.800417039022937, "grad_norm": 1.1298116445541382, "learning_rate": 3.0196648035786173e-05, "loss": 0.5627, "num_input_tokens_seen": 38196168, "step": 65800 }, { "epoch": 9.801161751563896, "grad_norm": 1.500687599182129, "learning_rate": 3.019346954649031e-05, "loss": 0.6769, "num_input_tokens_seen": 38198952, "step": 65805 }, { "epoch": 9.801906464104855, "grad_norm": 2.1065356731414795, "learning_rate": 3.0190290969456063e-05, "loss": 0.5388, "num_input_tokens_seen": 38201768, "step": 65810 }, { "epoch": 9.802651176645815, "grad_norm": 1.6670323610305786, "learning_rate": 3.0187112304737125e-05, "loss": 0.5126, "num_input_tokens_seen": 38204392, "step": 65815 }, { "epoch": 9.803395889186774, "grad_norm": 1.2165061235427856, "learning_rate": 3.0183933552387188e-05, "loss": 0.6423, "num_input_tokens_seen": 38207208, "step": 65820 }, { "epoch": 9.804140601727733, "grad_norm": 1.6595757007598877, "learning_rate": 3.0180754712459973e-05, "loss": 0.7318, "num_input_tokens_seen": 38210216, "step": 65825 }, { "epoch": 9.804885314268692, "grad_norm": 0.945991575717926, "learning_rate": 3.0177575785009172e-05, "loss": 0.5406, "num_input_tokens_seen": 38213000, "step": 65830 }, { "epoch": 9.805630026809652, "grad_norm": 1.8731971979141235, "learning_rate": 3.017439677008848e-05, "loss": 0.7454, "num_input_tokens_seen": 38216072, "step": 65835 }, { "epoch": 9.80637473935061, "grad_norm": 1.264923095703125, "learning_rate": 3.0171217667751617e-05, "loss": 0.4556, "num_input_tokens_seen": 38219080, "step": 65840 }, { "epoch": 9.80711945189157, "grad_norm": 1.6760616302490234, "learning_rate": 3.016803847805229e-05, "loss": 0.5003, "num_input_tokens_seen": 38222088, "step": 65845 }, { "epoch": 9.807864164432528, "grad_norm": 1.3108196258544922, "learning_rate": 3.016485920104421e-05, "loss": 0.5461, "num_input_tokens_seen": 38225544, "step": 65850 }, { "epoch": 9.808608876973489, "grad_norm": 0.8985755443572998, "learning_rate": 3.0161679836781076e-05, "loss": 0.6115, "num_input_tokens_seen": 38228456, "step": 65855 }, { "epoch": 9.809353589514448, "grad_norm": 0.8073721528053284, "learning_rate": 3.0158500385316612e-05, "loss": 0.6347, "num_input_tokens_seen": 38231592, "step": 65860 }, { "epoch": 9.810098302055406, "grad_norm": 1.608978271484375, "learning_rate": 3.0155320846704526e-05, "loss": 0.6384, "num_input_tokens_seen": 38234280, "step": 65865 }, { "epoch": 9.810843014596365, "grad_norm": 2.6926257610321045, "learning_rate": 3.015214122099853e-05, "loss": 0.584, "num_input_tokens_seen": 38237128, "step": 65870 }, { "epoch": 9.811587727137326, "grad_norm": 1.9618149995803833, "learning_rate": 3.0148961508252347e-05, "loss": 0.5087, "num_input_tokens_seen": 38239912, "step": 65875 }, { "epoch": 9.812332439678285, "grad_norm": 1.0223995447158813, "learning_rate": 3.0145781708519692e-05, "loss": 0.5133, "num_input_tokens_seen": 38242664, "step": 65880 }, { "epoch": 9.813077152219243, "grad_norm": 1.5813779830932617, "learning_rate": 3.0142601821854288e-05, "loss": 0.5906, "num_input_tokens_seen": 38245736, "step": 65885 }, { "epoch": 9.813821864760202, "grad_norm": 1.0513070821762085, "learning_rate": 3.0139421848309852e-05, "loss": 0.6203, "num_input_tokens_seen": 38248584, "step": 65890 }, { "epoch": 9.814566577301163, "grad_norm": 0.9155066013336182, "learning_rate": 3.0136241787940107e-05, "loss": 0.6138, "num_input_tokens_seen": 38251496, "step": 65895 }, { "epoch": 9.815311289842121, "grad_norm": 0.9722592830657959, "learning_rate": 3.0133061640798776e-05, "loss": 0.67, "num_input_tokens_seen": 38254280, "step": 65900 }, { "epoch": 9.81605600238308, "grad_norm": 1.4283661842346191, "learning_rate": 3.0129881406939587e-05, "loss": 0.5749, "num_input_tokens_seen": 38257128, "step": 65905 }, { "epoch": 9.816800714924039, "grad_norm": 2.3734958171844482, "learning_rate": 3.012670108641626e-05, "loss": 0.5637, "num_input_tokens_seen": 38260104, "step": 65910 }, { "epoch": 9.817545427465, "grad_norm": 1.4244581460952759, "learning_rate": 3.012352067928253e-05, "loss": 0.574, "num_input_tokens_seen": 38263336, "step": 65915 }, { "epoch": 9.818290140005958, "grad_norm": 1.4230307340621948, "learning_rate": 3.0120340185592132e-05, "loss": 0.4728, "num_input_tokens_seen": 38266280, "step": 65920 }, { "epoch": 9.819034852546917, "grad_norm": 1.5236343145370483, "learning_rate": 3.0117159605398786e-05, "loss": 0.5102, "num_input_tokens_seen": 38269448, "step": 65925 }, { "epoch": 9.819779565087876, "grad_norm": 2.3392128944396973, "learning_rate": 3.0113978938756237e-05, "loss": 0.8654, "num_input_tokens_seen": 38272200, "step": 65930 }, { "epoch": 9.820524277628834, "grad_norm": 0.9467536211013794, "learning_rate": 3.0110798185718202e-05, "loss": 0.7279, "num_input_tokens_seen": 38275208, "step": 65935 }, { "epoch": 9.821268990169795, "grad_norm": 1.45711088180542, "learning_rate": 3.0107617346338422e-05, "loss": 0.5754, "num_input_tokens_seen": 38278088, "step": 65940 }, { "epoch": 9.822013702710754, "grad_norm": 2.1813552379608154, "learning_rate": 3.0104436420670644e-05, "loss": 0.6906, "num_input_tokens_seen": 38280616, "step": 65945 }, { "epoch": 9.822758415251712, "grad_norm": 0.7919044494628906, "learning_rate": 3.0101255408768603e-05, "loss": 0.6594, "num_input_tokens_seen": 38283496, "step": 65950 }, { "epoch": 9.823503127792671, "grad_norm": 2.932976245880127, "learning_rate": 3.0098074310686042e-05, "loss": 0.6607, "num_input_tokens_seen": 38286408, "step": 65955 }, { "epoch": 9.824247840333632, "grad_norm": 2.0672085285186768, "learning_rate": 3.0094893126476686e-05, "loss": 0.5222, "num_input_tokens_seen": 38289352, "step": 65960 }, { "epoch": 9.82499255287459, "grad_norm": 0.7819859981536865, "learning_rate": 3.0091711856194295e-05, "loss": 0.6243, "num_input_tokens_seen": 38292104, "step": 65965 }, { "epoch": 9.82573726541555, "grad_norm": 0.929872453212738, "learning_rate": 3.0088530499892605e-05, "loss": 0.6541, "num_input_tokens_seen": 38295016, "step": 65970 }, { "epoch": 9.826481977956508, "grad_norm": 0.5425230860710144, "learning_rate": 3.008534905762536e-05, "loss": 0.4682, "num_input_tokens_seen": 38297896, "step": 65975 }, { "epoch": 9.827226690497469, "grad_norm": 1.6674151420593262, "learning_rate": 3.0082167529446314e-05, "loss": 0.4611, "num_input_tokens_seen": 38300808, "step": 65980 }, { "epoch": 9.827971403038427, "grad_norm": 1.2382287979125977, "learning_rate": 3.0078985915409214e-05, "loss": 0.5354, "num_input_tokens_seen": 38303496, "step": 65985 }, { "epoch": 9.828716115579386, "grad_norm": 1.1858875751495361, "learning_rate": 3.0075804215567817e-05, "loss": 0.659, "num_input_tokens_seen": 38306216, "step": 65990 }, { "epoch": 9.829460828120345, "grad_norm": 1.8684622049331665, "learning_rate": 3.0072622429975856e-05, "loss": 0.5269, "num_input_tokens_seen": 38308904, "step": 65995 }, { "epoch": 9.830205540661305, "grad_norm": 0.727007269859314, "learning_rate": 3.006944055868709e-05, "loss": 0.5161, "num_input_tokens_seen": 38311944, "step": 66000 }, { "epoch": 9.830950253202264, "grad_norm": 1.0206530094146729, "learning_rate": 3.0066258601755288e-05, "loss": 0.5795, "num_input_tokens_seen": 38314664, "step": 66005 }, { "epoch": 9.831694965743223, "grad_norm": 1.2170658111572266, "learning_rate": 3.0063076559234192e-05, "loss": 0.6497, "num_input_tokens_seen": 38317352, "step": 66010 }, { "epoch": 9.832439678284182, "grad_norm": 1.1687889099121094, "learning_rate": 3.0059894431177565e-05, "loss": 0.6094, "num_input_tokens_seen": 38320296, "step": 66015 }, { "epoch": 9.833184390825142, "grad_norm": 1.8876310586929321, "learning_rate": 3.0056712217639165e-05, "loss": 0.6189, "num_input_tokens_seen": 38323368, "step": 66020 }, { "epoch": 9.833929103366101, "grad_norm": 1.7710294723510742, "learning_rate": 3.005352991867275e-05, "loss": 0.6025, "num_input_tokens_seen": 38326344, "step": 66025 }, { "epoch": 9.83467381590706, "grad_norm": 1.416447639465332, "learning_rate": 3.0050347534332084e-05, "loss": 0.6247, "num_input_tokens_seen": 38329352, "step": 66030 }, { "epoch": 9.835418528448018, "grad_norm": 1.334113359451294, "learning_rate": 3.004716506467093e-05, "loss": 0.6783, "num_input_tokens_seen": 38332296, "step": 66035 }, { "epoch": 9.836163240988979, "grad_norm": 1.3767945766448975, "learning_rate": 3.0043982509743052e-05, "loss": 0.6304, "num_input_tokens_seen": 38335112, "step": 66040 }, { "epoch": 9.836907953529938, "grad_norm": 2.194589376449585, "learning_rate": 3.004079986960221e-05, "loss": 0.568, "num_input_tokens_seen": 38338120, "step": 66045 }, { "epoch": 9.837652666070897, "grad_norm": 1.465218424797058, "learning_rate": 3.0037617144302188e-05, "loss": 0.556, "num_input_tokens_seen": 38341160, "step": 66050 }, { "epoch": 9.838397378611855, "grad_norm": 2.2703070640563965, "learning_rate": 3.0034434333896737e-05, "loss": 0.6933, "num_input_tokens_seen": 38344168, "step": 66055 }, { "epoch": 9.839142091152816, "grad_norm": 0.9484546184539795, "learning_rate": 3.003125143843964e-05, "loss": 0.5244, "num_input_tokens_seen": 38347048, "step": 66060 }, { "epoch": 9.839886803693775, "grad_norm": 1.6554019451141357, "learning_rate": 3.002806845798466e-05, "loss": 0.7007, "num_input_tokens_seen": 38350056, "step": 66065 }, { "epoch": 9.840631516234733, "grad_norm": 1.691549301147461, "learning_rate": 3.002488539258557e-05, "loss": 0.5549, "num_input_tokens_seen": 38352776, "step": 66070 }, { "epoch": 9.841376228775692, "grad_norm": 1.8600763082504272, "learning_rate": 3.0021702242296153e-05, "loss": 0.4711, "num_input_tokens_seen": 38355816, "step": 66075 }, { "epoch": 9.842120941316653, "grad_norm": 2.1215546131134033, "learning_rate": 3.0018519007170177e-05, "loss": 0.71, "num_input_tokens_seen": 38358920, "step": 66080 }, { "epoch": 9.842865653857611, "grad_norm": 1.9885905981063843, "learning_rate": 3.0015335687261425e-05, "loss": 0.7094, "num_input_tokens_seen": 38361768, "step": 66085 }, { "epoch": 9.84361036639857, "grad_norm": 1.4717903137207031, "learning_rate": 3.001215228262368e-05, "loss": 0.3833, "num_input_tokens_seen": 38364616, "step": 66090 }, { "epoch": 9.844355078939529, "grad_norm": 1.4356601238250732, "learning_rate": 3.000896879331071e-05, "loss": 0.6603, "num_input_tokens_seen": 38367528, "step": 66095 }, { "epoch": 9.84509979148049, "grad_norm": 1.506812334060669, "learning_rate": 3.0005785219376304e-05, "loss": 0.654, "num_input_tokens_seen": 38370504, "step": 66100 }, { "epoch": 9.845844504021448, "grad_norm": 1.2063180208206177, "learning_rate": 3.000260156087424e-05, "loss": 0.6653, "num_input_tokens_seen": 38373320, "step": 66105 }, { "epoch": 9.846589216562407, "grad_norm": 1.2652628421783447, "learning_rate": 2.999941781785831e-05, "loss": 0.5267, "num_input_tokens_seen": 38376200, "step": 66110 }, { "epoch": 9.847333929103366, "grad_norm": 1.486365556716919, "learning_rate": 2.9996233990382296e-05, "loss": 0.4749, "num_input_tokens_seen": 38379176, "step": 66115 }, { "epoch": 9.848078641644324, "grad_norm": 1.410132884979248, "learning_rate": 2.9993050078499997e-05, "loss": 0.6463, "num_input_tokens_seen": 38382152, "step": 66120 }, { "epoch": 9.848823354185285, "grad_norm": 1.4384346008300781, "learning_rate": 2.9989866082265177e-05, "loss": 0.5856, "num_input_tokens_seen": 38385192, "step": 66125 }, { "epoch": 9.849568066726244, "grad_norm": 1.642803430557251, "learning_rate": 2.9986682001731647e-05, "loss": 0.6151, "num_input_tokens_seen": 38388328, "step": 66130 }, { "epoch": 9.850312779267203, "grad_norm": 1.7827337980270386, "learning_rate": 2.99834978369532e-05, "loss": 0.6502, "num_input_tokens_seen": 38391048, "step": 66135 }, { "epoch": 9.851057491808161, "grad_norm": 2.0120527744293213, "learning_rate": 2.998031358798361e-05, "loss": 0.6532, "num_input_tokens_seen": 38393832, "step": 66140 }, { "epoch": 9.851802204349122, "grad_norm": 1.2696055173873901, "learning_rate": 2.997712925487669e-05, "loss": 0.7526, "num_input_tokens_seen": 38396840, "step": 66145 }, { "epoch": 9.85254691689008, "grad_norm": 1.0641043186187744, "learning_rate": 2.9973944837686228e-05, "loss": 0.4644, "num_input_tokens_seen": 38399784, "step": 66150 }, { "epoch": 9.85329162943104, "grad_norm": 1.1603820323944092, "learning_rate": 2.9970760336466032e-05, "loss": 0.8666, "num_input_tokens_seen": 38403016, "step": 66155 }, { "epoch": 9.854036341971998, "grad_norm": 1.1549569368362427, "learning_rate": 2.9967575751269878e-05, "loss": 0.5109, "num_input_tokens_seen": 38406024, "step": 66160 }, { "epoch": 9.854781054512959, "grad_norm": 0.8556466698646545, "learning_rate": 2.9964391082151587e-05, "loss": 0.5246, "num_input_tokens_seen": 38409160, "step": 66165 }, { "epoch": 9.855525767053917, "grad_norm": 0.8576076626777649, "learning_rate": 2.9961206329164952e-05, "loss": 0.7385, "num_input_tokens_seen": 38412136, "step": 66170 }, { "epoch": 9.856270479594876, "grad_norm": 2.6577625274658203, "learning_rate": 2.9958021492363787e-05, "loss": 0.6682, "num_input_tokens_seen": 38414888, "step": 66175 }, { "epoch": 9.857015192135835, "grad_norm": 1.7215455770492554, "learning_rate": 2.9954836571801875e-05, "loss": 0.5004, "num_input_tokens_seen": 38417640, "step": 66180 }, { "epoch": 9.857759904676795, "grad_norm": 1.5395723581314087, "learning_rate": 2.9951651567533046e-05, "loss": 0.5941, "num_input_tokens_seen": 38420808, "step": 66185 }, { "epoch": 9.858504617217754, "grad_norm": 1.7881983518600464, "learning_rate": 2.994846647961109e-05, "loss": 0.5789, "num_input_tokens_seen": 38423848, "step": 66190 }, { "epoch": 9.859249329758713, "grad_norm": 1.0819669961929321, "learning_rate": 2.9945281308089824e-05, "loss": 0.587, "num_input_tokens_seen": 38426568, "step": 66195 }, { "epoch": 9.859994042299672, "grad_norm": 1.1340827941894531, "learning_rate": 2.9942096053023055e-05, "loss": 0.6572, "num_input_tokens_seen": 38429736, "step": 66200 }, { "epoch": 9.860738754840632, "grad_norm": 1.2076960802078247, "learning_rate": 2.9938910714464596e-05, "loss": 0.5805, "num_input_tokens_seen": 38432680, "step": 66205 }, { "epoch": 9.861483467381591, "grad_norm": 1.6118135452270508, "learning_rate": 2.9935725292468263e-05, "loss": 0.5811, "num_input_tokens_seen": 38435400, "step": 66210 }, { "epoch": 9.86222817992255, "grad_norm": 1.7136805057525635, "learning_rate": 2.9932539787087872e-05, "loss": 0.6943, "num_input_tokens_seen": 38438344, "step": 66215 }, { "epoch": 9.862972892463509, "grad_norm": 1.6622798442840576, "learning_rate": 2.9929354198377223e-05, "loss": 0.5839, "num_input_tokens_seen": 38441288, "step": 66220 }, { "epoch": 9.863717605004469, "grad_norm": 3.6940298080444336, "learning_rate": 2.9926168526390157e-05, "loss": 0.6015, "num_input_tokens_seen": 38444264, "step": 66225 }, { "epoch": 9.864462317545428, "grad_norm": 1.2157094478607178, "learning_rate": 2.9922982771180475e-05, "loss": 0.6792, "num_input_tokens_seen": 38447080, "step": 66230 }, { "epoch": 9.865207030086387, "grad_norm": 0.8085927367210388, "learning_rate": 2.9919796932801996e-05, "loss": 0.5377, "num_input_tokens_seen": 38449896, "step": 66235 }, { "epoch": 9.865951742627345, "grad_norm": 1.2619473934173584, "learning_rate": 2.9916611011308555e-05, "loss": 0.5287, "num_input_tokens_seen": 38452712, "step": 66240 }, { "epoch": 9.866696455168306, "grad_norm": 1.068676471710205, "learning_rate": 2.9913425006753965e-05, "loss": 0.8679, "num_input_tokens_seen": 38455432, "step": 66245 }, { "epoch": 9.867441167709265, "grad_norm": 0.9249298572540283, "learning_rate": 2.9910238919192058e-05, "loss": 0.6108, "num_input_tokens_seen": 38458504, "step": 66250 }, { "epoch": 9.868185880250223, "grad_norm": 0.9351729154586792, "learning_rate": 2.9907052748676656e-05, "loss": 0.5977, "num_input_tokens_seen": 38461416, "step": 66255 }, { "epoch": 9.868930592791182, "grad_norm": 1.1903935670852661, "learning_rate": 2.9903866495261578e-05, "loss": 0.6675, "num_input_tokens_seen": 38464296, "step": 66260 }, { "epoch": 9.86967530533214, "grad_norm": 1.3858541250228882, "learning_rate": 2.9900680159000666e-05, "loss": 0.5504, "num_input_tokens_seen": 38466952, "step": 66265 }, { "epoch": 9.870420017873101, "grad_norm": 1.4866740703582764, "learning_rate": 2.9897493739947736e-05, "loss": 0.5322, "num_input_tokens_seen": 38469960, "step": 66270 }, { "epoch": 9.87116473041406, "grad_norm": 1.7901052236557007, "learning_rate": 2.9894307238156634e-05, "loss": 0.5438, "num_input_tokens_seen": 38472872, "step": 66275 }, { "epoch": 9.871909442955019, "grad_norm": 1.456916332244873, "learning_rate": 2.989112065368118e-05, "loss": 0.546, "num_input_tokens_seen": 38475752, "step": 66280 }, { "epoch": 9.87265415549598, "grad_norm": 1.6529842615127563, "learning_rate": 2.9887933986575218e-05, "loss": 0.6646, "num_input_tokens_seen": 38478472, "step": 66285 }, { "epoch": 9.873398868036938, "grad_norm": 1.191497564315796, "learning_rate": 2.9884747236892578e-05, "loss": 0.6284, "num_input_tokens_seen": 38481128, "step": 66290 }, { "epoch": 9.874143580577897, "grad_norm": 0.7205085754394531, "learning_rate": 2.9881560404687103e-05, "loss": 0.6134, "num_input_tokens_seen": 38484104, "step": 66295 }, { "epoch": 9.874888293118856, "grad_norm": 2.3026084899902344, "learning_rate": 2.9878373490012617e-05, "loss": 0.6523, "num_input_tokens_seen": 38487016, "step": 66300 }, { "epoch": 9.875633005659815, "grad_norm": 2.334920644760132, "learning_rate": 2.9875186492922973e-05, "loss": 0.6063, "num_input_tokens_seen": 38489832, "step": 66305 }, { "epoch": 9.876377718200775, "grad_norm": 1.0132555961608887, "learning_rate": 2.9871999413472006e-05, "loss": 0.5499, "num_input_tokens_seen": 38492968, "step": 66310 }, { "epoch": 9.877122430741734, "grad_norm": 1.5082646608352661, "learning_rate": 2.9868812251713564e-05, "loss": 0.6422, "num_input_tokens_seen": 38495784, "step": 66315 }, { "epoch": 9.877867143282693, "grad_norm": 1.7927522659301758, "learning_rate": 2.9865625007701487e-05, "loss": 0.5736, "num_input_tokens_seen": 38498760, "step": 66320 }, { "epoch": 9.878611855823651, "grad_norm": 1.6884366273880005, "learning_rate": 2.986243768148962e-05, "loss": 0.6585, "num_input_tokens_seen": 38501832, "step": 66325 }, { "epoch": 9.879356568364612, "grad_norm": 2.6482491493225098, "learning_rate": 2.9859250273131812e-05, "loss": 0.5946, "num_input_tokens_seen": 38504648, "step": 66330 }, { "epoch": 9.88010128090557, "grad_norm": 1.2309982776641846, "learning_rate": 2.985606278268191e-05, "loss": 0.6096, "num_input_tokens_seen": 38507592, "step": 66335 }, { "epoch": 9.88084599344653, "grad_norm": 1.1794071197509766, "learning_rate": 2.985287521019376e-05, "loss": 0.6697, "num_input_tokens_seen": 38510664, "step": 66340 }, { "epoch": 9.881590705987488, "grad_norm": 2.5709028244018555, "learning_rate": 2.984968755572121e-05, "loss": 0.7946, "num_input_tokens_seen": 38513384, "step": 66345 }, { "epoch": 9.882335418528449, "grad_norm": 1.6553782224655151, "learning_rate": 2.9846499819318124e-05, "loss": 0.5043, "num_input_tokens_seen": 38516520, "step": 66350 }, { "epoch": 9.883080131069407, "grad_norm": 0.8968944549560547, "learning_rate": 2.9843312001038353e-05, "loss": 0.6289, "num_input_tokens_seen": 38519592, "step": 66355 }, { "epoch": 9.883824843610366, "grad_norm": 1.968286156654358, "learning_rate": 2.9840124100935744e-05, "loss": 0.5011, "num_input_tokens_seen": 38522408, "step": 66360 }, { "epoch": 9.884569556151325, "grad_norm": 0.8410263061523438, "learning_rate": 2.9836936119064156e-05, "loss": 0.6408, "num_input_tokens_seen": 38525512, "step": 66365 }, { "epoch": 9.885314268692285, "grad_norm": 1.4692105054855347, "learning_rate": 2.983374805547745e-05, "loss": 0.5757, "num_input_tokens_seen": 38528520, "step": 66370 }, { "epoch": 9.886058981233244, "grad_norm": 0.9304409623146057, "learning_rate": 2.983055991022949e-05, "loss": 0.6499, "num_input_tokens_seen": 38531496, "step": 66375 }, { "epoch": 9.886803693774203, "grad_norm": 1.2368632555007935, "learning_rate": 2.9827371683374116e-05, "loss": 0.6375, "num_input_tokens_seen": 38534184, "step": 66380 }, { "epoch": 9.887548406315162, "grad_norm": 1.1611328125, "learning_rate": 2.9824183374965214e-05, "loss": 0.5806, "num_input_tokens_seen": 38537256, "step": 66385 }, { "epoch": 9.888293118856122, "grad_norm": 2.1508266925811768, "learning_rate": 2.982099498505664e-05, "loss": 0.6039, "num_input_tokens_seen": 38540136, "step": 66390 }, { "epoch": 9.889037831397081, "grad_norm": 1.6590282917022705, "learning_rate": 2.9817806513702244e-05, "loss": 0.4168, "num_input_tokens_seen": 38542920, "step": 66395 }, { "epoch": 9.88978254393804, "grad_norm": 1.5437425374984741, "learning_rate": 2.9814617960955908e-05, "loss": 0.5903, "num_input_tokens_seen": 38545768, "step": 66400 }, { "epoch": 9.890527256478999, "grad_norm": 1.35367751121521, "learning_rate": 2.9811429326871498e-05, "loss": 0.5858, "num_input_tokens_seen": 38548552, "step": 66405 }, { "epoch": 9.891271969019959, "grad_norm": 1.7867536544799805, "learning_rate": 2.9808240611502873e-05, "loss": 0.6377, "num_input_tokens_seen": 38551208, "step": 66410 }, { "epoch": 9.892016681560918, "grad_norm": 1.6680271625518799, "learning_rate": 2.9805051814903923e-05, "loss": 0.4021, "num_input_tokens_seen": 38553896, "step": 66415 }, { "epoch": 9.892761394101877, "grad_norm": 1.5737553834915161, "learning_rate": 2.98018629371285e-05, "loss": 0.6229, "num_input_tokens_seen": 38557000, "step": 66420 }, { "epoch": 9.893506106642835, "grad_norm": 1.208712100982666, "learning_rate": 2.979867397823048e-05, "loss": 0.5523, "num_input_tokens_seen": 38560008, "step": 66425 }, { "epoch": 9.894250819183796, "grad_norm": 1.6378682851791382, "learning_rate": 2.979548493826374e-05, "loss": 0.655, "num_input_tokens_seen": 38562920, "step": 66430 }, { "epoch": 9.894995531724755, "grad_norm": 1.210453748703003, "learning_rate": 2.9792295817282157e-05, "loss": 0.5437, "num_input_tokens_seen": 38565864, "step": 66435 }, { "epoch": 9.895740244265713, "grad_norm": 1.1709967851638794, "learning_rate": 2.9789106615339603e-05, "loss": 0.712, "num_input_tokens_seen": 38568904, "step": 66440 }, { "epoch": 9.896484956806672, "grad_norm": 0.8372125625610352, "learning_rate": 2.9785917332489965e-05, "loss": 0.5727, "num_input_tokens_seen": 38571784, "step": 66445 }, { "epoch": 9.897229669347631, "grad_norm": 1.1723495721817017, "learning_rate": 2.9782727968787116e-05, "loss": 0.5536, "num_input_tokens_seen": 38574632, "step": 66450 }, { "epoch": 9.897974381888591, "grad_norm": 1.045375943183899, "learning_rate": 2.9779538524284943e-05, "loss": 0.6487, "num_input_tokens_seen": 38577448, "step": 66455 }, { "epoch": 9.89871909442955, "grad_norm": 2.740516185760498, "learning_rate": 2.9776348999037322e-05, "loss": 0.7179, "num_input_tokens_seen": 38580520, "step": 66460 }, { "epoch": 9.899463806970509, "grad_norm": 1.234033226966858, "learning_rate": 2.9773159393098137e-05, "loss": 0.6888, "num_input_tokens_seen": 38583528, "step": 66465 }, { "epoch": 9.900208519511468, "grad_norm": 1.3510289192199707, "learning_rate": 2.9769969706521277e-05, "loss": 0.5625, "num_input_tokens_seen": 38586248, "step": 66470 }, { "epoch": 9.900953232052428, "grad_norm": 1.8068352937698364, "learning_rate": 2.9766779939360623e-05, "loss": 0.4914, "num_input_tokens_seen": 38589000, "step": 66475 }, { "epoch": 9.901697944593387, "grad_norm": 1.2941927909851074, "learning_rate": 2.976359009167007e-05, "loss": 0.6494, "num_input_tokens_seen": 38592232, "step": 66480 }, { "epoch": 9.902442657134346, "grad_norm": 2.046295166015625, "learning_rate": 2.976040016350351e-05, "loss": 0.6363, "num_input_tokens_seen": 38595144, "step": 66485 }, { "epoch": 9.903187369675305, "grad_norm": 1.2376623153686523, "learning_rate": 2.9757210154914816e-05, "loss": 0.7139, "num_input_tokens_seen": 38597768, "step": 66490 }, { "epoch": 9.903932082216265, "grad_norm": 1.4487191438674927, "learning_rate": 2.9754020065957905e-05, "loss": 0.5429, "num_input_tokens_seen": 38600680, "step": 66495 }, { "epoch": 9.904676794757224, "grad_norm": 1.4960224628448486, "learning_rate": 2.9750829896686645e-05, "loss": 0.6586, "num_input_tokens_seen": 38603432, "step": 66500 }, { "epoch": 9.905421507298183, "grad_norm": 1.7897083759307861, "learning_rate": 2.9747639647154947e-05, "loss": 0.6271, "num_input_tokens_seen": 38606312, "step": 66505 }, { "epoch": 9.906166219839141, "grad_norm": 1.9602779150009155, "learning_rate": 2.97444493174167e-05, "loss": 0.6469, "num_input_tokens_seen": 38609256, "step": 66510 }, { "epoch": 9.906910932380102, "grad_norm": 1.5316718816757202, "learning_rate": 2.9741258907525805e-05, "loss": 0.4854, "num_input_tokens_seen": 38612360, "step": 66515 }, { "epoch": 9.90765564492106, "grad_norm": 0.9593804478645325, "learning_rate": 2.9738068417536165e-05, "loss": 0.6111, "num_input_tokens_seen": 38615048, "step": 66520 }, { "epoch": 9.90840035746202, "grad_norm": 2.1071414947509766, "learning_rate": 2.9734877847501664e-05, "loss": 0.8128, "num_input_tokens_seen": 38617960, "step": 66525 }, { "epoch": 9.909145070002978, "grad_norm": 1.473201870918274, "learning_rate": 2.973168719747622e-05, "loss": 0.6196, "num_input_tokens_seen": 38620776, "step": 66530 }, { "epoch": 9.909889782543939, "grad_norm": 1.248361587524414, "learning_rate": 2.9728496467513734e-05, "loss": 0.5457, "num_input_tokens_seen": 38623720, "step": 66535 }, { "epoch": 9.910634495084897, "grad_norm": 1.6851561069488525, "learning_rate": 2.9725305657668102e-05, "loss": 0.6204, "num_input_tokens_seen": 38626664, "step": 66540 }, { "epoch": 9.911379207625856, "grad_norm": 0.9430568814277649, "learning_rate": 2.9722114767993226e-05, "loss": 0.5673, "num_input_tokens_seen": 38629736, "step": 66545 }, { "epoch": 9.912123920166815, "grad_norm": 1.5394258499145508, "learning_rate": 2.971892379854303e-05, "loss": 0.5481, "num_input_tokens_seen": 38632616, "step": 66550 }, { "epoch": 9.912868632707776, "grad_norm": 2.16778564453125, "learning_rate": 2.9715732749371412e-05, "loss": 0.5756, "num_input_tokens_seen": 38635176, "step": 66555 }, { "epoch": 9.913613345248734, "grad_norm": 1.228502869606018, "learning_rate": 2.971254162053228e-05, "loss": 0.5321, "num_input_tokens_seen": 38637928, "step": 66560 }, { "epoch": 9.914358057789693, "grad_norm": 1.2498875856399536, "learning_rate": 2.9709350412079544e-05, "loss": 0.5745, "num_input_tokens_seen": 38640776, "step": 66565 }, { "epoch": 9.915102770330652, "grad_norm": 3.0680952072143555, "learning_rate": 2.9706159124067123e-05, "loss": 0.6035, "num_input_tokens_seen": 38643720, "step": 66570 }, { "epoch": 9.915847482871612, "grad_norm": 0.7723557353019714, "learning_rate": 2.9702967756548927e-05, "loss": 0.4653, "num_input_tokens_seen": 38646600, "step": 66575 }, { "epoch": 9.916592195412571, "grad_norm": 3.8306360244750977, "learning_rate": 2.969977630957887e-05, "loss": 0.7683, "num_input_tokens_seen": 38649512, "step": 66580 }, { "epoch": 9.91733690795353, "grad_norm": 1.404321312904358, "learning_rate": 2.9696584783210874e-05, "loss": 0.6007, "num_input_tokens_seen": 38652200, "step": 66585 }, { "epoch": 9.918081620494489, "grad_norm": 1.2893060445785522, "learning_rate": 2.969339317749884e-05, "loss": 0.7468, "num_input_tokens_seen": 38654984, "step": 66590 }, { "epoch": 9.91882633303545, "grad_norm": 1.0705013275146484, "learning_rate": 2.9690201492496704e-05, "loss": 0.6556, "num_input_tokens_seen": 38658120, "step": 66595 }, { "epoch": 9.919571045576408, "grad_norm": 1.3745644092559814, "learning_rate": 2.968700972825838e-05, "loss": 0.4531, "num_input_tokens_seen": 38661096, "step": 66600 }, { "epoch": 9.920315758117367, "grad_norm": 1.320779800415039, "learning_rate": 2.9683817884837788e-05, "loss": 0.8081, "num_input_tokens_seen": 38663880, "step": 66605 }, { "epoch": 9.921060470658325, "grad_norm": 1.7083262205123901, "learning_rate": 2.9680625962288856e-05, "loss": 0.667, "num_input_tokens_seen": 38667208, "step": 66610 }, { "epoch": 9.921805183199286, "grad_norm": 1.3575077056884766, "learning_rate": 2.9677433960665512e-05, "loss": 0.5885, "num_input_tokens_seen": 38669928, "step": 66615 }, { "epoch": 9.922549895740245, "grad_norm": 1.2654515504837036, "learning_rate": 2.967424188002167e-05, "loss": 0.5385, "num_input_tokens_seen": 38672840, "step": 66620 }, { "epoch": 9.923294608281203, "grad_norm": 1.602513313293457, "learning_rate": 2.967104972041126e-05, "loss": 0.61, "num_input_tokens_seen": 38675688, "step": 66625 }, { "epoch": 9.924039320822162, "grad_norm": 0.9253864884376526, "learning_rate": 2.9667857481888218e-05, "loss": 0.4963, "num_input_tokens_seen": 38678568, "step": 66630 }, { "epoch": 9.924784033363121, "grad_norm": 1.7269513607025146, "learning_rate": 2.9664665164506455e-05, "loss": 0.7166, "num_input_tokens_seen": 38681288, "step": 66635 }, { "epoch": 9.925528745904082, "grad_norm": 1.305349349975586, "learning_rate": 2.9661472768319924e-05, "loss": 0.5253, "num_input_tokens_seen": 38683880, "step": 66640 }, { "epoch": 9.92627345844504, "grad_norm": 1.0041961669921875, "learning_rate": 2.9658280293382545e-05, "loss": 0.5625, "num_input_tokens_seen": 38686792, "step": 66645 }, { "epoch": 9.927018170985999, "grad_norm": 1.4274282455444336, "learning_rate": 2.9655087739748267e-05, "loss": 0.5224, "num_input_tokens_seen": 38689512, "step": 66650 }, { "epoch": 9.927762883526958, "grad_norm": 0.8652595281600952, "learning_rate": 2.9651895107471004e-05, "loss": 0.4169, "num_input_tokens_seen": 38692744, "step": 66655 }, { "epoch": 9.928507596067918, "grad_norm": 0.854693591594696, "learning_rate": 2.96487023966047e-05, "loss": 0.5521, "num_input_tokens_seen": 38695816, "step": 66660 }, { "epoch": 9.929252308608877, "grad_norm": 2.137963056564331, "learning_rate": 2.9645509607203294e-05, "loss": 0.6109, "num_input_tokens_seen": 38698792, "step": 66665 }, { "epoch": 9.929997021149836, "grad_norm": 1.3130772113800049, "learning_rate": 2.9642316739320724e-05, "loss": 0.5709, "num_input_tokens_seen": 38701608, "step": 66670 }, { "epoch": 9.930741733690795, "grad_norm": 1.403039813041687, "learning_rate": 2.9639123793010933e-05, "loss": 0.5076, "num_input_tokens_seen": 38704424, "step": 66675 }, { "epoch": 9.931486446231755, "grad_norm": 1.3233585357666016, "learning_rate": 2.9635930768327856e-05, "loss": 0.9449, "num_input_tokens_seen": 38707400, "step": 66680 }, { "epoch": 9.932231158772714, "grad_norm": 2.3972790241241455, "learning_rate": 2.963273766532545e-05, "loss": 0.7264, "num_input_tokens_seen": 38710248, "step": 66685 }, { "epoch": 9.932975871313673, "grad_norm": 1.454833745956421, "learning_rate": 2.962954448405764e-05, "loss": 0.5067, "num_input_tokens_seen": 38713096, "step": 66690 }, { "epoch": 9.933720583854631, "grad_norm": 1.3945121765136719, "learning_rate": 2.9626351224578386e-05, "loss": 0.7478, "num_input_tokens_seen": 38716328, "step": 66695 }, { "epoch": 9.934465296395592, "grad_norm": 2.4876885414123535, "learning_rate": 2.9623157886941633e-05, "loss": 0.6796, "num_input_tokens_seen": 38719400, "step": 66700 }, { "epoch": 9.93521000893655, "grad_norm": 1.5772614479064941, "learning_rate": 2.961996447120132e-05, "loss": 0.4379, "num_input_tokens_seen": 38722344, "step": 66705 }, { "epoch": 9.93595472147751, "grad_norm": 1.3576263189315796, "learning_rate": 2.9616770977411408e-05, "loss": 0.6106, "num_input_tokens_seen": 38725192, "step": 66710 }, { "epoch": 9.936699434018468, "grad_norm": 1.781828761100769, "learning_rate": 2.9613577405625838e-05, "loss": 0.7782, "num_input_tokens_seen": 38728072, "step": 66715 }, { "epoch": 9.937444146559429, "grad_norm": 1.4219013452529907, "learning_rate": 2.961038375589857e-05, "loss": 0.5176, "num_input_tokens_seen": 38730856, "step": 66720 }, { "epoch": 9.938188859100388, "grad_norm": 1.4068272113800049, "learning_rate": 2.9607190028283548e-05, "loss": 0.637, "num_input_tokens_seen": 38733640, "step": 66725 }, { "epoch": 9.938933571641346, "grad_norm": 1.523262619972229, "learning_rate": 2.960399622283474e-05, "loss": 0.5855, "num_input_tokens_seen": 38736488, "step": 66730 }, { "epoch": 9.939678284182305, "grad_norm": 2.712205410003662, "learning_rate": 2.960080233960609e-05, "loss": 0.7508, "num_input_tokens_seen": 38739816, "step": 66735 }, { "epoch": 9.940422996723266, "grad_norm": 1.759966492652893, "learning_rate": 2.959760837865157e-05, "loss": 0.731, "num_input_tokens_seen": 38742760, "step": 66740 }, { "epoch": 9.941167709264224, "grad_norm": 1.4337265491485596, "learning_rate": 2.9594414340025118e-05, "loss": 0.5195, "num_input_tokens_seen": 38745288, "step": 66745 }, { "epoch": 9.941912421805183, "grad_norm": 1.4220932722091675, "learning_rate": 2.9591220223780714e-05, "loss": 0.5408, "num_input_tokens_seen": 38747944, "step": 66750 }, { "epoch": 9.942657134346142, "grad_norm": 2.444946765899658, "learning_rate": 2.9588026029972305e-05, "loss": 0.7148, "num_input_tokens_seen": 38750792, "step": 66755 }, { "epoch": 9.943401846887102, "grad_norm": 1.1001417636871338, "learning_rate": 2.9584831758653865e-05, "loss": 0.5713, "num_input_tokens_seen": 38753704, "step": 66760 }, { "epoch": 9.944146559428061, "grad_norm": 1.130600094795227, "learning_rate": 2.9581637409879344e-05, "loss": 0.5652, "num_input_tokens_seen": 38756744, "step": 66765 }, { "epoch": 9.94489127196902, "grad_norm": 0.907679557800293, "learning_rate": 2.9578442983702716e-05, "loss": 0.5613, "num_input_tokens_seen": 38759656, "step": 66770 }, { "epoch": 9.945635984509979, "grad_norm": 1.4741723537445068, "learning_rate": 2.9575248480177952e-05, "loss": 0.6107, "num_input_tokens_seen": 38763720, "step": 66775 }, { "epoch": 9.946380697050937, "grad_norm": 0.9182102084159851, "learning_rate": 2.9572053899359013e-05, "loss": 0.4936, "num_input_tokens_seen": 38766376, "step": 66780 }, { "epoch": 9.947125409591898, "grad_norm": 1.4255449771881104, "learning_rate": 2.9568859241299878e-05, "loss": 0.689, "num_input_tokens_seen": 38769512, "step": 66785 }, { "epoch": 9.947870122132857, "grad_norm": 2.2568867206573486, "learning_rate": 2.9565664506054503e-05, "loss": 0.6835, "num_input_tokens_seen": 38772648, "step": 66790 }, { "epoch": 9.948614834673815, "grad_norm": 1.9167166948318481, "learning_rate": 2.9562469693676865e-05, "loss": 0.8042, "num_input_tokens_seen": 38775528, "step": 66795 }, { "epoch": 9.949359547214776, "grad_norm": 1.8149876594543457, "learning_rate": 2.9559274804220936e-05, "loss": 0.7133, "num_input_tokens_seen": 38778568, "step": 66800 }, { "epoch": 9.950104259755735, "grad_norm": 1.311571478843689, "learning_rate": 2.9556079837740697e-05, "loss": 0.6515, "num_input_tokens_seen": 38781384, "step": 66805 }, { "epoch": 9.950848972296694, "grad_norm": 3.047034740447998, "learning_rate": 2.9552884794290116e-05, "loss": 0.5875, "num_input_tokens_seen": 38784040, "step": 66810 }, { "epoch": 9.951593684837652, "grad_norm": 0.9270668625831604, "learning_rate": 2.954968967392318e-05, "loss": 0.5988, "num_input_tokens_seen": 38787176, "step": 66815 }, { "epoch": 9.952338397378611, "grad_norm": 1.0136257410049438, "learning_rate": 2.9546494476693865e-05, "loss": 0.4684, "num_input_tokens_seen": 38789992, "step": 66820 }, { "epoch": 9.953083109919572, "grad_norm": 1.2631666660308838, "learning_rate": 2.954329920265614e-05, "loss": 0.6519, "num_input_tokens_seen": 38793320, "step": 66825 }, { "epoch": 9.95382782246053, "grad_norm": 2.2685110569000244, "learning_rate": 2.9540103851863986e-05, "loss": 0.6943, "num_input_tokens_seen": 38796200, "step": 66830 }, { "epoch": 9.954572535001489, "grad_norm": 1.9874258041381836, "learning_rate": 2.95369084243714e-05, "loss": 0.4058, "num_input_tokens_seen": 38799272, "step": 66835 }, { "epoch": 9.955317247542448, "grad_norm": 0.9024220705032349, "learning_rate": 2.9533712920232353e-05, "loss": 0.4361, "num_input_tokens_seen": 38802024, "step": 66840 }, { "epoch": 9.956061960083408, "grad_norm": 1.4155182838439941, "learning_rate": 2.9530517339500835e-05, "loss": 0.6119, "num_input_tokens_seen": 38804776, "step": 66845 }, { "epoch": 9.956806672624367, "grad_norm": 2.133333921432495, "learning_rate": 2.952732168223084e-05, "loss": 0.578, "num_input_tokens_seen": 38807688, "step": 66850 }, { "epoch": 9.957551385165326, "grad_norm": 1.837371826171875, "learning_rate": 2.952412594847634e-05, "loss": 0.8262, "num_input_tokens_seen": 38810280, "step": 66855 }, { "epoch": 9.958296097706285, "grad_norm": 1.1596485376358032, "learning_rate": 2.952093013829133e-05, "loss": 0.8076, "num_input_tokens_seen": 38813128, "step": 66860 }, { "epoch": 9.959040810247245, "grad_norm": 0.6796662211418152, "learning_rate": 2.95177342517298e-05, "loss": 0.5924, "num_input_tokens_seen": 38816072, "step": 66865 }, { "epoch": 9.959785522788204, "grad_norm": 1.8491846323013306, "learning_rate": 2.951453828884574e-05, "loss": 0.7443, "num_input_tokens_seen": 38819240, "step": 66870 }, { "epoch": 9.960530235329163, "grad_norm": 1.8342283964157104, "learning_rate": 2.951134224969314e-05, "loss": 0.5444, "num_input_tokens_seen": 38821928, "step": 66875 }, { "epoch": 9.961274947870121, "grad_norm": 1.4464001655578613, "learning_rate": 2.9508146134326004e-05, "loss": 0.6901, "num_input_tokens_seen": 38825032, "step": 66880 }, { "epoch": 9.962019660411082, "grad_norm": 1.3164396286010742, "learning_rate": 2.950494994279832e-05, "loss": 0.5774, "num_input_tokens_seen": 38827656, "step": 66885 }, { "epoch": 9.96276437295204, "grad_norm": 0.9666523933410645, "learning_rate": 2.950175367516409e-05, "loss": 0.6, "num_input_tokens_seen": 38830568, "step": 66890 }, { "epoch": 9.963509085493, "grad_norm": 2.133125066757202, "learning_rate": 2.94985573314773e-05, "loss": 0.561, "num_input_tokens_seen": 38833640, "step": 66895 }, { "epoch": 9.964253798033958, "grad_norm": 1.186772346496582, "learning_rate": 2.949536091179196e-05, "loss": 0.7655, "num_input_tokens_seen": 38836456, "step": 66900 }, { "epoch": 9.964998510574919, "grad_norm": 1.2921497821807861, "learning_rate": 2.9492164416162066e-05, "loss": 0.7009, "num_input_tokens_seen": 38839208, "step": 66905 }, { "epoch": 9.965743223115878, "grad_norm": 1.631083607673645, "learning_rate": 2.9488967844641612e-05, "loss": 0.5206, "num_input_tokens_seen": 38841992, "step": 66910 }, { "epoch": 9.966487935656836, "grad_norm": 1.7914390563964844, "learning_rate": 2.9485771197284625e-05, "loss": 0.5857, "num_input_tokens_seen": 38844808, "step": 66915 }, { "epoch": 9.967232648197795, "grad_norm": 2.0277559757232666, "learning_rate": 2.948257447414508e-05, "loss": 0.7068, "num_input_tokens_seen": 38847656, "step": 66920 }, { "epoch": 9.967977360738756, "grad_norm": 3.037587881088257, "learning_rate": 2.9479377675276998e-05, "loss": 0.6196, "num_input_tokens_seen": 38850696, "step": 66925 }, { "epoch": 9.968722073279714, "grad_norm": 0.9788233637809753, "learning_rate": 2.9476180800734376e-05, "loss": 0.6911, "num_input_tokens_seen": 38854056, "step": 66930 }, { "epoch": 9.969466785820673, "grad_norm": 1.2605847120285034, "learning_rate": 2.9472983850571235e-05, "loss": 0.5607, "num_input_tokens_seen": 38857032, "step": 66935 }, { "epoch": 9.970211498361632, "grad_norm": 0.6194881796836853, "learning_rate": 2.9469786824841584e-05, "loss": 0.5787, "num_input_tokens_seen": 38860136, "step": 66940 }, { "epoch": 9.970956210902592, "grad_norm": 1.1959729194641113, "learning_rate": 2.946658972359942e-05, "loss": 0.6078, "num_input_tokens_seen": 38863048, "step": 66945 }, { "epoch": 9.971700923443551, "grad_norm": 1.460817575454712, "learning_rate": 2.946339254689877e-05, "loss": 0.672, "num_input_tokens_seen": 38865992, "step": 66950 }, { "epoch": 9.97244563598451, "grad_norm": 1.0655789375305176, "learning_rate": 2.946019529479363e-05, "loss": 0.5041, "num_input_tokens_seen": 38869032, "step": 66955 }, { "epoch": 9.973190348525469, "grad_norm": 1.2859621047973633, "learning_rate": 2.945699796733803e-05, "loss": 0.6255, "num_input_tokens_seen": 38871944, "step": 66960 }, { "epoch": 9.973935061066427, "grad_norm": 2.7655887603759766, "learning_rate": 2.945380056458597e-05, "loss": 0.4973, "num_input_tokens_seen": 38874408, "step": 66965 }, { "epoch": 9.974679773607388, "grad_norm": 1.0539112091064453, "learning_rate": 2.9450603086591484e-05, "loss": 0.5878, "num_input_tokens_seen": 38877256, "step": 66970 }, { "epoch": 9.975424486148347, "grad_norm": 1.0947083234786987, "learning_rate": 2.944740553340858e-05, "loss": 0.5079, "num_input_tokens_seen": 38879912, "step": 66975 }, { "epoch": 9.976169198689306, "grad_norm": 3.0070908069610596, "learning_rate": 2.944420790509128e-05, "loss": 0.7568, "num_input_tokens_seen": 38882600, "step": 66980 }, { "epoch": 9.976913911230266, "grad_norm": 1.7628895044326782, "learning_rate": 2.9441010201693614e-05, "loss": 0.5764, "num_input_tokens_seen": 38885672, "step": 66985 }, { "epoch": 9.977658623771225, "grad_norm": 1.5594042539596558, "learning_rate": 2.9437812423269585e-05, "loss": 0.5776, "num_input_tokens_seen": 38888808, "step": 66990 }, { "epoch": 9.978403336312184, "grad_norm": 0.8609879016876221, "learning_rate": 2.943461456987322e-05, "loss": 0.6574, "num_input_tokens_seen": 38891816, "step": 66995 }, { "epoch": 9.979148048853142, "grad_norm": 1.0723590850830078, "learning_rate": 2.9431416641558558e-05, "loss": 0.567, "num_input_tokens_seen": 38894728, "step": 67000 }, { "epoch": 9.979892761394101, "grad_norm": 0.8226540088653564, "learning_rate": 2.9428218638379608e-05, "loss": 0.566, "num_input_tokens_seen": 38897864, "step": 67005 }, { "epoch": 9.980637473935062, "grad_norm": 1.4075160026550293, "learning_rate": 2.942502056039041e-05, "loss": 0.723, "num_input_tokens_seen": 38901096, "step": 67010 }, { "epoch": 9.98138218647602, "grad_norm": 2.0471956729888916, "learning_rate": 2.9421822407644987e-05, "loss": 0.7105, "num_input_tokens_seen": 38904040, "step": 67015 }, { "epoch": 9.98212689901698, "grad_norm": 1.5799835920333862, "learning_rate": 2.9418624180197375e-05, "loss": 0.6638, "num_input_tokens_seen": 38906888, "step": 67020 }, { "epoch": 9.982871611557938, "grad_norm": 2.3263955116271973, "learning_rate": 2.941542587810159e-05, "loss": 0.5185, "num_input_tokens_seen": 38909864, "step": 67025 }, { "epoch": 9.983616324098898, "grad_norm": 3.5359840393066406, "learning_rate": 2.941222750141167e-05, "loss": 0.7396, "num_input_tokens_seen": 38912616, "step": 67030 }, { "epoch": 9.984361036639857, "grad_norm": 1.0024433135986328, "learning_rate": 2.9409029050181652e-05, "loss": 0.609, "num_input_tokens_seen": 38915368, "step": 67035 }, { "epoch": 9.985105749180816, "grad_norm": 1.0611846446990967, "learning_rate": 2.9405830524465573e-05, "loss": 0.4859, "num_input_tokens_seen": 38918312, "step": 67040 }, { "epoch": 9.985850461721775, "grad_norm": 1.3076491355895996, "learning_rate": 2.940263192431746e-05, "loss": 0.6776, "num_input_tokens_seen": 38921320, "step": 67045 }, { "epoch": 9.986595174262735, "grad_norm": 1.115376591682434, "learning_rate": 2.9399433249791363e-05, "loss": 0.4531, "num_input_tokens_seen": 38924360, "step": 67050 }, { "epoch": 9.987339886803694, "grad_norm": 1.4886891841888428, "learning_rate": 2.9396234500941307e-05, "loss": 0.7487, "num_input_tokens_seen": 38927336, "step": 67055 }, { "epoch": 9.988084599344653, "grad_norm": 0.9985468983650208, "learning_rate": 2.939303567782134e-05, "loss": 0.559, "num_input_tokens_seen": 38930056, "step": 67060 }, { "epoch": 9.988829311885612, "grad_norm": 0.8723289966583252, "learning_rate": 2.9389836780485502e-05, "loss": 0.5449, "num_input_tokens_seen": 38932904, "step": 67065 }, { "epoch": 9.989574024426572, "grad_norm": 1.4071273803710938, "learning_rate": 2.9386637808987828e-05, "loss": 0.6104, "num_input_tokens_seen": 38935912, "step": 67070 }, { "epoch": 9.99031873696753, "grad_norm": 1.429028868675232, "learning_rate": 2.9383438763382363e-05, "loss": 0.7236, "num_input_tokens_seen": 38938920, "step": 67075 }, { "epoch": 9.99106344950849, "grad_norm": 1.3550400733947754, "learning_rate": 2.9380239643723167e-05, "loss": 0.5274, "num_input_tokens_seen": 38941672, "step": 67080 }, { "epoch": 9.991808162049448, "grad_norm": 3.552730083465576, "learning_rate": 2.9377040450064268e-05, "loss": 0.8637, "num_input_tokens_seen": 38945064, "step": 67085 }, { "epoch": 9.992552874590409, "grad_norm": 1.9961223602294922, "learning_rate": 2.9373841182459715e-05, "loss": 0.5829, "num_input_tokens_seen": 38947816, "step": 67090 }, { "epoch": 9.993297587131368, "grad_norm": 2.274061918258667, "learning_rate": 2.9370641840963565e-05, "loss": 0.6211, "num_input_tokens_seen": 38951048, "step": 67095 }, { "epoch": 9.994042299672326, "grad_norm": 1.10187566280365, "learning_rate": 2.9367442425629866e-05, "loss": 0.6657, "num_input_tokens_seen": 38953864, "step": 67100 }, { "epoch": 9.994787012213285, "grad_norm": 2.120513916015625, "learning_rate": 2.9364242936512665e-05, "loss": 0.6629, "num_input_tokens_seen": 38956616, "step": 67105 }, { "epoch": 9.995531724754246, "grad_norm": 0.5553563237190247, "learning_rate": 2.936104337366601e-05, "loss": 0.5138, "num_input_tokens_seen": 38959272, "step": 67110 }, { "epoch": 9.996276437295204, "grad_norm": 1.3200732469558716, "learning_rate": 2.935784373714397e-05, "loss": 0.3737, "num_input_tokens_seen": 38961960, "step": 67115 }, { "epoch": 9.997021149836163, "grad_norm": 3.3492672443389893, "learning_rate": 2.9354644027000577e-05, "loss": 0.8623, "num_input_tokens_seen": 38965192, "step": 67120 }, { "epoch": 9.997765862377122, "grad_norm": 1.1755489110946655, "learning_rate": 2.9351444243289904e-05, "loss": 0.7847, "num_input_tokens_seen": 38968424, "step": 67125 }, { "epoch": 9.998510574918082, "grad_norm": 0.9483610987663269, "learning_rate": 2.9348244386066005e-05, "loss": 0.5721, "num_input_tokens_seen": 38971208, "step": 67130 }, { "epoch": 9.999255287459041, "grad_norm": 1.221453309059143, "learning_rate": 2.9345044455382932e-05, "loss": 0.5561, "num_input_tokens_seen": 38973960, "step": 67135 }, { "epoch": 10.0, "grad_norm": 3.2319910526275635, "learning_rate": 2.9341844451294754e-05, "loss": 0.7712, "num_input_tokens_seen": 38976336, "step": 67140 }, { "epoch": 10.0, "eval_loss": 0.6526756286621094, "eval_runtime": 47.0569, "eval_samples_per_second": 63.413, "eval_steps_per_second": 15.853, "num_input_tokens_seen": 38976336, "step": 67140 }, { "epoch": 10.000744712540959, "grad_norm": 1.6175518035888672, "learning_rate": 2.9338644373855522e-05, "loss": 0.5217, "num_input_tokens_seen": 38979152, "step": 67145 }, { "epoch": 10.001489425081918, "grad_norm": 1.568398118019104, "learning_rate": 2.9335444223119314e-05, "loss": 0.8474, "num_input_tokens_seen": 38982416, "step": 67150 }, { "epoch": 10.002234137622878, "grad_norm": 1.1526615619659424, "learning_rate": 2.9332243999140167e-05, "loss": 0.5736, "num_input_tokens_seen": 38985136, "step": 67155 }, { "epoch": 10.002978850163837, "grad_norm": 2.9479169845581055, "learning_rate": 2.932904370197217e-05, "loss": 0.5849, "num_input_tokens_seen": 38988048, "step": 67160 }, { "epoch": 10.003723562704796, "grad_norm": 1.2641793489456177, "learning_rate": 2.932584333166937e-05, "loss": 0.5967, "num_input_tokens_seen": 38991152, "step": 67165 }, { "epoch": 10.004468275245754, "grad_norm": 1.1666297912597656, "learning_rate": 2.9322642888285855e-05, "loss": 0.4682, "num_input_tokens_seen": 38994256, "step": 67170 }, { "epoch": 10.005212987786715, "grad_norm": 1.084997296333313, "learning_rate": 2.931944237187567e-05, "loss": 0.6367, "num_input_tokens_seen": 38996976, "step": 67175 }, { "epoch": 10.005957700327674, "grad_norm": 1.542230248451233, "learning_rate": 2.931624178249291e-05, "loss": 0.9145, "num_input_tokens_seen": 38999792, "step": 67180 }, { "epoch": 10.006702412868632, "grad_norm": 1.080108642578125, "learning_rate": 2.931304112019163e-05, "loss": 0.6487, "num_input_tokens_seen": 39003152, "step": 67185 }, { "epoch": 10.007447125409591, "grad_norm": 1.1524885892868042, "learning_rate": 2.93098403850259e-05, "loss": 0.6983, "num_input_tokens_seen": 39005840, "step": 67190 }, { "epoch": 10.008191837950552, "grad_norm": 1.2317708730697632, "learning_rate": 2.9306639577049793e-05, "loss": 0.4912, "num_input_tokens_seen": 39008496, "step": 67195 }, { "epoch": 10.00893655049151, "grad_norm": 2.0395538806915283, "learning_rate": 2.9303438696317385e-05, "loss": 0.5331, "num_input_tokens_seen": 39011504, "step": 67200 }, { "epoch": 10.00968126303247, "grad_norm": 1.605465292930603, "learning_rate": 2.9300237742882764e-05, "loss": 0.7148, "num_input_tokens_seen": 39014608, "step": 67205 }, { "epoch": 10.010425975573428, "grad_norm": 0.9031603932380676, "learning_rate": 2.929703671679999e-05, "loss": 0.5516, "num_input_tokens_seen": 39017648, "step": 67210 }, { "epoch": 10.011170688114388, "grad_norm": 1.3739523887634277, "learning_rate": 2.9293835618123157e-05, "loss": 0.6668, "num_input_tokens_seen": 39020528, "step": 67215 }, { "epoch": 10.011915400655347, "grad_norm": 1.2905625104904175, "learning_rate": 2.929063444690633e-05, "loss": 0.6123, "num_input_tokens_seen": 39023568, "step": 67220 }, { "epoch": 10.012660113196306, "grad_norm": 0.8676595687866211, "learning_rate": 2.9287433203203598e-05, "loss": 0.5504, "num_input_tokens_seen": 39026640, "step": 67225 }, { "epoch": 10.013404825737265, "grad_norm": 1.0148673057556152, "learning_rate": 2.928423188706903e-05, "loss": 0.5651, "num_input_tokens_seen": 39029360, "step": 67230 }, { "epoch": 10.014149538278225, "grad_norm": 0.987930417060852, "learning_rate": 2.9281030498556723e-05, "loss": 0.5698, "num_input_tokens_seen": 39032528, "step": 67235 }, { "epoch": 10.014894250819184, "grad_norm": 1.1065871715545654, "learning_rate": 2.9277829037720754e-05, "loss": 0.4921, "num_input_tokens_seen": 39035600, "step": 67240 }, { "epoch": 10.015638963360143, "grad_norm": 2.0676300525665283, "learning_rate": 2.927462750461522e-05, "loss": 0.6865, "num_input_tokens_seen": 39038416, "step": 67245 }, { "epoch": 10.016383675901102, "grad_norm": 1.5257304906845093, "learning_rate": 2.9271425899294193e-05, "loss": 0.6304, "num_input_tokens_seen": 39041136, "step": 67250 }, { "epoch": 10.017128388442062, "grad_norm": 1.5549875497817993, "learning_rate": 2.9268224221811763e-05, "loss": 0.7326, "num_input_tokens_seen": 39044240, "step": 67255 }, { "epoch": 10.01787310098302, "grad_norm": 2.074019193649292, "learning_rate": 2.9265022472222032e-05, "loss": 0.7447, "num_input_tokens_seen": 39047216, "step": 67260 }, { "epoch": 10.01861781352398, "grad_norm": 2.6718780994415283, "learning_rate": 2.9261820650579074e-05, "loss": 0.6471, "num_input_tokens_seen": 39050032, "step": 67265 }, { "epoch": 10.019362526064938, "grad_norm": 1.2342182397842407, "learning_rate": 2.925861875693699e-05, "loss": 0.493, "num_input_tokens_seen": 39053104, "step": 67270 }, { "epoch": 10.020107238605899, "grad_norm": 2.808748960494995, "learning_rate": 2.9255416791349867e-05, "loss": 0.7175, "num_input_tokens_seen": 39056080, "step": 67275 }, { "epoch": 10.020851951146858, "grad_norm": 1.4944578409194946, "learning_rate": 2.925221475387181e-05, "loss": 0.4776, "num_input_tokens_seen": 39058800, "step": 67280 }, { "epoch": 10.021596663687816, "grad_norm": 1.164030909538269, "learning_rate": 2.92490126445569e-05, "loss": 0.589, "num_input_tokens_seen": 39061520, "step": 67285 }, { "epoch": 10.022341376228775, "grad_norm": 1.694649577140808, "learning_rate": 2.9245810463459245e-05, "loss": 0.7511, "num_input_tokens_seen": 39064400, "step": 67290 }, { "epoch": 10.023086088769736, "grad_norm": 1.0486420392990112, "learning_rate": 2.9242608210632932e-05, "loss": 0.5208, "num_input_tokens_seen": 39067120, "step": 67295 }, { "epoch": 10.023830801310694, "grad_norm": 1.2561553716659546, "learning_rate": 2.9239405886132066e-05, "loss": 0.7144, "num_input_tokens_seen": 39070160, "step": 67300 }, { "epoch": 10.024575513851653, "grad_norm": 1.5046484470367432, "learning_rate": 2.923620349001075e-05, "loss": 0.5375, "num_input_tokens_seen": 39073200, "step": 67305 }, { "epoch": 10.025320226392612, "grad_norm": 1.1551775932312012, "learning_rate": 2.923300102232308e-05, "loss": 0.6042, "num_input_tokens_seen": 39075824, "step": 67310 }, { "epoch": 10.02606493893357, "grad_norm": 2.2498366832733154, "learning_rate": 2.9229798483123162e-05, "loss": 0.5422, "num_input_tokens_seen": 39078544, "step": 67315 }, { "epoch": 10.026809651474531, "grad_norm": 2.5510191917419434, "learning_rate": 2.9226595872465097e-05, "loss": 0.7326, "num_input_tokens_seen": 39081328, "step": 67320 }, { "epoch": 10.02755436401549, "grad_norm": 1.7511404752731323, "learning_rate": 2.922339319040298e-05, "loss": 0.6262, "num_input_tokens_seen": 39084272, "step": 67325 }, { "epoch": 10.028299076556449, "grad_norm": 1.4094220399856567, "learning_rate": 2.922019043699094e-05, "loss": 0.8283, "num_input_tokens_seen": 39087344, "step": 67330 }, { "epoch": 10.029043789097408, "grad_norm": 1.86165452003479, "learning_rate": 2.9216987612283064e-05, "loss": 0.8048, "num_input_tokens_seen": 39090128, "step": 67335 }, { "epoch": 10.029788501638368, "grad_norm": 1.2665718793869019, "learning_rate": 2.921378471633347e-05, "loss": 0.6119, "num_input_tokens_seen": 39093200, "step": 67340 }, { "epoch": 10.030533214179327, "grad_norm": 0.7679821848869324, "learning_rate": 2.9210581749196274e-05, "loss": 0.4079, "num_input_tokens_seen": 39096464, "step": 67345 }, { "epoch": 10.031277926720286, "grad_norm": 1.9866799116134644, "learning_rate": 2.9207378710925575e-05, "loss": 0.5277, "num_input_tokens_seen": 39099664, "step": 67350 }, { "epoch": 10.032022639261244, "grad_norm": 2.0278656482696533, "learning_rate": 2.920417560157549e-05, "loss": 0.4597, "num_input_tokens_seen": 39102736, "step": 67355 }, { "epoch": 10.032767351802205, "grad_norm": 1.6418551206588745, "learning_rate": 2.9200972421200124e-05, "loss": 0.6921, "num_input_tokens_seen": 39105872, "step": 67360 }, { "epoch": 10.033512064343164, "grad_norm": 1.4458303451538086, "learning_rate": 2.91977691698536e-05, "loss": 0.4626, "num_input_tokens_seen": 39108720, "step": 67365 }, { "epoch": 10.034256776884122, "grad_norm": 2.2900686264038086, "learning_rate": 2.919456584759003e-05, "loss": 0.666, "num_input_tokens_seen": 39111504, "step": 67370 }, { "epoch": 10.035001489425081, "grad_norm": 1.4252245426177979, "learning_rate": 2.919136245446354e-05, "loss": 0.62, "num_input_tokens_seen": 39114416, "step": 67375 }, { "epoch": 10.035746201966042, "grad_norm": 1.0587972402572632, "learning_rate": 2.918815899052824e-05, "loss": 0.7056, "num_input_tokens_seen": 39117360, "step": 67380 }, { "epoch": 10.036490914507, "grad_norm": 2.3132128715515137, "learning_rate": 2.9184955455838258e-05, "loss": 0.7341, "num_input_tokens_seen": 39120464, "step": 67385 }, { "epoch": 10.03723562704796, "grad_norm": 1.8265036344528198, "learning_rate": 2.9181751850447698e-05, "loss": 0.5646, "num_input_tokens_seen": 39123376, "step": 67390 }, { "epoch": 10.037980339588918, "grad_norm": 2.7994587421417236, "learning_rate": 2.9178548174410687e-05, "loss": 0.6584, "num_input_tokens_seen": 39126224, "step": 67395 }, { "epoch": 10.038725052129879, "grad_norm": 2.2339870929718018, "learning_rate": 2.9175344427781354e-05, "loss": 0.5337, "num_input_tokens_seen": 39129072, "step": 67400 }, { "epoch": 10.039469764670837, "grad_norm": 1.0310487747192383, "learning_rate": 2.9172140610613825e-05, "loss": 0.6299, "num_input_tokens_seen": 39131952, "step": 67405 }, { "epoch": 10.040214477211796, "grad_norm": 2.102932929992676, "learning_rate": 2.916893672296222e-05, "loss": 0.6391, "num_input_tokens_seen": 39134864, "step": 67410 }, { "epoch": 10.040959189752755, "grad_norm": 0.5769741535186768, "learning_rate": 2.916573276488066e-05, "loss": 0.5432, "num_input_tokens_seen": 39137552, "step": 67415 }, { "epoch": 10.041703902293715, "grad_norm": 2.1343255043029785, "learning_rate": 2.9162528736423283e-05, "loss": 0.5216, "num_input_tokens_seen": 39140720, "step": 67420 }, { "epoch": 10.042448614834674, "grad_norm": 0.9540033936500549, "learning_rate": 2.915932463764422e-05, "loss": 0.6562, "num_input_tokens_seen": 39143856, "step": 67425 }, { "epoch": 10.043193327375633, "grad_norm": 1.6641219854354858, "learning_rate": 2.9156120468597588e-05, "loss": 0.7205, "num_input_tokens_seen": 39146544, "step": 67430 }, { "epoch": 10.043938039916592, "grad_norm": 0.7275426983833313, "learning_rate": 2.9152916229337525e-05, "loss": 0.5946, "num_input_tokens_seen": 39149296, "step": 67435 }, { "epoch": 10.044682752457552, "grad_norm": 1.248292088508606, "learning_rate": 2.9149711919918154e-05, "loss": 0.5683, "num_input_tokens_seen": 39152336, "step": 67440 }, { "epoch": 10.045427464998511, "grad_norm": 1.3949215412139893, "learning_rate": 2.9146507540393636e-05, "loss": 0.4834, "num_input_tokens_seen": 39155440, "step": 67445 }, { "epoch": 10.04617217753947, "grad_norm": 1.5905598402023315, "learning_rate": 2.9143303090818074e-05, "loss": 0.7501, "num_input_tokens_seen": 39158480, "step": 67450 }, { "epoch": 10.046916890080428, "grad_norm": 1.2828539609909058, "learning_rate": 2.9140098571245623e-05, "loss": 0.5316, "num_input_tokens_seen": 39161712, "step": 67455 }, { "epoch": 10.047661602621389, "grad_norm": 2.2882895469665527, "learning_rate": 2.9136893981730406e-05, "loss": 0.8047, "num_input_tokens_seen": 39164624, "step": 67460 }, { "epoch": 10.048406315162348, "grad_norm": 1.4921802282333374, "learning_rate": 2.9133689322326586e-05, "loss": 0.5619, "num_input_tokens_seen": 39167696, "step": 67465 }, { "epoch": 10.049151027703306, "grad_norm": 0.9710835218429565, "learning_rate": 2.9130484593088276e-05, "loss": 0.7372, "num_input_tokens_seen": 39170512, "step": 67470 }, { "epoch": 10.049895740244265, "grad_norm": 1.418386459350586, "learning_rate": 2.9127279794069624e-05, "loss": 0.5364, "num_input_tokens_seen": 39173648, "step": 67475 }, { "epoch": 10.050640452785226, "grad_norm": 1.4953796863555908, "learning_rate": 2.9124074925324785e-05, "loss": 0.4902, "num_input_tokens_seen": 39176496, "step": 67480 }, { "epoch": 10.051385165326185, "grad_norm": 1.2016745805740356, "learning_rate": 2.9120869986907885e-05, "loss": 0.4819, "num_input_tokens_seen": 39179408, "step": 67485 }, { "epoch": 10.052129877867143, "grad_norm": 1.3166248798370361, "learning_rate": 2.9117664978873072e-05, "loss": 0.6552, "num_input_tokens_seen": 39182480, "step": 67490 }, { "epoch": 10.052874590408102, "grad_norm": 1.417772650718689, "learning_rate": 2.9114459901274493e-05, "loss": 0.5423, "num_input_tokens_seen": 39185296, "step": 67495 }, { "epoch": 10.05361930294906, "grad_norm": 1.1274495124816895, "learning_rate": 2.91112547541663e-05, "loss": 0.6676, "num_input_tokens_seen": 39188080, "step": 67500 }, { "epoch": 10.054364015490021, "grad_norm": 2.3225760459899902, "learning_rate": 2.9108049537602637e-05, "loss": 0.7299, "num_input_tokens_seen": 39191024, "step": 67505 }, { "epoch": 10.05510872803098, "grad_norm": 0.9112967848777771, "learning_rate": 2.9104844251637652e-05, "loss": 0.67, "num_input_tokens_seen": 39194000, "step": 67510 }, { "epoch": 10.055853440571939, "grad_norm": 1.1430792808532715, "learning_rate": 2.91016388963255e-05, "loss": 0.6701, "num_input_tokens_seen": 39196784, "step": 67515 }, { "epoch": 10.056598153112898, "grad_norm": 2.5434961318969727, "learning_rate": 2.9098433471720322e-05, "loss": 0.6034, "num_input_tokens_seen": 39199632, "step": 67520 }, { "epoch": 10.057342865653858, "grad_norm": 1.4785388708114624, "learning_rate": 2.909522797787627e-05, "loss": 0.6616, "num_input_tokens_seen": 39202288, "step": 67525 }, { "epoch": 10.058087578194817, "grad_norm": 2.5089809894561768, "learning_rate": 2.9092022414847514e-05, "loss": 0.7055, "num_input_tokens_seen": 39205200, "step": 67530 }, { "epoch": 10.058832290735776, "grad_norm": 1.128392219543457, "learning_rate": 2.908881678268819e-05, "loss": 0.4528, "num_input_tokens_seen": 39208624, "step": 67535 }, { "epoch": 10.059577003276734, "grad_norm": 1.639496088027954, "learning_rate": 2.908561108145247e-05, "loss": 0.5003, "num_input_tokens_seen": 39211312, "step": 67540 }, { "epoch": 10.060321715817695, "grad_norm": 1.534713864326477, "learning_rate": 2.90824053111945e-05, "loss": 0.7179, "num_input_tokens_seen": 39214480, "step": 67545 }, { "epoch": 10.061066428358654, "grad_norm": 1.0998667478561401, "learning_rate": 2.9079199471968444e-05, "loss": 0.5209, "num_input_tokens_seen": 39217424, "step": 67550 }, { "epoch": 10.061811140899612, "grad_norm": 1.667065978050232, "learning_rate": 2.9075993563828452e-05, "loss": 0.7648, "num_input_tokens_seen": 39220016, "step": 67555 }, { "epoch": 10.062555853440571, "grad_norm": 1.542694091796875, "learning_rate": 2.9072787586828697e-05, "loss": 0.6641, "num_input_tokens_seen": 39222832, "step": 67560 }, { "epoch": 10.063300565981532, "grad_norm": 1.201972246170044, "learning_rate": 2.9069581541023333e-05, "loss": 0.5199, "num_input_tokens_seen": 39225808, "step": 67565 }, { "epoch": 10.06404527852249, "grad_norm": 1.796857237815857, "learning_rate": 2.9066375426466518e-05, "loss": 0.449, "num_input_tokens_seen": 39228944, "step": 67570 }, { "epoch": 10.06478999106345, "grad_norm": 1.502514123916626, "learning_rate": 2.906316924321244e-05, "loss": 0.5582, "num_input_tokens_seen": 39231920, "step": 67575 }, { "epoch": 10.065534703604408, "grad_norm": 2.672168016433716, "learning_rate": 2.9059962991315237e-05, "loss": 0.6807, "num_input_tokens_seen": 39235088, "step": 67580 }, { "epoch": 10.066279416145369, "grad_norm": 1.647932767868042, "learning_rate": 2.9056756670829087e-05, "loss": 0.7793, "num_input_tokens_seen": 39237840, "step": 67585 }, { "epoch": 10.067024128686327, "grad_norm": 1.9410014152526855, "learning_rate": 2.9053550281808155e-05, "loss": 0.616, "num_input_tokens_seen": 39240400, "step": 67590 }, { "epoch": 10.067768841227286, "grad_norm": 1.052118182182312, "learning_rate": 2.905034382430661e-05, "loss": 0.4819, "num_input_tokens_seen": 39243184, "step": 67595 }, { "epoch": 10.068513553768245, "grad_norm": 1.165643572807312, "learning_rate": 2.9047137298378624e-05, "loss": 0.6523, "num_input_tokens_seen": 39246160, "step": 67600 }, { "epoch": 10.069258266309205, "grad_norm": 2.3826866149902344, "learning_rate": 2.9043930704078364e-05, "loss": 0.806, "num_input_tokens_seen": 39249008, "step": 67605 }, { "epoch": 10.070002978850164, "grad_norm": 2.000530958175659, "learning_rate": 2.904072404146001e-05, "loss": 0.5768, "num_input_tokens_seen": 39251888, "step": 67610 }, { "epoch": 10.070747691391123, "grad_norm": 2.8687150478363037, "learning_rate": 2.9037517310577726e-05, "loss": 0.6743, "num_input_tokens_seen": 39254896, "step": 67615 }, { "epoch": 10.071492403932082, "grad_norm": 0.8444445133209229, "learning_rate": 2.9034310511485692e-05, "loss": 0.6203, "num_input_tokens_seen": 39257520, "step": 67620 }, { "epoch": 10.072237116473042, "grad_norm": 0.9079475402832031, "learning_rate": 2.903110364423809e-05, "loss": 0.5776, "num_input_tokens_seen": 39260432, "step": 67625 }, { "epoch": 10.072981829014001, "grad_norm": 1.1709898710250854, "learning_rate": 2.9027896708889073e-05, "loss": 0.5875, "num_input_tokens_seen": 39263312, "step": 67630 }, { "epoch": 10.07372654155496, "grad_norm": 1.9507640600204468, "learning_rate": 2.9024689705492847e-05, "loss": 0.6289, "num_input_tokens_seen": 39266000, "step": 67635 }, { "epoch": 10.074471254095918, "grad_norm": 1.2849479913711548, "learning_rate": 2.902148263410357e-05, "loss": 0.4994, "num_input_tokens_seen": 39268976, "step": 67640 }, { "epoch": 10.075215966636879, "grad_norm": 1.1428048610687256, "learning_rate": 2.9018275494775442e-05, "loss": 0.4581, "num_input_tokens_seen": 39271792, "step": 67645 }, { "epoch": 10.075960679177838, "grad_norm": 1.1902841329574585, "learning_rate": 2.9015068287562626e-05, "loss": 0.5234, "num_input_tokens_seen": 39274704, "step": 67650 }, { "epoch": 10.076705391718797, "grad_norm": 1.0782207250595093, "learning_rate": 2.9011861012519316e-05, "loss": 0.5127, "num_input_tokens_seen": 39277680, "step": 67655 }, { "epoch": 10.077450104259755, "grad_norm": 1.7226717472076416, "learning_rate": 2.900865366969968e-05, "loss": 0.6415, "num_input_tokens_seen": 39280624, "step": 67660 }, { "epoch": 10.078194816800714, "grad_norm": 2.6880922317504883, "learning_rate": 2.900544625915793e-05, "loss": 0.6855, "num_input_tokens_seen": 39283600, "step": 67665 }, { "epoch": 10.078939529341675, "grad_norm": 1.3135181665420532, "learning_rate": 2.9002238780948232e-05, "loss": 0.7023, "num_input_tokens_seen": 39286800, "step": 67670 }, { "epoch": 10.079684241882633, "grad_norm": 2.920058012008667, "learning_rate": 2.8999031235124775e-05, "loss": 0.6755, "num_input_tokens_seen": 39289488, "step": 67675 }, { "epoch": 10.080428954423592, "grad_norm": 0.934872031211853, "learning_rate": 2.8995823621741754e-05, "loss": 0.5019, "num_input_tokens_seen": 39292720, "step": 67680 }, { "epoch": 10.08117366696455, "grad_norm": 0.9690937995910645, "learning_rate": 2.8992615940853347e-05, "loss": 0.5693, "num_input_tokens_seen": 39295472, "step": 67685 }, { "epoch": 10.081918379505511, "grad_norm": 1.2605407238006592, "learning_rate": 2.8989408192513756e-05, "loss": 0.5545, "num_input_tokens_seen": 39298288, "step": 67690 }, { "epoch": 10.08266309204647, "grad_norm": 2.2268121242523193, "learning_rate": 2.898620037677717e-05, "loss": 0.6095, "num_input_tokens_seen": 39300944, "step": 67695 }, { "epoch": 10.083407804587429, "grad_norm": 1.5936492681503296, "learning_rate": 2.898299249369777e-05, "loss": 0.7512, "num_input_tokens_seen": 39303952, "step": 67700 }, { "epoch": 10.084152517128388, "grad_norm": 1.6605886220932007, "learning_rate": 2.8979784543329775e-05, "loss": 0.6276, "num_input_tokens_seen": 39306832, "step": 67705 }, { "epoch": 10.084897229669348, "grad_norm": 1.6944911479949951, "learning_rate": 2.897657652572735e-05, "loss": 0.5761, "num_input_tokens_seen": 39310224, "step": 67710 }, { "epoch": 10.085641942210307, "grad_norm": 1.560459017753601, "learning_rate": 2.897336844094472e-05, "loss": 0.6495, "num_input_tokens_seen": 39313232, "step": 67715 }, { "epoch": 10.086386654751266, "grad_norm": 1.6308257579803467, "learning_rate": 2.8970160289036064e-05, "loss": 0.6298, "num_input_tokens_seen": 39315760, "step": 67720 }, { "epoch": 10.087131367292224, "grad_norm": 1.5185937881469727, "learning_rate": 2.8966952070055582e-05, "loss": 0.7492, "num_input_tokens_seen": 39318544, "step": 67725 }, { "epoch": 10.087876079833185, "grad_norm": 2.240983247756958, "learning_rate": 2.8963743784057474e-05, "loss": 0.5441, "num_input_tokens_seen": 39321328, "step": 67730 }, { "epoch": 10.088620792374144, "grad_norm": 1.8529516458511353, "learning_rate": 2.896053543109595e-05, "loss": 0.6546, "num_input_tokens_seen": 39324240, "step": 67735 }, { "epoch": 10.089365504915103, "grad_norm": 1.214555263519287, "learning_rate": 2.8957327011225198e-05, "loss": 0.6259, "num_input_tokens_seen": 39327184, "step": 67740 }, { "epoch": 10.090110217456061, "grad_norm": 1.3999608755111694, "learning_rate": 2.8954118524499434e-05, "loss": 0.6475, "num_input_tokens_seen": 39329968, "step": 67745 }, { "epoch": 10.090854929997022, "grad_norm": 2.0796308517456055, "learning_rate": 2.895090997097286e-05, "loss": 0.5784, "num_input_tokens_seen": 39332944, "step": 67750 }, { "epoch": 10.09159964253798, "grad_norm": 0.8150162100791931, "learning_rate": 2.894770135069967e-05, "loss": 0.5867, "num_input_tokens_seen": 39335696, "step": 67755 }, { "epoch": 10.09234435507894, "grad_norm": 1.9622669219970703, "learning_rate": 2.894449266373408e-05, "loss": 0.5123, "num_input_tokens_seen": 39338736, "step": 67760 }, { "epoch": 10.093089067619898, "grad_norm": 2.272465229034424, "learning_rate": 2.8941283910130295e-05, "loss": 0.6921, "num_input_tokens_seen": 39341744, "step": 67765 }, { "epoch": 10.093833780160859, "grad_norm": 1.4549404382705688, "learning_rate": 2.8938075089942524e-05, "loss": 0.6792, "num_input_tokens_seen": 39344720, "step": 67770 }, { "epoch": 10.094578492701817, "grad_norm": 1.4315937757492065, "learning_rate": 2.893486620322498e-05, "loss": 0.7, "num_input_tokens_seen": 39347792, "step": 67775 }, { "epoch": 10.095323205242776, "grad_norm": 1.1501436233520508, "learning_rate": 2.893165725003187e-05, "loss": 0.5588, "num_input_tokens_seen": 39351120, "step": 67780 }, { "epoch": 10.096067917783735, "grad_norm": 1.092322587966919, "learning_rate": 2.8928448230417404e-05, "loss": 0.6012, "num_input_tokens_seen": 39354128, "step": 67785 }, { "epoch": 10.096812630324695, "grad_norm": 1.5160731077194214, "learning_rate": 2.89252391444358e-05, "loss": 0.6429, "num_input_tokens_seen": 39358192, "step": 67790 }, { "epoch": 10.097557342865654, "grad_norm": 0.6063312888145447, "learning_rate": 2.892202999214127e-05, "loss": 0.4948, "num_input_tokens_seen": 39361008, "step": 67795 }, { "epoch": 10.098302055406613, "grad_norm": 1.1763474941253662, "learning_rate": 2.8918820773588025e-05, "loss": 0.5904, "num_input_tokens_seen": 39363952, "step": 67800 }, { "epoch": 10.099046767947572, "grad_norm": 2.536571979522705, "learning_rate": 2.8915611488830284e-05, "loss": 0.5561, "num_input_tokens_seen": 39366896, "step": 67805 }, { "epoch": 10.099791480488532, "grad_norm": 1.324425458908081, "learning_rate": 2.891240213792228e-05, "loss": 0.532, "num_input_tokens_seen": 39369808, "step": 67810 }, { "epoch": 10.100536193029491, "grad_norm": 1.6923578977584839, "learning_rate": 2.890919272091821e-05, "loss": 0.5915, "num_input_tokens_seen": 39372688, "step": 67815 }, { "epoch": 10.10128090557045, "grad_norm": 0.9347689747810364, "learning_rate": 2.8905983237872304e-05, "loss": 0.7072, "num_input_tokens_seen": 39375728, "step": 67820 }, { "epoch": 10.102025618111409, "grad_norm": 1.1017261743545532, "learning_rate": 2.890277368883878e-05, "loss": 0.6948, "num_input_tokens_seen": 39378768, "step": 67825 }, { "epoch": 10.102770330652369, "grad_norm": 1.7794909477233887, "learning_rate": 2.889956407387186e-05, "loss": 0.7055, "num_input_tokens_seen": 39381808, "step": 67830 }, { "epoch": 10.103515043193328, "grad_norm": 2.6172380447387695, "learning_rate": 2.8896354393025765e-05, "loss": 0.6031, "num_input_tokens_seen": 39384720, "step": 67835 }, { "epoch": 10.104259755734287, "grad_norm": 1.3045730590820312, "learning_rate": 2.8893144646354725e-05, "loss": 0.5911, "num_input_tokens_seen": 39387824, "step": 67840 }, { "epoch": 10.105004468275245, "grad_norm": 2.4837446212768555, "learning_rate": 2.888993483391297e-05, "loss": 0.4847, "num_input_tokens_seen": 39390704, "step": 67845 }, { "epoch": 10.105749180816204, "grad_norm": 1.4896992444992065, "learning_rate": 2.8886724955754713e-05, "loss": 0.4585, "num_input_tokens_seen": 39393648, "step": 67850 }, { "epoch": 10.106493893357165, "grad_norm": 1.9523050785064697, "learning_rate": 2.8883515011934186e-05, "loss": 0.5975, "num_input_tokens_seen": 39396432, "step": 67855 }, { "epoch": 10.107238605898123, "grad_norm": 1.267448902130127, "learning_rate": 2.8880305002505624e-05, "loss": 0.4765, "num_input_tokens_seen": 39399184, "step": 67860 }, { "epoch": 10.107983318439082, "grad_norm": 0.9184935092926025, "learning_rate": 2.887709492752325e-05, "loss": 0.4488, "num_input_tokens_seen": 39402160, "step": 67865 }, { "epoch": 10.108728030980041, "grad_norm": 1.1277053356170654, "learning_rate": 2.8873884787041304e-05, "loss": 0.6191, "num_input_tokens_seen": 39405104, "step": 67870 }, { "epoch": 10.109472743521001, "grad_norm": 1.4407601356506348, "learning_rate": 2.8870674581114004e-05, "loss": 0.6565, "num_input_tokens_seen": 39407984, "step": 67875 }, { "epoch": 10.11021745606196, "grad_norm": 1.3814382553100586, "learning_rate": 2.88674643097956e-05, "loss": 0.5988, "num_input_tokens_seen": 39410960, "step": 67880 }, { "epoch": 10.110962168602919, "grad_norm": 1.1774253845214844, "learning_rate": 2.886425397314031e-05, "loss": 0.708, "num_input_tokens_seen": 39413776, "step": 67885 }, { "epoch": 10.111706881143878, "grad_norm": 2.656015634536743, "learning_rate": 2.886104357120237e-05, "loss": 0.6802, "num_input_tokens_seen": 39416912, "step": 67890 }, { "epoch": 10.112451593684838, "grad_norm": 2.014540910720825, "learning_rate": 2.8857833104036036e-05, "loss": 0.8397, "num_input_tokens_seen": 39419888, "step": 67895 }, { "epoch": 10.113196306225797, "grad_norm": 2.0869436264038086, "learning_rate": 2.8854622571695526e-05, "loss": 0.6058, "num_input_tokens_seen": 39422576, "step": 67900 }, { "epoch": 10.113941018766756, "grad_norm": 0.9717230796813965, "learning_rate": 2.8851411974235086e-05, "loss": 0.5742, "num_input_tokens_seen": 39426000, "step": 67905 }, { "epoch": 10.114685731307715, "grad_norm": 1.2901719808578491, "learning_rate": 2.884820131170896e-05, "loss": 0.718, "num_input_tokens_seen": 39428784, "step": 67910 }, { "epoch": 10.115430443848675, "grad_norm": 1.2385027408599854, "learning_rate": 2.884499058417138e-05, "loss": 0.4565, "num_input_tokens_seen": 39431920, "step": 67915 }, { "epoch": 10.116175156389634, "grad_norm": 1.4733525514602661, "learning_rate": 2.8841779791676594e-05, "loss": 0.5696, "num_input_tokens_seen": 39434608, "step": 67920 }, { "epoch": 10.116919868930593, "grad_norm": 1.7552907466888428, "learning_rate": 2.8838568934278843e-05, "loss": 0.6619, "num_input_tokens_seen": 39437616, "step": 67925 }, { "epoch": 10.117664581471551, "grad_norm": 1.2687257528305054, "learning_rate": 2.8835358012032364e-05, "loss": 0.6712, "num_input_tokens_seen": 39440528, "step": 67930 }, { "epoch": 10.118409294012512, "grad_norm": 1.5766234397888184, "learning_rate": 2.8832147024991412e-05, "loss": 0.6233, "num_input_tokens_seen": 39443376, "step": 67935 }, { "epoch": 10.11915400655347, "grad_norm": 0.8317601084709167, "learning_rate": 2.882893597321024e-05, "loss": 0.5555, "num_input_tokens_seen": 39446032, "step": 67940 }, { "epoch": 10.11989871909443, "grad_norm": 2.2461533546447754, "learning_rate": 2.8825724856743075e-05, "loss": 0.6537, "num_input_tokens_seen": 39448848, "step": 67945 }, { "epoch": 10.120643431635388, "grad_norm": 2.539477586746216, "learning_rate": 2.8822513675644192e-05, "loss": 0.8348, "num_input_tokens_seen": 39451792, "step": 67950 }, { "epoch": 10.121388144176349, "grad_norm": 1.8003933429718018, "learning_rate": 2.8819302429967808e-05, "loss": 0.8191, "num_input_tokens_seen": 39454704, "step": 67955 }, { "epoch": 10.122132856717307, "grad_norm": 2.206233024597168, "learning_rate": 2.88160911197682e-05, "loss": 0.6172, "num_input_tokens_seen": 39457872, "step": 67960 }, { "epoch": 10.122877569258266, "grad_norm": 2.1615071296691895, "learning_rate": 2.881287974509961e-05, "loss": 0.8749, "num_input_tokens_seen": 39461968, "step": 67965 }, { "epoch": 10.123622281799225, "grad_norm": 1.552005648612976, "learning_rate": 2.8809668306016286e-05, "loss": 0.5657, "num_input_tokens_seen": 39464784, "step": 67970 }, { "epoch": 10.124366994340185, "grad_norm": 1.3959429264068604, "learning_rate": 2.8806456802572502e-05, "loss": 0.5462, "num_input_tokens_seen": 39467504, "step": 67975 }, { "epoch": 10.125111706881144, "grad_norm": 2.103001356124878, "learning_rate": 2.8803245234822485e-05, "loss": 0.5922, "num_input_tokens_seen": 39470608, "step": 67980 }, { "epoch": 10.125856419422103, "grad_norm": 1.0294203758239746, "learning_rate": 2.880003360282051e-05, "loss": 0.5854, "num_input_tokens_seen": 39473520, "step": 67985 }, { "epoch": 10.126601131963062, "grad_norm": 0.8797132968902588, "learning_rate": 2.8796821906620837e-05, "loss": 0.46, "num_input_tokens_seen": 39476304, "step": 67990 }, { "epoch": 10.127345844504022, "grad_norm": 1.1641826629638672, "learning_rate": 2.8793610146277707e-05, "loss": 0.6129, "num_input_tokens_seen": 39479344, "step": 67995 }, { "epoch": 10.128090557044981, "grad_norm": 1.1798033714294434, "learning_rate": 2.879039832184539e-05, "loss": 0.6586, "num_input_tokens_seen": 39481968, "step": 68000 }, { "epoch": 10.12883526958594, "grad_norm": 1.4562586545944214, "learning_rate": 2.8787186433378142e-05, "loss": 0.6423, "num_input_tokens_seen": 39484688, "step": 68005 }, { "epoch": 10.129579982126899, "grad_norm": 1.5140691995620728, "learning_rate": 2.8783974480930244e-05, "loss": 0.7175, "num_input_tokens_seen": 39488208, "step": 68010 }, { "epoch": 10.130324694667857, "grad_norm": 2.3366150856018066, "learning_rate": 2.8780762464555928e-05, "loss": 0.4484, "num_input_tokens_seen": 39491024, "step": 68015 }, { "epoch": 10.131069407208818, "grad_norm": 1.2314982414245605, "learning_rate": 2.8777550384309477e-05, "loss": 0.7487, "num_input_tokens_seen": 39493968, "step": 68020 }, { "epoch": 10.131814119749777, "grad_norm": 2.15193772315979, "learning_rate": 2.877433824024515e-05, "loss": 0.6143, "num_input_tokens_seen": 39496720, "step": 68025 }, { "epoch": 10.132558832290735, "grad_norm": 1.7433804273605347, "learning_rate": 2.8771126032417222e-05, "loss": 0.5336, "num_input_tokens_seen": 39499728, "step": 68030 }, { "epoch": 10.133303544831694, "grad_norm": 1.7684077024459839, "learning_rate": 2.876791376087995e-05, "loss": 0.7859, "num_input_tokens_seen": 39502672, "step": 68035 }, { "epoch": 10.134048257372655, "grad_norm": 1.2841435670852661, "learning_rate": 2.8764701425687597e-05, "loss": 0.6021, "num_input_tokens_seen": 39505776, "step": 68040 }, { "epoch": 10.134792969913613, "grad_norm": 2.0804665088653564, "learning_rate": 2.876148902689445e-05, "loss": 0.5785, "num_input_tokens_seen": 39508560, "step": 68045 }, { "epoch": 10.135537682454572, "grad_norm": 0.8112273216247559, "learning_rate": 2.875827656455476e-05, "loss": 0.454, "num_input_tokens_seen": 39511600, "step": 68050 }, { "epoch": 10.136282394995531, "grad_norm": 1.075170636177063, "learning_rate": 2.8755064038722813e-05, "loss": 0.795, "num_input_tokens_seen": 39514480, "step": 68055 }, { "epoch": 10.137027107536491, "grad_norm": 2.071117877960205, "learning_rate": 2.875185144945287e-05, "loss": 0.4779, "num_input_tokens_seen": 39517168, "step": 68060 }, { "epoch": 10.13777182007745, "grad_norm": 1.3149878978729248, "learning_rate": 2.874863879679921e-05, "loss": 0.5998, "num_input_tokens_seen": 39519984, "step": 68065 }, { "epoch": 10.138516532618409, "grad_norm": 1.2567768096923828, "learning_rate": 2.8745426080816117e-05, "loss": 0.7783, "num_input_tokens_seen": 39522768, "step": 68070 }, { "epoch": 10.139261245159368, "grad_norm": 1.1930536031723022, "learning_rate": 2.8742213301557847e-05, "loss": 0.6363, "num_input_tokens_seen": 39526000, "step": 68075 }, { "epoch": 10.140005957700328, "grad_norm": 2.3087942600250244, "learning_rate": 2.8739000459078695e-05, "loss": 0.8043, "num_input_tokens_seen": 39529040, "step": 68080 }, { "epoch": 10.140750670241287, "grad_norm": 2.0986835956573486, "learning_rate": 2.8735787553432925e-05, "loss": 0.4157, "num_input_tokens_seen": 39532016, "step": 68085 }, { "epoch": 10.141495382782246, "grad_norm": 2.2599430084228516, "learning_rate": 2.873257458467482e-05, "loss": 0.4874, "num_input_tokens_seen": 39534736, "step": 68090 }, { "epoch": 10.142240095323205, "grad_norm": 0.9298659563064575, "learning_rate": 2.8729361552858662e-05, "loss": 0.5194, "num_input_tokens_seen": 39537840, "step": 68095 }, { "epoch": 10.142984807864165, "grad_norm": 1.7036328315734863, "learning_rate": 2.8726148458038732e-05, "loss": 0.5244, "num_input_tokens_seen": 39540528, "step": 68100 }, { "epoch": 10.143729520405124, "grad_norm": 2.3965413570404053, "learning_rate": 2.8722935300269315e-05, "loss": 0.6275, "num_input_tokens_seen": 39543568, "step": 68105 }, { "epoch": 10.144474232946083, "grad_norm": 1.2274531126022339, "learning_rate": 2.8719722079604684e-05, "loss": 0.6394, "num_input_tokens_seen": 39546384, "step": 68110 }, { "epoch": 10.145218945487041, "grad_norm": 0.846075713634491, "learning_rate": 2.8716508796099135e-05, "loss": 0.623, "num_input_tokens_seen": 39549296, "step": 68115 }, { "epoch": 10.145963658028002, "grad_norm": 0.7184671759605408, "learning_rate": 2.8713295449806944e-05, "loss": 0.4619, "num_input_tokens_seen": 39552048, "step": 68120 }, { "epoch": 10.14670837056896, "grad_norm": 1.192531943321228, "learning_rate": 2.8710082040782392e-05, "loss": 0.6096, "num_input_tokens_seen": 39555088, "step": 68125 }, { "epoch": 10.14745308310992, "grad_norm": 1.9066760540008545, "learning_rate": 2.870686856907978e-05, "loss": 0.5654, "num_input_tokens_seen": 39558064, "step": 68130 }, { "epoch": 10.148197795650878, "grad_norm": 1.5937607288360596, "learning_rate": 2.8703655034753397e-05, "loss": 0.5526, "num_input_tokens_seen": 39560752, "step": 68135 }, { "epoch": 10.148942508191839, "grad_norm": 1.3172218799591064, "learning_rate": 2.8700441437857527e-05, "loss": 0.6561, "num_input_tokens_seen": 39564016, "step": 68140 }, { "epoch": 10.149687220732797, "grad_norm": 1.1824913024902344, "learning_rate": 2.869722777844645e-05, "loss": 0.5859, "num_input_tokens_seen": 39567184, "step": 68145 }, { "epoch": 10.150431933273756, "grad_norm": 0.8680781126022339, "learning_rate": 2.869401405657448e-05, "loss": 0.7556, "num_input_tokens_seen": 39570448, "step": 68150 }, { "epoch": 10.151176645814715, "grad_norm": 1.427922010421753, "learning_rate": 2.8690800272295888e-05, "loss": 0.7196, "num_input_tokens_seen": 39573168, "step": 68155 }, { "epoch": 10.151921358355676, "grad_norm": 0.9516043663024902, "learning_rate": 2.8687586425664974e-05, "loss": 0.3919, "num_input_tokens_seen": 39576080, "step": 68160 }, { "epoch": 10.152666070896634, "grad_norm": 1.8241610527038574, "learning_rate": 2.868437251673604e-05, "loss": 0.8373, "num_input_tokens_seen": 39578928, "step": 68165 }, { "epoch": 10.153410783437593, "grad_norm": 2.1174309253692627, "learning_rate": 2.8681158545563375e-05, "loss": 0.6031, "num_input_tokens_seen": 39581840, "step": 68170 }, { "epoch": 10.154155495978552, "grad_norm": 0.8472251892089844, "learning_rate": 2.8677944512201283e-05, "loss": 0.5293, "num_input_tokens_seen": 39584880, "step": 68175 }, { "epoch": 10.15490020851951, "grad_norm": 1.723642349243164, "learning_rate": 2.8674730416704056e-05, "loss": 0.5996, "num_input_tokens_seen": 39588144, "step": 68180 }, { "epoch": 10.155644921060471, "grad_norm": 0.8067336678504944, "learning_rate": 2.8671516259125985e-05, "loss": 0.5092, "num_input_tokens_seen": 39591056, "step": 68185 }, { "epoch": 10.15638963360143, "grad_norm": 1.4565407037734985, "learning_rate": 2.866830203952139e-05, "loss": 0.6479, "num_input_tokens_seen": 39594160, "step": 68190 }, { "epoch": 10.157134346142389, "grad_norm": 0.9982093572616577, "learning_rate": 2.866508775794455e-05, "loss": 0.7275, "num_input_tokens_seen": 39597136, "step": 68195 }, { "epoch": 10.157879058683347, "grad_norm": 1.8567661046981812, "learning_rate": 2.866187341444978e-05, "loss": 0.6598, "num_input_tokens_seen": 39600144, "step": 68200 }, { "epoch": 10.158623771224308, "grad_norm": 1.8622558116912842, "learning_rate": 2.8658659009091383e-05, "loss": 0.7384, "num_input_tokens_seen": 39603248, "step": 68205 }, { "epoch": 10.159368483765267, "grad_norm": 1.7361072301864624, "learning_rate": 2.865544454192366e-05, "loss": 0.7319, "num_input_tokens_seen": 39606096, "step": 68210 }, { "epoch": 10.160113196306225, "grad_norm": 2.0933728218078613, "learning_rate": 2.8652230013000914e-05, "loss": 0.7177, "num_input_tokens_seen": 39609072, "step": 68215 }, { "epoch": 10.160857908847184, "grad_norm": 2.3880929946899414, "learning_rate": 2.8649015422377456e-05, "loss": 0.703, "num_input_tokens_seen": 39611952, "step": 68220 }, { "epoch": 10.161602621388145, "grad_norm": 1.3394757509231567, "learning_rate": 2.864580077010759e-05, "loss": 0.6602, "num_input_tokens_seen": 39614736, "step": 68225 }, { "epoch": 10.162347333929103, "grad_norm": 1.698228120803833, "learning_rate": 2.8642586056245628e-05, "loss": 0.5848, "num_input_tokens_seen": 39617424, "step": 68230 }, { "epoch": 10.163092046470062, "grad_norm": 1.3952770233154297, "learning_rate": 2.8639371280845872e-05, "loss": 0.6369, "num_input_tokens_seen": 39620592, "step": 68235 }, { "epoch": 10.163836759011021, "grad_norm": 1.788689374923706, "learning_rate": 2.863615644396264e-05, "loss": 0.5247, "num_input_tokens_seen": 39623632, "step": 68240 }, { "epoch": 10.164581471551982, "grad_norm": 1.8008761405944824, "learning_rate": 2.863294154565025e-05, "loss": 0.5955, "num_input_tokens_seen": 39626704, "step": 68245 }, { "epoch": 10.16532618409294, "grad_norm": 2.065190553665161, "learning_rate": 2.862972658596299e-05, "loss": 0.6101, "num_input_tokens_seen": 39629488, "step": 68250 }, { "epoch": 10.166070896633899, "grad_norm": 1.1874265670776367, "learning_rate": 2.8626511564955195e-05, "loss": 0.527, "num_input_tokens_seen": 39632400, "step": 68255 }, { "epoch": 10.166815609174858, "grad_norm": 2.2124316692352295, "learning_rate": 2.8623296482681166e-05, "loss": 0.8103, "num_input_tokens_seen": 39635536, "step": 68260 }, { "epoch": 10.167560321715818, "grad_norm": 0.9780825972557068, "learning_rate": 2.862008133919523e-05, "loss": 0.6525, "num_input_tokens_seen": 39638480, "step": 68265 }, { "epoch": 10.168305034256777, "grad_norm": 1.5700851678848267, "learning_rate": 2.8616866134551706e-05, "loss": 0.5245, "num_input_tokens_seen": 39641616, "step": 68270 }, { "epoch": 10.169049746797736, "grad_norm": 1.011388897895813, "learning_rate": 2.86136508688049e-05, "loss": 0.664, "num_input_tokens_seen": 39644528, "step": 68275 }, { "epoch": 10.169794459338695, "grad_norm": 1.341380000114441, "learning_rate": 2.861043554200914e-05, "loss": 0.4901, "num_input_tokens_seen": 39647504, "step": 68280 }, { "epoch": 10.170539171879655, "grad_norm": 1.6266964673995972, "learning_rate": 2.8607220154218734e-05, "loss": 0.5835, "num_input_tokens_seen": 39650544, "step": 68285 }, { "epoch": 10.171283884420614, "grad_norm": 1.1921191215515137, "learning_rate": 2.860400470548801e-05, "loss": 0.3821, "num_input_tokens_seen": 39653296, "step": 68290 }, { "epoch": 10.172028596961573, "grad_norm": 0.8301233649253845, "learning_rate": 2.8600789195871286e-05, "loss": 0.5148, "num_input_tokens_seen": 39656176, "step": 68295 }, { "epoch": 10.172773309502531, "grad_norm": 1.7110705375671387, "learning_rate": 2.8597573625422892e-05, "loss": 0.6946, "num_input_tokens_seen": 39659568, "step": 68300 }, { "epoch": 10.173518022043492, "grad_norm": 2.0518417358398438, "learning_rate": 2.859435799419715e-05, "loss": 0.8386, "num_input_tokens_seen": 39662352, "step": 68305 }, { "epoch": 10.17426273458445, "grad_norm": 1.6291650533676147, "learning_rate": 2.8591142302248392e-05, "loss": 0.5948, "num_input_tokens_seen": 39665360, "step": 68310 }, { "epoch": 10.17500744712541, "grad_norm": 1.4586389064788818, "learning_rate": 2.8587926549630923e-05, "loss": 0.6251, "num_input_tokens_seen": 39668144, "step": 68315 }, { "epoch": 10.175752159666368, "grad_norm": 1.9309477806091309, "learning_rate": 2.858471073639908e-05, "loss": 0.6226, "num_input_tokens_seen": 39670960, "step": 68320 }, { "epoch": 10.176496872207329, "grad_norm": 1.3085986375808716, "learning_rate": 2.8581494862607194e-05, "loss": 0.8388, "num_input_tokens_seen": 39674096, "step": 68325 }, { "epoch": 10.177241584748288, "grad_norm": 1.196682095527649, "learning_rate": 2.8578278928309594e-05, "loss": 0.4341, "num_input_tokens_seen": 39676624, "step": 68330 }, { "epoch": 10.177986297289246, "grad_norm": 1.5461950302124023, "learning_rate": 2.8575062933560605e-05, "loss": 0.5751, "num_input_tokens_seen": 39679376, "step": 68335 }, { "epoch": 10.178731009830205, "grad_norm": 1.4381088018417358, "learning_rate": 2.8571846878414565e-05, "loss": 0.7374, "num_input_tokens_seen": 39682256, "step": 68340 }, { "epoch": 10.179475722371166, "grad_norm": 1.5970302820205688, "learning_rate": 2.8568630762925803e-05, "loss": 0.5427, "num_input_tokens_seen": 39685360, "step": 68345 }, { "epoch": 10.180220434912124, "grad_norm": 1.4848852157592773, "learning_rate": 2.8565414587148654e-05, "loss": 0.4196, "num_input_tokens_seen": 39688272, "step": 68350 }, { "epoch": 10.180965147453083, "grad_norm": 2.2971558570861816, "learning_rate": 2.856219835113744e-05, "loss": 0.4914, "num_input_tokens_seen": 39691536, "step": 68355 }, { "epoch": 10.181709859994042, "grad_norm": 1.5095250606536865, "learning_rate": 2.8558982054946515e-05, "loss": 0.6152, "num_input_tokens_seen": 39694384, "step": 68360 }, { "epoch": 10.182454572535, "grad_norm": 0.8601177334785461, "learning_rate": 2.85557656986302e-05, "loss": 0.4439, "num_input_tokens_seen": 39697136, "step": 68365 }, { "epoch": 10.183199285075961, "grad_norm": 1.3313381671905518, "learning_rate": 2.8552549282242836e-05, "loss": 0.6057, "num_input_tokens_seen": 39700176, "step": 68370 }, { "epoch": 10.18394399761692, "grad_norm": 0.7535238862037659, "learning_rate": 2.854933280583877e-05, "loss": 0.5553, "num_input_tokens_seen": 39703152, "step": 68375 }, { "epoch": 10.184688710157879, "grad_norm": 1.203805923461914, "learning_rate": 2.8546116269472322e-05, "loss": 0.5802, "num_input_tokens_seen": 39706160, "step": 68380 }, { "epoch": 10.185433422698837, "grad_norm": 1.6461056470870972, "learning_rate": 2.8542899673197847e-05, "loss": 0.6245, "num_input_tokens_seen": 39709264, "step": 68385 }, { "epoch": 10.186178135239798, "grad_norm": 0.7925124168395996, "learning_rate": 2.8539683017069697e-05, "loss": 0.6763, "num_input_tokens_seen": 39712112, "step": 68390 }, { "epoch": 10.186922847780757, "grad_norm": 1.1916171312332153, "learning_rate": 2.8536466301142185e-05, "loss": 0.4834, "num_input_tokens_seen": 39714960, "step": 68395 }, { "epoch": 10.187667560321715, "grad_norm": 2.6428003311157227, "learning_rate": 2.853324952546967e-05, "loss": 0.6743, "num_input_tokens_seen": 39718032, "step": 68400 }, { "epoch": 10.188412272862674, "grad_norm": 1.3380651473999023, "learning_rate": 2.8530032690106494e-05, "loss": 0.6754, "num_input_tokens_seen": 39721072, "step": 68405 }, { "epoch": 10.189156985403635, "grad_norm": 1.1962904930114746, "learning_rate": 2.8526815795107016e-05, "loss": 0.622, "num_input_tokens_seen": 39724176, "step": 68410 }, { "epoch": 10.189901697944594, "grad_norm": 2.4504282474517822, "learning_rate": 2.8523598840525563e-05, "loss": 0.6318, "num_input_tokens_seen": 39726992, "step": 68415 }, { "epoch": 10.190646410485552, "grad_norm": 1.3820016384124756, "learning_rate": 2.852038182641648e-05, "loss": 0.57, "num_input_tokens_seen": 39730064, "step": 68420 }, { "epoch": 10.191391123026511, "grad_norm": 1.6640474796295166, "learning_rate": 2.8517164752834136e-05, "loss": 0.6904, "num_input_tokens_seen": 39732976, "step": 68425 }, { "epoch": 10.192135835567472, "grad_norm": 1.8135486841201782, "learning_rate": 2.8513947619832866e-05, "loss": 0.6529, "num_input_tokens_seen": 39736144, "step": 68430 }, { "epoch": 10.19288054810843, "grad_norm": 1.0607069730758667, "learning_rate": 2.8510730427467015e-05, "loss": 0.4611, "num_input_tokens_seen": 39738992, "step": 68435 }, { "epoch": 10.19362526064939, "grad_norm": 1.0790947675704956, "learning_rate": 2.8507513175790944e-05, "loss": 0.5346, "num_input_tokens_seen": 39741808, "step": 68440 }, { "epoch": 10.194369973190348, "grad_norm": 0.8273305296897888, "learning_rate": 2.850429586485901e-05, "loss": 0.7795, "num_input_tokens_seen": 39744720, "step": 68445 }, { "epoch": 10.195114685731308, "grad_norm": 0.846646785736084, "learning_rate": 2.850107849472555e-05, "loss": 0.5427, "num_input_tokens_seen": 39747408, "step": 68450 }, { "epoch": 10.195859398272267, "grad_norm": 1.1831408739089966, "learning_rate": 2.8497861065444937e-05, "loss": 0.4281, "num_input_tokens_seen": 39750128, "step": 68455 }, { "epoch": 10.196604110813226, "grad_norm": 2.0669126510620117, "learning_rate": 2.8494643577071506e-05, "loss": 0.5744, "num_input_tokens_seen": 39753296, "step": 68460 }, { "epoch": 10.197348823354185, "grad_norm": 1.0678318738937378, "learning_rate": 2.849142602965963e-05, "loss": 0.6129, "num_input_tokens_seen": 39756016, "step": 68465 }, { "epoch": 10.198093535895145, "grad_norm": 0.8871248960494995, "learning_rate": 2.8488208423263663e-05, "loss": 0.6046, "num_input_tokens_seen": 39759312, "step": 68470 }, { "epoch": 10.198838248436104, "grad_norm": 0.9030411839485168, "learning_rate": 2.8484990757937958e-05, "loss": 0.5793, "num_input_tokens_seen": 39762192, "step": 68475 }, { "epoch": 10.199582960977063, "grad_norm": 1.5157467126846313, "learning_rate": 2.848177303373687e-05, "loss": 0.7134, "num_input_tokens_seen": 39765232, "step": 68480 }, { "epoch": 10.200327673518021, "grad_norm": 0.9911561608314514, "learning_rate": 2.847855525071477e-05, "loss": 0.5825, "num_input_tokens_seen": 39768208, "step": 68485 }, { "epoch": 10.201072386058982, "grad_norm": 1.282118797302246, "learning_rate": 2.8475337408926005e-05, "loss": 0.6189, "num_input_tokens_seen": 39771312, "step": 68490 }, { "epoch": 10.20181709859994, "grad_norm": 1.9101567268371582, "learning_rate": 2.8472119508424954e-05, "loss": 0.6147, "num_input_tokens_seen": 39774192, "step": 68495 }, { "epoch": 10.2025618111409, "grad_norm": 1.3920570611953735, "learning_rate": 2.8468901549265976e-05, "loss": 0.7285, "num_input_tokens_seen": 39777424, "step": 68500 }, { "epoch": 10.203306523681858, "grad_norm": 1.7820154428482056, "learning_rate": 2.8465683531503435e-05, "loss": 0.8004, "num_input_tokens_seen": 39780208, "step": 68505 }, { "epoch": 10.204051236222819, "grad_norm": 2.5569536685943604, "learning_rate": 2.8462465455191682e-05, "loss": 0.6436, "num_input_tokens_seen": 39782928, "step": 68510 }, { "epoch": 10.204795948763778, "grad_norm": 0.9230301976203918, "learning_rate": 2.845924732038511e-05, "loss": 0.5121, "num_input_tokens_seen": 39785488, "step": 68515 }, { "epoch": 10.205540661304736, "grad_norm": 0.9648760557174683, "learning_rate": 2.8456029127138056e-05, "loss": 0.6305, "num_input_tokens_seen": 39788176, "step": 68520 }, { "epoch": 10.206285373845695, "grad_norm": 1.1951473951339722, "learning_rate": 2.8452810875504903e-05, "loss": 0.7029, "num_input_tokens_seen": 39790832, "step": 68525 }, { "epoch": 10.207030086386654, "grad_norm": 1.2050944566726685, "learning_rate": 2.8449592565540024e-05, "loss": 0.7934, "num_input_tokens_seen": 39793968, "step": 68530 }, { "epoch": 10.207774798927614, "grad_norm": 1.6758372783660889, "learning_rate": 2.844637419729778e-05, "loss": 0.7368, "num_input_tokens_seen": 39796816, "step": 68535 }, { "epoch": 10.208519511468573, "grad_norm": 1.1111623048782349, "learning_rate": 2.844315577083255e-05, "loss": 0.6237, "num_input_tokens_seen": 39799824, "step": 68540 }, { "epoch": 10.209264224009532, "grad_norm": 1.042437195777893, "learning_rate": 2.8439937286198704e-05, "loss": 0.5095, "num_input_tokens_seen": 39802896, "step": 68545 }, { "epoch": 10.21000893655049, "grad_norm": 1.0736205577850342, "learning_rate": 2.8436718743450614e-05, "loss": 0.6359, "num_input_tokens_seen": 39806128, "step": 68550 }, { "epoch": 10.210753649091451, "grad_norm": 1.2806332111358643, "learning_rate": 2.8433500142642654e-05, "loss": 0.4943, "num_input_tokens_seen": 39809040, "step": 68555 }, { "epoch": 10.21149836163241, "grad_norm": 0.9186065793037415, "learning_rate": 2.8430281483829196e-05, "loss": 0.447, "num_input_tokens_seen": 39811920, "step": 68560 }, { "epoch": 10.212243074173369, "grad_norm": 1.094114065170288, "learning_rate": 2.842706276706462e-05, "loss": 0.498, "num_input_tokens_seen": 39814672, "step": 68565 }, { "epoch": 10.212987786714327, "grad_norm": 1.182242512702942, "learning_rate": 2.8423843992403298e-05, "loss": 0.6409, "num_input_tokens_seen": 39817456, "step": 68570 }, { "epoch": 10.213732499255288, "grad_norm": 1.8352562189102173, "learning_rate": 2.8420625159899622e-05, "loss": 0.6868, "num_input_tokens_seen": 39820496, "step": 68575 }, { "epoch": 10.214477211796247, "grad_norm": 0.9796020984649658, "learning_rate": 2.8417406269607954e-05, "loss": 0.5227, "num_input_tokens_seen": 39823120, "step": 68580 }, { "epoch": 10.215221924337206, "grad_norm": 1.162882924079895, "learning_rate": 2.8414187321582676e-05, "loss": 0.6741, "num_input_tokens_seen": 39826000, "step": 68585 }, { "epoch": 10.215966636878164, "grad_norm": 1.404640793800354, "learning_rate": 2.8410968315878178e-05, "loss": 0.465, "num_input_tokens_seen": 39828720, "step": 68590 }, { "epoch": 10.216711349419125, "grad_norm": 1.8769584894180298, "learning_rate": 2.8407749252548843e-05, "loss": 0.5657, "num_input_tokens_seen": 39831984, "step": 68595 }, { "epoch": 10.217456061960084, "grad_norm": 1.5018614530563354, "learning_rate": 2.8404530131649036e-05, "loss": 0.5915, "num_input_tokens_seen": 39835056, "step": 68600 }, { "epoch": 10.218200774501042, "grad_norm": 1.1956809759140015, "learning_rate": 2.8401310953233158e-05, "loss": 0.4942, "num_input_tokens_seen": 39838032, "step": 68605 }, { "epoch": 10.218945487042001, "grad_norm": 1.1372735500335693, "learning_rate": 2.839809171735559e-05, "loss": 0.6694, "num_input_tokens_seen": 39840912, "step": 68610 }, { "epoch": 10.219690199582962, "grad_norm": 1.1061714887619019, "learning_rate": 2.8394872424070716e-05, "loss": 0.539, "num_input_tokens_seen": 39843600, "step": 68615 }, { "epoch": 10.22043491212392, "grad_norm": 2.955592393875122, "learning_rate": 2.8391653073432918e-05, "loss": 0.5561, "num_input_tokens_seen": 39846416, "step": 68620 }, { "epoch": 10.22117962466488, "grad_norm": 0.8852096199989319, "learning_rate": 2.838843366549659e-05, "loss": 0.491, "num_input_tokens_seen": 39849264, "step": 68625 }, { "epoch": 10.221924337205838, "grad_norm": 0.7787603139877319, "learning_rate": 2.8385214200316118e-05, "loss": 0.5861, "num_input_tokens_seen": 39852016, "step": 68630 }, { "epoch": 10.222669049746798, "grad_norm": 1.1010048389434814, "learning_rate": 2.83819946779459e-05, "loss": 0.5608, "num_input_tokens_seen": 39854896, "step": 68635 }, { "epoch": 10.223413762287757, "grad_norm": 1.309432864189148, "learning_rate": 2.8378775098440318e-05, "loss": 0.5741, "num_input_tokens_seen": 39857776, "step": 68640 }, { "epoch": 10.224158474828716, "grad_norm": 1.5534240007400513, "learning_rate": 2.8375555461853764e-05, "loss": 0.4343, "num_input_tokens_seen": 39860688, "step": 68645 }, { "epoch": 10.224903187369675, "grad_norm": 2.12388277053833, "learning_rate": 2.8372335768240626e-05, "loss": 0.6221, "num_input_tokens_seen": 39863504, "step": 68650 }, { "epoch": 10.225647899910635, "grad_norm": 2.7793686389923096, "learning_rate": 2.8369116017655307e-05, "loss": 0.7389, "num_input_tokens_seen": 39866544, "step": 68655 }, { "epoch": 10.226392612451594, "grad_norm": 2.175945281982422, "learning_rate": 2.836589621015219e-05, "loss": 0.4633, "num_input_tokens_seen": 39869392, "step": 68660 }, { "epoch": 10.227137324992553, "grad_norm": 1.7887585163116455, "learning_rate": 2.8362676345785683e-05, "loss": 0.5313, "num_input_tokens_seen": 39872816, "step": 68665 }, { "epoch": 10.227882037533512, "grad_norm": 1.011412262916565, "learning_rate": 2.835945642461018e-05, "loss": 0.8247, "num_input_tokens_seen": 39875696, "step": 68670 }, { "epoch": 10.228626750074472, "grad_norm": 1.539394736289978, "learning_rate": 2.8356236446680073e-05, "loss": 0.5382, "num_input_tokens_seen": 39878224, "step": 68675 }, { "epoch": 10.22937146261543, "grad_norm": 0.8040351271629333, "learning_rate": 2.835301641204976e-05, "loss": 0.6939, "num_input_tokens_seen": 39880880, "step": 68680 }, { "epoch": 10.23011617515639, "grad_norm": 1.3036059141159058, "learning_rate": 2.834979632077364e-05, "loss": 0.6592, "num_input_tokens_seen": 39883728, "step": 68685 }, { "epoch": 10.230860887697348, "grad_norm": 1.4663805961608887, "learning_rate": 2.834657617290612e-05, "loss": 0.6621, "num_input_tokens_seen": 39886768, "step": 68690 }, { "epoch": 10.231605600238307, "grad_norm": 1.7522010803222656, "learning_rate": 2.8343355968501596e-05, "loss": 0.9175, "num_input_tokens_seen": 39889808, "step": 68695 }, { "epoch": 10.232350312779268, "grad_norm": 1.9334428310394287, "learning_rate": 2.8340135707614467e-05, "loss": 0.6916, "num_input_tokens_seen": 39892784, "step": 68700 }, { "epoch": 10.233095025320226, "grad_norm": 1.233311653137207, "learning_rate": 2.8336915390299152e-05, "loss": 0.4851, "num_input_tokens_seen": 39895440, "step": 68705 }, { "epoch": 10.233839737861185, "grad_norm": 1.2574695348739624, "learning_rate": 2.8333695016610034e-05, "loss": 0.561, "num_input_tokens_seen": 39898096, "step": 68710 }, { "epoch": 10.234584450402144, "grad_norm": 1.2242047786712646, "learning_rate": 2.833047458660153e-05, "loss": 0.6118, "num_input_tokens_seen": 39901424, "step": 68715 }, { "epoch": 10.235329162943104, "grad_norm": 1.45832097530365, "learning_rate": 2.8327254100328044e-05, "loss": 0.7996, "num_input_tokens_seen": 39904400, "step": 68720 }, { "epoch": 10.236073875484063, "grad_norm": 0.813689112663269, "learning_rate": 2.8324033557843975e-05, "loss": 0.5885, "num_input_tokens_seen": 39906928, "step": 68725 }, { "epoch": 10.236818588025022, "grad_norm": 1.348809003829956, "learning_rate": 2.832081295920374e-05, "loss": 0.7296, "num_input_tokens_seen": 39909744, "step": 68730 }, { "epoch": 10.23756330056598, "grad_norm": 1.1606897115707397, "learning_rate": 2.8317592304461744e-05, "loss": 0.4883, "num_input_tokens_seen": 39912560, "step": 68735 }, { "epoch": 10.238308013106941, "grad_norm": 1.8159927129745483, "learning_rate": 2.8314371593672408e-05, "loss": 0.7009, "num_input_tokens_seen": 39915152, "step": 68740 }, { "epoch": 10.2390527256479, "grad_norm": 2.2751312255859375, "learning_rate": 2.8311150826890122e-05, "loss": 0.7177, "num_input_tokens_seen": 39918000, "step": 68745 }, { "epoch": 10.239797438188859, "grad_norm": 1.6344858407974243, "learning_rate": 2.830793000416931e-05, "loss": 0.6629, "num_input_tokens_seen": 39920624, "step": 68750 }, { "epoch": 10.240542150729818, "grad_norm": 1.1037434339523315, "learning_rate": 2.8304709125564382e-05, "loss": 0.5055, "num_input_tokens_seen": 39923792, "step": 68755 }, { "epoch": 10.241286863270778, "grad_norm": 2.122793674468994, "learning_rate": 2.8301488191129756e-05, "loss": 0.7386, "num_input_tokens_seen": 39926608, "step": 68760 }, { "epoch": 10.242031575811737, "grad_norm": 2.994082450866699, "learning_rate": 2.8298267200919836e-05, "loss": 0.6385, "num_input_tokens_seen": 39929296, "step": 68765 }, { "epoch": 10.242776288352696, "grad_norm": 1.9395602941513062, "learning_rate": 2.8295046154989047e-05, "loss": 0.7085, "num_input_tokens_seen": 39932144, "step": 68770 }, { "epoch": 10.243521000893654, "grad_norm": 1.04889976978302, "learning_rate": 2.8291825053391808e-05, "loss": 0.5805, "num_input_tokens_seen": 39934992, "step": 68775 }, { "epoch": 10.244265713434615, "grad_norm": 0.88669753074646, "learning_rate": 2.828860389618252e-05, "loss": 0.5061, "num_input_tokens_seen": 39938000, "step": 68780 }, { "epoch": 10.245010425975574, "grad_norm": 2.3476390838623047, "learning_rate": 2.8285382683415617e-05, "loss": 0.6463, "num_input_tokens_seen": 39940880, "step": 68785 }, { "epoch": 10.245755138516532, "grad_norm": 1.3195220232009888, "learning_rate": 2.8282161415145513e-05, "loss": 0.8167, "num_input_tokens_seen": 39943536, "step": 68790 }, { "epoch": 10.246499851057491, "grad_norm": 1.2934436798095703, "learning_rate": 2.827894009142663e-05, "loss": 0.4834, "num_input_tokens_seen": 39946352, "step": 68795 }, { "epoch": 10.247244563598452, "grad_norm": 1.6830790042877197, "learning_rate": 2.827571871231338e-05, "loss": 0.4525, "num_input_tokens_seen": 39949200, "step": 68800 }, { "epoch": 10.24798927613941, "grad_norm": 1.445699691772461, "learning_rate": 2.82724972778602e-05, "loss": 0.5778, "num_input_tokens_seen": 39952144, "step": 68805 }, { "epoch": 10.24873398868037, "grad_norm": 1.1581844091415405, "learning_rate": 2.8269275788121503e-05, "loss": 0.8922, "num_input_tokens_seen": 39955376, "step": 68810 }, { "epoch": 10.249478701221328, "grad_norm": 1.3869397640228271, "learning_rate": 2.8266054243151708e-05, "loss": 0.5503, "num_input_tokens_seen": 39958096, "step": 68815 }, { "epoch": 10.250223413762289, "grad_norm": 1.286991834640503, "learning_rate": 2.8262832643005242e-05, "loss": 0.6752, "num_input_tokens_seen": 39960816, "step": 68820 }, { "epoch": 10.250968126303247, "grad_norm": 0.8418287634849548, "learning_rate": 2.8259610987736545e-05, "loss": 0.7114, "num_input_tokens_seen": 39963984, "step": 68825 }, { "epoch": 10.251712838844206, "grad_norm": 1.9278031587600708, "learning_rate": 2.825638927740003e-05, "loss": 0.6427, "num_input_tokens_seen": 39966928, "step": 68830 }, { "epoch": 10.252457551385165, "grad_norm": 1.0767046213150024, "learning_rate": 2.825316751205013e-05, "loss": 0.5129, "num_input_tokens_seen": 39969840, "step": 68835 }, { "epoch": 10.253202263926125, "grad_norm": 1.0977197885513306, "learning_rate": 2.8249945691741276e-05, "loss": 0.5845, "num_input_tokens_seen": 39972784, "step": 68840 }, { "epoch": 10.253946976467084, "grad_norm": 1.0719780921936035, "learning_rate": 2.824672381652788e-05, "loss": 0.3592, "num_input_tokens_seen": 39975632, "step": 68845 }, { "epoch": 10.254691689008043, "grad_norm": 2.0232677459716797, "learning_rate": 2.8243501886464392e-05, "loss": 0.8693, "num_input_tokens_seen": 39978480, "step": 68850 }, { "epoch": 10.255436401549002, "grad_norm": 1.6728121042251587, "learning_rate": 2.8240279901605238e-05, "loss": 0.5599, "num_input_tokens_seen": 39981328, "step": 68855 }, { "epoch": 10.256181114089962, "grad_norm": 1.2474160194396973, "learning_rate": 2.823705786200484e-05, "loss": 0.6025, "num_input_tokens_seen": 39984048, "step": 68860 }, { "epoch": 10.256925826630921, "grad_norm": 1.2055753469467163, "learning_rate": 2.8233835767717642e-05, "loss": 0.5283, "num_input_tokens_seen": 39986928, "step": 68865 }, { "epoch": 10.25767053917188, "grad_norm": 1.4718286991119385, "learning_rate": 2.8230613618798086e-05, "loss": 0.5722, "num_input_tokens_seen": 39990096, "step": 68870 }, { "epoch": 10.258415251712838, "grad_norm": 2.0007660388946533, "learning_rate": 2.822739141530059e-05, "loss": 0.764, "num_input_tokens_seen": 39992912, "step": 68875 }, { "epoch": 10.259159964253797, "grad_norm": 1.8009998798370361, "learning_rate": 2.8224169157279597e-05, "loss": 0.6359, "num_input_tokens_seen": 39995920, "step": 68880 }, { "epoch": 10.259904676794758, "grad_norm": 1.6475015878677368, "learning_rate": 2.8220946844789535e-05, "loss": 0.5537, "num_input_tokens_seen": 39999088, "step": 68885 }, { "epoch": 10.260649389335716, "grad_norm": 1.2087641954421997, "learning_rate": 2.8217724477884854e-05, "loss": 0.5087, "num_input_tokens_seen": 40002192, "step": 68890 }, { "epoch": 10.261394101876675, "grad_norm": 1.0671244859695435, "learning_rate": 2.821450205661999e-05, "loss": 0.6202, "num_input_tokens_seen": 40005040, "step": 68895 }, { "epoch": 10.262138814417634, "grad_norm": 1.5462349653244019, "learning_rate": 2.8211279581049384e-05, "loss": 0.7992, "num_input_tokens_seen": 40007760, "step": 68900 }, { "epoch": 10.262883526958595, "grad_norm": 1.0498631000518799, "learning_rate": 2.8208057051227473e-05, "loss": 0.3967, "num_input_tokens_seen": 40010640, "step": 68905 }, { "epoch": 10.263628239499553, "grad_norm": 2.1310789585113525, "learning_rate": 2.820483446720869e-05, "loss": 0.6786, "num_input_tokens_seen": 40013488, "step": 68910 }, { "epoch": 10.264372952040512, "grad_norm": 1.466503381729126, "learning_rate": 2.8201611829047498e-05, "loss": 0.6534, "num_input_tokens_seen": 40016368, "step": 68915 }, { "epoch": 10.26511766458147, "grad_norm": 1.401429533958435, "learning_rate": 2.819838913679832e-05, "loss": 0.4825, "num_input_tokens_seen": 40019376, "step": 68920 }, { "epoch": 10.265862377122431, "grad_norm": 1.9873735904693604, "learning_rate": 2.819516639051561e-05, "loss": 0.5783, "num_input_tokens_seen": 40022768, "step": 68925 }, { "epoch": 10.26660708966339, "grad_norm": 0.7389929890632629, "learning_rate": 2.8191943590253806e-05, "loss": 0.4279, "num_input_tokens_seen": 40025616, "step": 68930 }, { "epoch": 10.267351802204349, "grad_norm": 3.4056894779205322, "learning_rate": 2.8188720736067364e-05, "loss": 0.694, "num_input_tokens_seen": 40028560, "step": 68935 }, { "epoch": 10.268096514745308, "grad_norm": 1.321363091468811, "learning_rate": 2.818549782801073e-05, "loss": 0.43, "num_input_tokens_seen": 40031600, "step": 68940 }, { "epoch": 10.268841227286268, "grad_norm": 0.9525887966156006, "learning_rate": 2.8182274866138343e-05, "loss": 0.5993, "num_input_tokens_seen": 40034576, "step": 68945 }, { "epoch": 10.269585939827227, "grad_norm": 0.8823301196098328, "learning_rate": 2.8179051850504656e-05, "loss": 0.6242, "num_input_tokens_seen": 40037584, "step": 68950 }, { "epoch": 10.270330652368186, "grad_norm": 1.4398236274719238, "learning_rate": 2.8175828781164127e-05, "loss": 0.8178, "num_input_tokens_seen": 40040496, "step": 68955 }, { "epoch": 10.271075364909144, "grad_norm": 1.1807286739349365, "learning_rate": 2.8172605658171192e-05, "loss": 0.5997, "num_input_tokens_seen": 40043696, "step": 68960 }, { "epoch": 10.271820077450105, "grad_norm": 1.924539566040039, "learning_rate": 2.8169382481580303e-05, "loss": 0.6092, "num_input_tokens_seen": 40046480, "step": 68965 }, { "epoch": 10.272564789991064, "grad_norm": 0.8627010583877563, "learning_rate": 2.8166159251445928e-05, "loss": 0.5533, "num_input_tokens_seen": 40049488, "step": 68970 }, { "epoch": 10.273309502532022, "grad_norm": 2.2628297805786133, "learning_rate": 2.8162935967822505e-05, "loss": 0.631, "num_input_tokens_seen": 40052336, "step": 68975 }, { "epoch": 10.274054215072981, "grad_norm": 1.0719592571258545, "learning_rate": 2.8159712630764494e-05, "loss": 0.6099, "num_input_tokens_seen": 40055120, "step": 68980 }, { "epoch": 10.274798927613942, "grad_norm": 0.8036710619926453, "learning_rate": 2.815648924032635e-05, "loss": 0.4535, "num_input_tokens_seen": 40057808, "step": 68985 }, { "epoch": 10.2755436401549, "grad_norm": 1.823050856590271, "learning_rate": 2.8153265796562528e-05, "loss": 0.5423, "num_input_tokens_seen": 40060528, "step": 68990 }, { "epoch": 10.27628835269586, "grad_norm": 2.4857499599456787, "learning_rate": 2.815004229952749e-05, "loss": 0.6223, "num_input_tokens_seen": 40063280, "step": 68995 }, { "epoch": 10.277033065236818, "grad_norm": 1.2553154230117798, "learning_rate": 2.8146818749275684e-05, "loss": 0.4713, "num_input_tokens_seen": 40066256, "step": 69000 }, { "epoch": 10.277777777777779, "grad_norm": 1.7573466300964355, "learning_rate": 2.814359514586158e-05, "loss": 0.6268, "num_input_tokens_seen": 40069232, "step": 69005 }, { "epoch": 10.278522490318737, "grad_norm": 0.9891617298126221, "learning_rate": 2.8140371489339624e-05, "loss": 0.596, "num_input_tokens_seen": 40071952, "step": 69010 }, { "epoch": 10.279267202859696, "grad_norm": 1.7367095947265625, "learning_rate": 2.8137147779764285e-05, "loss": 0.5186, "num_input_tokens_seen": 40074768, "step": 69015 }, { "epoch": 10.280011915400655, "grad_norm": 1.7877167463302612, "learning_rate": 2.8133924017190023e-05, "loss": 0.5282, "num_input_tokens_seen": 40077616, "step": 69020 }, { "epoch": 10.280756627941615, "grad_norm": 1.3598712682724, "learning_rate": 2.8130700201671296e-05, "loss": 0.5302, "num_input_tokens_seen": 40080560, "step": 69025 }, { "epoch": 10.281501340482574, "grad_norm": 1.9279847145080566, "learning_rate": 2.812747633326257e-05, "loss": 0.6978, "num_input_tokens_seen": 40083408, "step": 69030 }, { "epoch": 10.282246053023533, "grad_norm": 2.0133163928985596, "learning_rate": 2.812425241201832e-05, "loss": 0.5839, "num_input_tokens_seen": 40086352, "step": 69035 }, { "epoch": 10.282990765564492, "grad_norm": 1.880751609802246, "learning_rate": 2.8121028437993002e-05, "loss": 0.6845, "num_input_tokens_seen": 40089264, "step": 69040 }, { "epoch": 10.283735478105452, "grad_norm": 0.9913706183433533, "learning_rate": 2.8117804411241074e-05, "loss": 0.5334, "num_input_tokens_seen": 40092272, "step": 69045 }, { "epoch": 10.284480190646411, "grad_norm": 1.5377130508422852, "learning_rate": 2.8114580331817004e-05, "loss": 0.5193, "num_input_tokens_seen": 40095056, "step": 69050 }, { "epoch": 10.28522490318737, "grad_norm": 2.4632675647735596, "learning_rate": 2.8111356199775268e-05, "loss": 0.4688, "num_input_tokens_seen": 40098288, "step": 69055 }, { "epoch": 10.285969615728328, "grad_norm": 0.7484036684036255, "learning_rate": 2.8108132015170337e-05, "loss": 0.4147, "num_input_tokens_seen": 40101232, "step": 69060 }, { "epoch": 10.286714328269287, "grad_norm": 1.1591688394546509, "learning_rate": 2.8104907778056667e-05, "loss": 0.6079, "num_input_tokens_seen": 40104048, "step": 69065 }, { "epoch": 10.287459040810248, "grad_norm": 1.4459545612335205, "learning_rate": 2.8101683488488745e-05, "loss": 0.6874, "num_input_tokens_seen": 40107024, "step": 69070 }, { "epoch": 10.288203753351207, "grad_norm": 1.5657869577407837, "learning_rate": 2.8098459146521026e-05, "loss": 0.6478, "num_input_tokens_seen": 40109904, "step": 69075 }, { "epoch": 10.288948465892165, "grad_norm": 1.3406628370285034, "learning_rate": 2.8095234752207993e-05, "loss": 0.4621, "num_input_tokens_seen": 40112816, "step": 69080 }, { "epoch": 10.289693178433124, "grad_norm": 1.2950239181518555, "learning_rate": 2.809201030560411e-05, "loss": 0.438, "num_input_tokens_seen": 40115664, "step": 69085 }, { "epoch": 10.290437890974085, "grad_norm": 1.0497945547103882, "learning_rate": 2.8088785806763856e-05, "loss": 0.7244, "num_input_tokens_seen": 40118128, "step": 69090 }, { "epoch": 10.291182603515043, "grad_norm": 1.1542425155639648, "learning_rate": 2.8085561255741704e-05, "loss": 0.5279, "num_input_tokens_seen": 40121232, "step": 69095 }, { "epoch": 10.291927316056002, "grad_norm": 1.767468810081482, "learning_rate": 2.8082336652592135e-05, "loss": 0.649, "num_input_tokens_seen": 40124304, "step": 69100 }, { "epoch": 10.29267202859696, "grad_norm": 1.9380546808242798, "learning_rate": 2.8079111997369624e-05, "loss": 0.6947, "num_input_tokens_seen": 40127056, "step": 69105 }, { "epoch": 10.293416741137921, "grad_norm": 1.9486606121063232, "learning_rate": 2.807588729012864e-05, "loss": 1.0383, "num_input_tokens_seen": 40129872, "step": 69110 }, { "epoch": 10.29416145367888, "grad_norm": 2.5609776973724365, "learning_rate": 2.8072662530923666e-05, "loss": 0.6006, "num_input_tokens_seen": 40132912, "step": 69115 }, { "epoch": 10.294906166219839, "grad_norm": 2.9686098098754883, "learning_rate": 2.8069437719809182e-05, "loss": 0.4985, "num_input_tokens_seen": 40135568, "step": 69120 }, { "epoch": 10.295650878760798, "grad_norm": 1.4083658456802368, "learning_rate": 2.806621285683967e-05, "loss": 0.6988, "num_input_tokens_seen": 40138192, "step": 69125 }, { "epoch": 10.296395591301758, "grad_norm": 1.6424938440322876, "learning_rate": 2.8062987942069603e-05, "loss": 0.6048, "num_input_tokens_seen": 40141264, "step": 69130 }, { "epoch": 10.297140303842717, "grad_norm": 1.0842571258544922, "learning_rate": 2.8059762975553478e-05, "loss": 0.6463, "num_input_tokens_seen": 40144112, "step": 69135 }, { "epoch": 10.297885016383676, "grad_norm": 1.0353013277053833, "learning_rate": 2.8056537957345757e-05, "loss": 0.7468, "num_input_tokens_seen": 40147120, "step": 69140 }, { "epoch": 10.298629728924634, "grad_norm": 2.310957908630371, "learning_rate": 2.8053312887500936e-05, "loss": 0.6056, "num_input_tokens_seen": 40150064, "step": 69145 }, { "epoch": 10.299374441465595, "grad_norm": 1.3006153106689453, "learning_rate": 2.8050087766073496e-05, "loss": 0.4408, "num_input_tokens_seen": 40153040, "step": 69150 }, { "epoch": 10.300119154006554, "grad_norm": 1.3298200368881226, "learning_rate": 2.804686259311792e-05, "loss": 0.6582, "num_input_tokens_seen": 40156144, "step": 69155 }, { "epoch": 10.300863866547513, "grad_norm": 1.1015076637268066, "learning_rate": 2.8043637368688707e-05, "loss": 0.6831, "num_input_tokens_seen": 40159056, "step": 69160 }, { "epoch": 10.301608579088471, "grad_norm": 1.673079490661621, "learning_rate": 2.804041209284033e-05, "loss": 0.5107, "num_input_tokens_seen": 40161968, "step": 69165 }, { "epoch": 10.302353291629432, "grad_norm": 0.9837141036987305, "learning_rate": 2.803718676562729e-05, "loss": 0.4645, "num_input_tokens_seen": 40165008, "step": 69170 }, { "epoch": 10.30309800417039, "grad_norm": 1.1696518659591675, "learning_rate": 2.803396138710405e-05, "loss": 0.7008, "num_input_tokens_seen": 40168048, "step": 69175 }, { "epoch": 10.30384271671135, "grad_norm": 1.4864475727081299, "learning_rate": 2.8030735957325122e-05, "loss": 0.468, "num_input_tokens_seen": 40170832, "step": 69180 }, { "epoch": 10.304587429252308, "grad_norm": 0.7728701233863831, "learning_rate": 2.8027510476344986e-05, "loss": 0.4883, "num_input_tokens_seen": 40173840, "step": 69185 }, { "epoch": 10.305332141793269, "grad_norm": 2.2649154663085938, "learning_rate": 2.8024284944218145e-05, "loss": 0.6149, "num_input_tokens_seen": 40176688, "step": 69190 }, { "epoch": 10.306076854334227, "grad_norm": 2.1530706882476807, "learning_rate": 2.802105936099908e-05, "loss": 0.801, "num_input_tokens_seen": 40179184, "step": 69195 }, { "epoch": 10.306821566875186, "grad_norm": 1.7247978448867798, "learning_rate": 2.8017833726742293e-05, "loss": 0.6255, "num_input_tokens_seen": 40182128, "step": 69200 }, { "epoch": 10.307566279416145, "grad_norm": 1.6003923416137695, "learning_rate": 2.8014608041502273e-05, "loss": 0.7007, "num_input_tokens_seen": 40185008, "step": 69205 }, { "epoch": 10.308310991957104, "grad_norm": 3.092302083969116, "learning_rate": 2.8011382305333505e-05, "loss": 0.7375, "num_input_tokens_seen": 40187792, "step": 69210 }, { "epoch": 10.309055704498064, "grad_norm": 1.2573459148406982, "learning_rate": 2.8008156518290496e-05, "loss": 0.6013, "num_input_tokens_seen": 40190896, "step": 69215 }, { "epoch": 10.309800417039023, "grad_norm": 1.4079228639602661, "learning_rate": 2.8004930680427742e-05, "loss": 0.5674, "num_input_tokens_seen": 40193936, "step": 69220 }, { "epoch": 10.310545129579982, "grad_norm": 1.5919255018234253, "learning_rate": 2.8001704791799732e-05, "loss": 0.7839, "num_input_tokens_seen": 40196880, "step": 69225 }, { "epoch": 10.31128984212094, "grad_norm": 0.9000779986381531, "learning_rate": 2.799847885246098e-05, "loss": 0.6079, "num_input_tokens_seen": 40199696, "step": 69230 }, { "epoch": 10.312034554661901, "grad_norm": 1.6917787790298462, "learning_rate": 2.799525286246597e-05, "loss": 0.7735, "num_input_tokens_seen": 40202928, "step": 69235 }, { "epoch": 10.31277926720286, "grad_norm": 1.5584036111831665, "learning_rate": 2.7992026821869215e-05, "loss": 0.7, "num_input_tokens_seen": 40205872, "step": 69240 }, { "epoch": 10.313523979743819, "grad_norm": 1.14019775390625, "learning_rate": 2.7988800730725202e-05, "loss": 0.5951, "num_input_tokens_seen": 40208656, "step": 69245 }, { "epoch": 10.314268692284777, "grad_norm": 1.152948021888733, "learning_rate": 2.7985574589088437e-05, "loss": 0.6658, "num_input_tokens_seen": 40212016, "step": 69250 }, { "epoch": 10.315013404825738, "grad_norm": 1.5050712823867798, "learning_rate": 2.798234839701342e-05, "loss": 0.5826, "num_input_tokens_seen": 40214928, "step": 69255 }, { "epoch": 10.315758117366697, "grad_norm": 1.8233790397644043, "learning_rate": 2.797912215455466e-05, "loss": 0.6434, "num_input_tokens_seen": 40218000, "step": 69260 }, { "epoch": 10.316502829907655, "grad_norm": 1.5539400577545166, "learning_rate": 2.797589586176666e-05, "loss": 0.7115, "num_input_tokens_seen": 40221072, "step": 69265 }, { "epoch": 10.317247542448614, "grad_norm": 2.0953121185302734, "learning_rate": 2.797266951870393e-05, "loss": 0.7271, "num_input_tokens_seen": 40223888, "step": 69270 }, { "epoch": 10.317992254989575, "grad_norm": 1.792137622833252, "learning_rate": 2.7969443125420963e-05, "loss": 0.5321, "num_input_tokens_seen": 40226480, "step": 69275 }, { "epoch": 10.318736967530533, "grad_norm": 2.014342784881592, "learning_rate": 2.7966216681972278e-05, "loss": 0.4132, "num_input_tokens_seen": 40229584, "step": 69280 }, { "epoch": 10.319481680071492, "grad_norm": 1.8714901208877563, "learning_rate": 2.7962990188412375e-05, "loss": 0.744, "num_input_tokens_seen": 40232528, "step": 69285 }, { "epoch": 10.320226392612451, "grad_norm": 1.6422808170318604, "learning_rate": 2.7959763644795762e-05, "loss": 0.5426, "num_input_tokens_seen": 40235248, "step": 69290 }, { "epoch": 10.320971105153411, "grad_norm": 1.1478012800216675, "learning_rate": 2.7956537051176952e-05, "loss": 0.5485, "num_input_tokens_seen": 40238096, "step": 69295 }, { "epoch": 10.32171581769437, "grad_norm": 2.2035281658172607, "learning_rate": 2.7953310407610455e-05, "loss": 0.4822, "num_input_tokens_seen": 40240720, "step": 69300 }, { "epoch": 10.322460530235329, "grad_norm": 2.1559407711029053, "learning_rate": 2.7950083714150776e-05, "loss": 0.5707, "num_input_tokens_seen": 40243728, "step": 69305 }, { "epoch": 10.323205242776288, "grad_norm": 1.3692927360534668, "learning_rate": 2.794685697085243e-05, "loss": 0.5147, "num_input_tokens_seen": 40246480, "step": 69310 }, { "epoch": 10.323949955317248, "grad_norm": 1.1010710000991821, "learning_rate": 2.7943630177769932e-05, "loss": 0.6939, "num_input_tokens_seen": 40249552, "step": 69315 }, { "epoch": 10.324694667858207, "grad_norm": 1.2341551780700684, "learning_rate": 2.79404033349578e-05, "loss": 0.6214, "num_input_tokens_seen": 40252624, "step": 69320 }, { "epoch": 10.325439380399166, "grad_norm": 1.5234136581420898, "learning_rate": 2.7937176442470535e-05, "loss": 0.594, "num_input_tokens_seen": 40255664, "step": 69325 }, { "epoch": 10.326184092940125, "grad_norm": 1.4289621114730835, "learning_rate": 2.793394950036266e-05, "loss": 0.6345, "num_input_tokens_seen": 40258608, "step": 69330 }, { "epoch": 10.326928805481085, "grad_norm": 1.9922559261322021, "learning_rate": 2.7930722508688696e-05, "loss": 0.6625, "num_input_tokens_seen": 40261264, "step": 69335 }, { "epoch": 10.327673518022044, "grad_norm": 1.282319188117981, "learning_rate": 2.792749546750315e-05, "loss": 0.6937, "num_input_tokens_seen": 40264336, "step": 69340 }, { "epoch": 10.328418230563003, "grad_norm": 1.1357107162475586, "learning_rate": 2.792426837686054e-05, "loss": 0.6016, "num_input_tokens_seen": 40267184, "step": 69345 }, { "epoch": 10.329162943103961, "grad_norm": 2.7254397869110107, "learning_rate": 2.7921041236815387e-05, "loss": 0.6722, "num_input_tokens_seen": 40270000, "step": 69350 }, { "epoch": 10.329907655644922, "grad_norm": 1.7476348876953125, "learning_rate": 2.7917814047422214e-05, "loss": 0.6385, "num_input_tokens_seen": 40272976, "step": 69355 }, { "epoch": 10.33065236818588, "grad_norm": 0.9155638813972473, "learning_rate": 2.7914586808735542e-05, "loss": 0.5675, "num_input_tokens_seen": 40275696, "step": 69360 }, { "epoch": 10.33139708072684, "grad_norm": 1.1556694507598877, "learning_rate": 2.7911359520809886e-05, "loss": 0.5416, "num_input_tokens_seen": 40278416, "step": 69365 }, { "epoch": 10.332141793267798, "grad_norm": 1.0182534456253052, "learning_rate": 2.7908132183699775e-05, "loss": 0.6252, "num_input_tokens_seen": 40281392, "step": 69370 }, { "epoch": 10.332886505808759, "grad_norm": 1.3363780975341797, "learning_rate": 2.790490479745972e-05, "loss": 0.5339, "num_input_tokens_seen": 40284240, "step": 69375 }, { "epoch": 10.333631218349717, "grad_norm": 1.232374906539917, "learning_rate": 2.7901677362144252e-05, "loss": 0.5432, "num_input_tokens_seen": 40287376, "step": 69380 }, { "epoch": 10.334375930890676, "grad_norm": 0.9742134809494019, "learning_rate": 2.7898449877807885e-05, "loss": 0.6352, "num_input_tokens_seen": 40291056, "step": 69385 }, { "epoch": 10.335120643431635, "grad_norm": 1.5318297147750854, "learning_rate": 2.7895222344505163e-05, "loss": 0.4919, "num_input_tokens_seen": 40293776, "step": 69390 }, { "epoch": 10.335865355972594, "grad_norm": 2.311784029006958, "learning_rate": 2.78919947622906e-05, "loss": 0.7405, "num_input_tokens_seen": 40296720, "step": 69395 }, { "epoch": 10.336610068513554, "grad_norm": 1.223594069480896, "learning_rate": 2.788876713121873e-05, "loss": 0.4914, "num_input_tokens_seen": 40299568, "step": 69400 }, { "epoch": 10.337354781054513, "grad_norm": 1.557755708694458, "learning_rate": 2.7885539451344077e-05, "loss": 0.6791, "num_input_tokens_seen": 40302320, "step": 69405 }, { "epoch": 10.338099493595472, "grad_norm": 1.100765347480774, "learning_rate": 2.788231172272116e-05, "loss": 0.6118, "num_input_tokens_seen": 40304848, "step": 69410 }, { "epoch": 10.33884420613643, "grad_norm": 1.4151002168655396, "learning_rate": 2.7879083945404517e-05, "loss": 0.4431, "num_input_tokens_seen": 40307632, "step": 69415 }, { "epoch": 10.339588918677391, "grad_norm": 2.153714656829834, "learning_rate": 2.7875856119448672e-05, "loss": 0.6043, "num_input_tokens_seen": 40310960, "step": 69420 }, { "epoch": 10.34033363121835, "grad_norm": 1.9473681449890137, "learning_rate": 2.7872628244908167e-05, "loss": 0.5526, "num_input_tokens_seen": 40313744, "step": 69425 }, { "epoch": 10.341078343759309, "grad_norm": 0.9250386357307434, "learning_rate": 2.7869400321837525e-05, "loss": 0.7478, "num_input_tokens_seen": 40316976, "step": 69430 }, { "epoch": 10.341823056300267, "grad_norm": 2.368298292160034, "learning_rate": 2.7866172350291286e-05, "loss": 0.7754, "num_input_tokens_seen": 40319664, "step": 69435 }, { "epoch": 10.342567768841228, "grad_norm": 0.7851019501686096, "learning_rate": 2.7862944330323982e-05, "loss": 0.5485, "num_input_tokens_seen": 40322768, "step": 69440 }, { "epoch": 10.343312481382187, "grad_norm": 1.3965610265731812, "learning_rate": 2.785971626199013e-05, "loss": 0.6282, "num_input_tokens_seen": 40325680, "step": 69445 }, { "epoch": 10.344057193923145, "grad_norm": 1.4823557138442993, "learning_rate": 2.7856488145344285e-05, "loss": 0.716, "num_input_tokens_seen": 40328720, "step": 69450 }, { "epoch": 10.344801906464104, "grad_norm": 1.1561024188995361, "learning_rate": 2.785325998044097e-05, "loss": 0.7224, "num_input_tokens_seen": 40331408, "step": 69455 }, { "epoch": 10.345546619005065, "grad_norm": 0.9161731600761414, "learning_rate": 2.7850031767334734e-05, "loss": 0.5346, "num_input_tokens_seen": 40334384, "step": 69460 }, { "epoch": 10.346291331546023, "grad_norm": 1.4930614233016968, "learning_rate": 2.784680350608011e-05, "loss": 0.6101, "num_input_tokens_seen": 40337104, "step": 69465 }, { "epoch": 10.347036044086982, "grad_norm": 1.4063549041748047, "learning_rate": 2.784357519673163e-05, "loss": 0.353, "num_input_tokens_seen": 40339888, "step": 69470 }, { "epoch": 10.347780756627941, "grad_norm": 2.2657623291015625, "learning_rate": 2.784034683934384e-05, "loss": 0.6582, "num_input_tokens_seen": 40342992, "step": 69475 }, { "epoch": 10.348525469168901, "grad_norm": 1.2017754316329956, "learning_rate": 2.7837118433971277e-05, "loss": 0.5775, "num_input_tokens_seen": 40345840, "step": 69480 }, { "epoch": 10.34927018170986, "grad_norm": 0.8656206130981445, "learning_rate": 2.7833889980668476e-05, "loss": 0.7137, "num_input_tokens_seen": 40348880, "step": 69485 }, { "epoch": 10.350014894250819, "grad_norm": 0.8204964995384216, "learning_rate": 2.7830661479489987e-05, "loss": 0.7015, "num_input_tokens_seen": 40351664, "step": 69490 }, { "epoch": 10.350759606791778, "grad_norm": 1.36091148853302, "learning_rate": 2.782743293049035e-05, "loss": 0.5991, "num_input_tokens_seen": 40354576, "step": 69495 }, { "epoch": 10.351504319332738, "grad_norm": 2.46189546585083, "learning_rate": 2.7824204333724115e-05, "loss": 0.6548, "num_input_tokens_seen": 40357296, "step": 69500 }, { "epoch": 10.352249031873697, "grad_norm": 1.7294831275939941, "learning_rate": 2.7820975689245805e-05, "loss": 0.7296, "num_input_tokens_seen": 40360624, "step": 69505 }, { "epoch": 10.352993744414656, "grad_norm": 1.0441806316375732, "learning_rate": 2.7817746997109983e-05, "loss": 0.6033, "num_input_tokens_seen": 40363664, "step": 69510 }, { "epoch": 10.353738456955615, "grad_norm": 1.942301630973816, "learning_rate": 2.7814518257371187e-05, "loss": 0.6349, "num_input_tokens_seen": 40366416, "step": 69515 }, { "epoch": 10.354483169496575, "grad_norm": 1.0927598476409912, "learning_rate": 2.7811289470083972e-05, "loss": 0.4165, "num_input_tokens_seen": 40369168, "step": 69520 }, { "epoch": 10.355227882037534, "grad_norm": 2.917510986328125, "learning_rate": 2.7808060635302875e-05, "loss": 0.4864, "num_input_tokens_seen": 40371984, "step": 69525 }, { "epoch": 10.355972594578493, "grad_norm": 1.4008917808532715, "learning_rate": 2.7804831753082445e-05, "loss": 0.6014, "num_input_tokens_seen": 40374768, "step": 69530 }, { "epoch": 10.356717307119451, "grad_norm": 2.4023447036743164, "learning_rate": 2.7801602823477236e-05, "loss": 0.6442, "num_input_tokens_seen": 40377648, "step": 69535 }, { "epoch": 10.357462019660412, "grad_norm": 2.5945217609405518, "learning_rate": 2.779837384654179e-05, "loss": 0.7009, "num_input_tokens_seen": 40380144, "step": 69540 }, { "epoch": 10.35820673220137, "grad_norm": 2.3389008045196533, "learning_rate": 2.7795144822330673e-05, "loss": 0.5342, "num_input_tokens_seen": 40383152, "step": 69545 }, { "epoch": 10.35895144474233, "grad_norm": 0.8215388655662537, "learning_rate": 2.7791915750898413e-05, "loss": 0.4297, "num_input_tokens_seen": 40386064, "step": 69550 }, { "epoch": 10.359696157283288, "grad_norm": 1.3163065910339355, "learning_rate": 2.7788686632299577e-05, "loss": 0.6431, "num_input_tokens_seen": 40389136, "step": 69555 }, { "epoch": 10.360440869824249, "grad_norm": 1.8736073970794678, "learning_rate": 2.778545746658872e-05, "loss": 0.7783, "num_input_tokens_seen": 40391856, "step": 69560 }, { "epoch": 10.361185582365207, "grad_norm": 1.5056393146514893, "learning_rate": 2.7782228253820385e-05, "loss": 0.6591, "num_input_tokens_seen": 40394832, "step": 69565 }, { "epoch": 10.361930294906166, "grad_norm": 1.3059808015823364, "learning_rate": 2.777899899404914e-05, "loss": 0.6884, "num_input_tokens_seen": 40397776, "step": 69570 }, { "epoch": 10.362675007447125, "grad_norm": 1.0905972719192505, "learning_rate": 2.777576968732952e-05, "loss": 0.7289, "num_input_tokens_seen": 40400720, "step": 69575 }, { "epoch": 10.363419719988084, "grad_norm": 2.019383668899536, "learning_rate": 2.7772540333716102e-05, "loss": 0.585, "num_input_tokens_seen": 40403472, "step": 69580 }, { "epoch": 10.364164432529044, "grad_norm": 0.7455946803092957, "learning_rate": 2.7769310933263425e-05, "loss": 0.4871, "num_input_tokens_seen": 40406160, "step": 69585 }, { "epoch": 10.364909145070003, "grad_norm": 1.1121091842651367, "learning_rate": 2.776608148602605e-05, "loss": 0.5632, "num_input_tokens_seen": 40409008, "step": 69590 }, { "epoch": 10.365653857610962, "grad_norm": 2.0245048999786377, "learning_rate": 2.7762851992058548e-05, "loss": 0.6363, "num_input_tokens_seen": 40411856, "step": 69595 }, { "epoch": 10.36639857015192, "grad_norm": 0.9518625140190125, "learning_rate": 2.7759622451415473e-05, "loss": 0.6058, "num_input_tokens_seen": 40414928, "step": 69600 }, { "epoch": 10.367143282692881, "grad_norm": 1.3287315368652344, "learning_rate": 2.775639286415138e-05, "loss": 0.5477, "num_input_tokens_seen": 40418160, "step": 69605 }, { "epoch": 10.36788799523384, "grad_norm": 1.558552861213684, "learning_rate": 2.7753163230320828e-05, "loss": 0.6637, "num_input_tokens_seen": 40421392, "step": 69610 }, { "epoch": 10.368632707774799, "grad_norm": 1.524291753768921, "learning_rate": 2.774993354997838e-05, "loss": 0.678, "num_input_tokens_seen": 40424496, "step": 69615 }, { "epoch": 10.369377420315757, "grad_norm": 1.4780083894729614, "learning_rate": 2.77467038231786e-05, "loss": 0.6315, "num_input_tokens_seen": 40427056, "step": 69620 }, { "epoch": 10.370122132856718, "grad_norm": 1.5435341596603394, "learning_rate": 2.7743474049976054e-05, "loss": 0.7482, "num_input_tokens_seen": 40430000, "step": 69625 }, { "epoch": 10.370866845397677, "grad_norm": 0.976300835609436, "learning_rate": 2.77402442304253e-05, "loss": 0.6365, "num_input_tokens_seen": 40432976, "step": 69630 }, { "epoch": 10.371611557938635, "grad_norm": 1.3669710159301758, "learning_rate": 2.7737014364580904e-05, "loss": 0.6931, "num_input_tokens_seen": 40436016, "step": 69635 }, { "epoch": 10.372356270479594, "grad_norm": 1.6449122428894043, "learning_rate": 2.7733784452497436e-05, "loss": 0.7826, "num_input_tokens_seen": 40438992, "step": 69640 }, { "epoch": 10.373100983020555, "grad_norm": 1.6196736097335815, "learning_rate": 2.7730554494229453e-05, "loss": 0.4585, "num_input_tokens_seen": 40441872, "step": 69645 }, { "epoch": 10.373845695561513, "grad_norm": 0.9773943424224854, "learning_rate": 2.772732448983153e-05, "loss": 0.7206, "num_input_tokens_seen": 40444752, "step": 69650 }, { "epoch": 10.374590408102472, "grad_norm": 2.0312860012054443, "learning_rate": 2.7724094439358227e-05, "loss": 0.5024, "num_input_tokens_seen": 40447792, "step": 69655 }, { "epoch": 10.375335120643431, "grad_norm": 1.4928885698318481, "learning_rate": 2.7720864342864123e-05, "loss": 0.5645, "num_input_tokens_seen": 40450480, "step": 69660 }, { "epoch": 10.376079833184392, "grad_norm": 1.5331019163131714, "learning_rate": 2.771763420040378e-05, "loss": 0.7267, "num_input_tokens_seen": 40453456, "step": 69665 }, { "epoch": 10.37682454572535, "grad_norm": 1.3461036682128906, "learning_rate": 2.771440401203177e-05, "loss": 0.6583, "num_input_tokens_seen": 40456432, "step": 69670 }, { "epoch": 10.377569258266309, "grad_norm": 1.0942431688308716, "learning_rate": 2.7711173777802657e-05, "loss": 0.5971, "num_input_tokens_seen": 40459728, "step": 69675 }, { "epoch": 10.378313970807268, "grad_norm": 1.1738024950027466, "learning_rate": 2.770794349777102e-05, "loss": 0.4554, "num_input_tokens_seen": 40462640, "step": 69680 }, { "epoch": 10.379058683348228, "grad_norm": 1.5516752004623413, "learning_rate": 2.770471317199144e-05, "loss": 0.5092, "num_input_tokens_seen": 40465200, "step": 69685 }, { "epoch": 10.379803395889187, "grad_norm": 3.677921772003174, "learning_rate": 2.7701482800518475e-05, "loss": 0.5818, "num_input_tokens_seen": 40468336, "step": 69690 }, { "epoch": 10.380548108430146, "grad_norm": 1.4152617454528809, "learning_rate": 2.7698252383406696e-05, "loss": 0.6262, "num_input_tokens_seen": 40470992, "step": 69695 }, { "epoch": 10.381292820971105, "grad_norm": 1.5398403406143188, "learning_rate": 2.7695021920710694e-05, "loss": 0.5702, "num_input_tokens_seen": 40474032, "step": 69700 }, { "epoch": 10.382037533512065, "grad_norm": 2.425283908843994, "learning_rate": 2.7691791412485035e-05, "loss": 0.6649, "num_input_tokens_seen": 40476688, "step": 69705 }, { "epoch": 10.382782246053024, "grad_norm": 1.0345772504806519, "learning_rate": 2.768856085878429e-05, "loss": 0.5356, "num_input_tokens_seen": 40479216, "step": 69710 }, { "epoch": 10.383526958593983, "grad_norm": 1.6794321537017822, "learning_rate": 2.7685330259663045e-05, "loss": 0.4948, "num_input_tokens_seen": 40481872, "step": 69715 }, { "epoch": 10.384271671134941, "grad_norm": 1.3091309070587158, "learning_rate": 2.768209961517587e-05, "loss": 0.5272, "num_input_tokens_seen": 40484528, "step": 69720 }, { "epoch": 10.3850163836759, "grad_norm": 2.0294604301452637, "learning_rate": 2.7678868925377362e-05, "loss": 0.7867, "num_input_tokens_seen": 40487312, "step": 69725 }, { "epoch": 10.38576109621686, "grad_norm": 1.6634966135025024, "learning_rate": 2.7675638190322073e-05, "loss": 0.8791, "num_input_tokens_seen": 40490000, "step": 69730 }, { "epoch": 10.38650580875782, "grad_norm": 1.4744926691055298, "learning_rate": 2.7672407410064603e-05, "loss": 0.5386, "num_input_tokens_seen": 40493136, "step": 69735 }, { "epoch": 10.387250521298778, "grad_norm": 1.3215748071670532, "learning_rate": 2.7669176584659522e-05, "loss": 0.574, "num_input_tokens_seen": 40496048, "step": 69740 }, { "epoch": 10.387995233839739, "grad_norm": 0.8678595423698425, "learning_rate": 2.766594571416141e-05, "loss": 0.6593, "num_input_tokens_seen": 40499120, "step": 69745 }, { "epoch": 10.388739946380698, "grad_norm": 1.5937676429748535, "learning_rate": 2.7662714798624865e-05, "loss": 0.6217, "num_input_tokens_seen": 40502160, "step": 69750 }, { "epoch": 10.389484658921656, "grad_norm": 1.464038372039795, "learning_rate": 2.7659483838104456e-05, "loss": 0.6187, "num_input_tokens_seen": 40505168, "step": 69755 }, { "epoch": 10.390229371462615, "grad_norm": 2.174182891845703, "learning_rate": 2.7656252832654766e-05, "loss": 0.5812, "num_input_tokens_seen": 40507728, "step": 69760 }, { "epoch": 10.390974084003574, "grad_norm": 1.984973669052124, "learning_rate": 2.765302178233039e-05, "loss": 0.6232, "num_input_tokens_seen": 40510896, "step": 69765 }, { "epoch": 10.391718796544534, "grad_norm": 1.2047605514526367, "learning_rate": 2.764979068718591e-05, "loss": 0.5511, "num_input_tokens_seen": 40513456, "step": 69770 }, { "epoch": 10.392463509085493, "grad_norm": 1.561309814453125, "learning_rate": 2.7646559547275907e-05, "loss": 0.6482, "num_input_tokens_seen": 40516240, "step": 69775 }, { "epoch": 10.393208221626452, "grad_norm": 1.886002779006958, "learning_rate": 2.7643328362654968e-05, "loss": 0.5189, "num_input_tokens_seen": 40519120, "step": 69780 }, { "epoch": 10.39395293416741, "grad_norm": 1.2484134435653687, "learning_rate": 2.764009713337768e-05, "loss": 0.6713, "num_input_tokens_seen": 40522128, "step": 69785 }, { "epoch": 10.394697646708371, "grad_norm": 1.191986322402954, "learning_rate": 2.763686585949864e-05, "loss": 0.7253, "num_input_tokens_seen": 40524880, "step": 69790 }, { "epoch": 10.39544235924933, "grad_norm": 2.014197587966919, "learning_rate": 2.7633634541072428e-05, "loss": 0.6634, "num_input_tokens_seen": 40527600, "step": 69795 }, { "epoch": 10.396187071790289, "grad_norm": 3.9349026679992676, "learning_rate": 2.763040317815364e-05, "loss": 0.7824, "num_input_tokens_seen": 40530704, "step": 69800 }, { "epoch": 10.396931784331247, "grad_norm": 1.7451645135879517, "learning_rate": 2.7627171770796868e-05, "loss": 0.588, "num_input_tokens_seen": 40533648, "step": 69805 }, { "epoch": 10.397676496872208, "grad_norm": 1.662429690361023, "learning_rate": 2.762394031905669e-05, "loss": 0.6137, "num_input_tokens_seen": 40536528, "step": 69810 }, { "epoch": 10.398421209413167, "grad_norm": 1.0416474342346191, "learning_rate": 2.7620708822987708e-05, "loss": 0.5931, "num_input_tokens_seen": 40539504, "step": 69815 }, { "epoch": 10.399165921954125, "grad_norm": 1.2548701763153076, "learning_rate": 2.7617477282644515e-05, "loss": 0.6452, "num_input_tokens_seen": 40542416, "step": 69820 }, { "epoch": 10.399910634495084, "grad_norm": 1.854875922203064, "learning_rate": 2.7614245698081702e-05, "loss": 0.6934, "num_input_tokens_seen": 40545392, "step": 69825 }, { "epoch": 10.400655347036045, "grad_norm": 1.581376552581787, "learning_rate": 2.7611014069353874e-05, "loss": 0.6669, "num_input_tokens_seen": 40548432, "step": 69830 }, { "epoch": 10.401400059577004, "grad_norm": 1.5073922872543335, "learning_rate": 2.760778239651561e-05, "loss": 0.687, "num_input_tokens_seen": 40551696, "step": 69835 }, { "epoch": 10.402144772117962, "grad_norm": 1.3457915782928467, "learning_rate": 2.760455067962151e-05, "loss": 0.7342, "num_input_tokens_seen": 40554576, "step": 69840 }, { "epoch": 10.402889484658921, "grad_norm": 1.175341010093689, "learning_rate": 2.7601318918726184e-05, "loss": 0.6592, "num_input_tokens_seen": 40557552, "step": 69845 }, { "epoch": 10.403634197199882, "grad_norm": 1.881441593170166, "learning_rate": 2.7598087113884207e-05, "loss": 0.6419, "num_input_tokens_seen": 40560464, "step": 69850 }, { "epoch": 10.40437890974084, "grad_norm": 0.999478816986084, "learning_rate": 2.7594855265150192e-05, "loss": 0.6305, "num_input_tokens_seen": 40563280, "step": 69855 }, { "epoch": 10.405123622281799, "grad_norm": 1.0544164180755615, "learning_rate": 2.7591623372578736e-05, "loss": 0.4625, "num_input_tokens_seen": 40566128, "step": 69860 }, { "epoch": 10.405868334822758, "grad_norm": 2.726513385772705, "learning_rate": 2.758839143622444e-05, "loss": 0.7685, "num_input_tokens_seen": 40568816, "step": 69865 }, { "epoch": 10.406613047363718, "grad_norm": 1.364722728729248, "learning_rate": 2.7585159456141895e-05, "loss": 0.7787, "num_input_tokens_seen": 40571568, "step": 69870 }, { "epoch": 10.407357759904677, "grad_norm": 1.258251428604126, "learning_rate": 2.7581927432385713e-05, "loss": 0.5956, "num_input_tokens_seen": 40574128, "step": 69875 }, { "epoch": 10.408102472445636, "grad_norm": 0.9877480864524841, "learning_rate": 2.757869536501049e-05, "loss": 0.6421, "num_input_tokens_seen": 40577072, "step": 69880 }, { "epoch": 10.408847184986595, "grad_norm": 0.8782561421394348, "learning_rate": 2.7575463254070832e-05, "loss": 0.6058, "num_input_tokens_seen": 40580528, "step": 69885 }, { "epoch": 10.409591897527555, "grad_norm": 1.3694751262664795, "learning_rate": 2.757223109962134e-05, "loss": 0.5823, "num_input_tokens_seen": 40583408, "step": 69890 }, { "epoch": 10.410336610068514, "grad_norm": 1.0840020179748535, "learning_rate": 2.7568998901716613e-05, "loss": 0.4465, "num_input_tokens_seen": 40586384, "step": 69895 }, { "epoch": 10.411081322609473, "grad_norm": 1.3145374059677124, "learning_rate": 2.7565766660411263e-05, "loss": 0.629, "num_input_tokens_seen": 40589264, "step": 69900 }, { "epoch": 10.411826035150431, "grad_norm": 1.463950514793396, "learning_rate": 2.7562534375759898e-05, "loss": 0.7662, "num_input_tokens_seen": 40592016, "step": 69905 }, { "epoch": 10.41257074769139, "grad_norm": 0.9230253100395203, "learning_rate": 2.7559302047817108e-05, "loss": 0.5667, "num_input_tokens_seen": 40594512, "step": 69910 }, { "epoch": 10.41331546023235, "grad_norm": 1.4505631923675537, "learning_rate": 2.755606967663752e-05, "loss": 0.6061, "num_input_tokens_seen": 40597360, "step": 69915 }, { "epoch": 10.41406017277331, "grad_norm": 1.2303212881088257, "learning_rate": 2.755283726227573e-05, "loss": 0.8163, "num_input_tokens_seen": 40600048, "step": 69920 }, { "epoch": 10.414804885314268, "grad_norm": 1.528165578842163, "learning_rate": 2.754960480478635e-05, "loss": 0.5473, "num_input_tokens_seen": 40603216, "step": 69925 }, { "epoch": 10.415549597855227, "grad_norm": 1.2145780324935913, "learning_rate": 2.7546372304223983e-05, "loss": 0.592, "num_input_tokens_seen": 40606064, "step": 69930 }, { "epoch": 10.416294310396188, "grad_norm": 1.0848075151443481, "learning_rate": 2.7543139760643255e-05, "loss": 0.5835, "num_input_tokens_seen": 40608880, "step": 69935 }, { "epoch": 10.417039022937146, "grad_norm": 1.3899682760238647, "learning_rate": 2.7539907174098755e-05, "loss": 0.6894, "num_input_tokens_seen": 40611472, "step": 69940 }, { "epoch": 10.417783735478105, "grad_norm": 1.0562869310379028, "learning_rate": 2.7536674544645108e-05, "loss": 0.6504, "num_input_tokens_seen": 40614128, "step": 69945 }, { "epoch": 10.418528448019064, "grad_norm": 0.7777829170227051, "learning_rate": 2.7533441872336923e-05, "loss": 0.5445, "num_input_tokens_seen": 40616848, "step": 69950 }, { "epoch": 10.419273160560024, "grad_norm": 3.681135416030884, "learning_rate": 2.7530209157228808e-05, "loss": 0.7018, "num_input_tokens_seen": 40619728, "step": 69955 }, { "epoch": 10.420017873100983, "grad_norm": 1.4040296077728271, "learning_rate": 2.752697639937539e-05, "loss": 0.6817, "num_input_tokens_seen": 40622608, "step": 69960 }, { "epoch": 10.420762585641942, "grad_norm": 1.9501433372497559, "learning_rate": 2.752374359883127e-05, "loss": 0.639, "num_input_tokens_seen": 40625648, "step": 69965 }, { "epoch": 10.4215072981829, "grad_norm": 2.224930763244629, "learning_rate": 2.7520510755651068e-05, "loss": 0.7235, "num_input_tokens_seen": 40628688, "step": 69970 }, { "epoch": 10.422252010723861, "grad_norm": 1.533737301826477, "learning_rate": 2.7517277869889395e-05, "loss": 0.5182, "num_input_tokens_seen": 40631632, "step": 69975 }, { "epoch": 10.42299672326482, "grad_norm": 1.6554592847824097, "learning_rate": 2.7514044941600874e-05, "loss": 0.5553, "num_input_tokens_seen": 40634544, "step": 69980 }, { "epoch": 10.423741435805779, "grad_norm": 1.0441458225250244, "learning_rate": 2.7510811970840115e-05, "loss": 0.6188, "num_input_tokens_seen": 40637520, "step": 69985 }, { "epoch": 10.424486148346737, "grad_norm": 1.3182605504989624, "learning_rate": 2.7507578957661746e-05, "loss": 0.5094, "num_input_tokens_seen": 40640304, "step": 69990 }, { "epoch": 10.425230860887698, "grad_norm": 1.1908949613571167, "learning_rate": 2.7504345902120375e-05, "loss": 0.679, "num_input_tokens_seen": 40643024, "step": 69995 }, { "epoch": 10.425975573428657, "grad_norm": 1.4855053424835205, "learning_rate": 2.7501112804270624e-05, "loss": 0.7241, "num_input_tokens_seen": 40645840, "step": 70000 }, { "epoch": 10.426720285969616, "grad_norm": 1.1782660484313965, "learning_rate": 2.749787966416712e-05, "loss": 0.617, "num_input_tokens_seen": 40648688, "step": 70005 }, { "epoch": 10.427464998510574, "grad_norm": 0.6963868737220764, "learning_rate": 2.7494646481864472e-05, "loss": 0.4863, "num_input_tokens_seen": 40651632, "step": 70010 }, { "epoch": 10.428209711051535, "grad_norm": 2.0078377723693848, "learning_rate": 2.749141325741731e-05, "loss": 0.633, "num_input_tokens_seen": 40654480, "step": 70015 }, { "epoch": 10.428954423592494, "grad_norm": 2.9355649948120117, "learning_rate": 2.7488179990880248e-05, "loss": 0.562, "num_input_tokens_seen": 40657392, "step": 70020 }, { "epoch": 10.429699136133452, "grad_norm": 0.8728790283203125, "learning_rate": 2.7484946682307917e-05, "loss": 0.6473, "num_input_tokens_seen": 40660144, "step": 70025 }, { "epoch": 10.430443848674411, "grad_norm": 1.5555928945541382, "learning_rate": 2.7481713331754945e-05, "loss": 0.5325, "num_input_tokens_seen": 40662992, "step": 70030 }, { "epoch": 10.431188561215372, "grad_norm": 1.4088834524154663, "learning_rate": 2.7478479939275937e-05, "loss": 0.65, "num_input_tokens_seen": 40665648, "step": 70035 }, { "epoch": 10.43193327375633, "grad_norm": 1.1697829961776733, "learning_rate": 2.7475246504925535e-05, "loss": 0.6607, "num_input_tokens_seen": 40668464, "step": 70040 }, { "epoch": 10.43267798629729, "grad_norm": 1.9540144205093384, "learning_rate": 2.7472013028758364e-05, "loss": 0.6388, "num_input_tokens_seen": 40671600, "step": 70045 }, { "epoch": 10.433422698838248, "grad_norm": 0.8042500615119934, "learning_rate": 2.7468779510829036e-05, "loss": 0.5669, "num_input_tokens_seen": 40674416, "step": 70050 }, { "epoch": 10.434167411379208, "grad_norm": 1.9518091678619385, "learning_rate": 2.746554595119219e-05, "loss": 0.6466, "num_input_tokens_seen": 40677584, "step": 70055 }, { "epoch": 10.434912123920167, "grad_norm": 0.9091411828994751, "learning_rate": 2.7462312349902452e-05, "loss": 0.6769, "num_input_tokens_seen": 40680432, "step": 70060 }, { "epoch": 10.435656836461126, "grad_norm": 1.8381564617156982, "learning_rate": 2.7459078707014453e-05, "loss": 0.6026, "num_input_tokens_seen": 40683120, "step": 70065 }, { "epoch": 10.436401549002085, "grad_norm": 1.9356400966644287, "learning_rate": 2.745584502258281e-05, "loss": 0.5954, "num_input_tokens_seen": 40686032, "step": 70070 }, { "epoch": 10.437146261543045, "grad_norm": 1.1165231466293335, "learning_rate": 2.745261129666217e-05, "loss": 0.6704, "num_input_tokens_seen": 40688912, "step": 70075 }, { "epoch": 10.437890974084004, "grad_norm": 1.1629457473754883, "learning_rate": 2.7449377529307147e-05, "loss": 0.4761, "num_input_tokens_seen": 40691824, "step": 70080 }, { "epoch": 10.438635686624963, "grad_norm": 0.9971603751182556, "learning_rate": 2.7446143720572387e-05, "loss": 0.4294, "num_input_tokens_seen": 40694992, "step": 70085 }, { "epoch": 10.439380399165922, "grad_norm": 1.871604561805725, "learning_rate": 2.7442909870512513e-05, "loss": 0.6472, "num_input_tokens_seen": 40697936, "step": 70090 }, { "epoch": 10.44012511170688, "grad_norm": 0.9419998526573181, "learning_rate": 2.7439675979182155e-05, "loss": 0.5267, "num_input_tokens_seen": 40700816, "step": 70095 }, { "epoch": 10.44086982424784, "grad_norm": 1.1951870918273926, "learning_rate": 2.7436442046635962e-05, "loss": 0.4575, "num_input_tokens_seen": 40703536, "step": 70100 }, { "epoch": 10.4416145367888, "grad_norm": 0.8725791573524475, "learning_rate": 2.7433208072928546e-05, "loss": 0.5892, "num_input_tokens_seen": 40706416, "step": 70105 }, { "epoch": 10.442359249329758, "grad_norm": 1.603310227394104, "learning_rate": 2.7429974058114553e-05, "loss": 0.7594, "num_input_tokens_seen": 40709168, "step": 70110 }, { "epoch": 10.443103961870717, "grad_norm": 1.5066956281661987, "learning_rate": 2.7426740002248624e-05, "loss": 0.7508, "num_input_tokens_seen": 40712080, "step": 70115 }, { "epoch": 10.443848674411678, "grad_norm": 2.1909751892089844, "learning_rate": 2.7423505905385382e-05, "loss": 0.5898, "num_input_tokens_seen": 40714864, "step": 70120 }, { "epoch": 10.444593386952636, "grad_norm": 1.810105323791504, "learning_rate": 2.742027176757948e-05, "loss": 0.6088, "num_input_tokens_seen": 40717936, "step": 70125 }, { "epoch": 10.445338099493595, "grad_norm": 0.6212257146835327, "learning_rate": 2.741703758888554e-05, "loss": 0.5929, "num_input_tokens_seen": 40720752, "step": 70130 }, { "epoch": 10.446082812034554, "grad_norm": 0.7609351277351379, "learning_rate": 2.7413803369358217e-05, "loss": 0.4287, "num_input_tokens_seen": 40723440, "step": 70135 }, { "epoch": 10.446827524575514, "grad_norm": 1.1175955533981323, "learning_rate": 2.7410569109052124e-05, "loss": 0.4693, "num_input_tokens_seen": 40726608, "step": 70140 }, { "epoch": 10.447572237116473, "grad_norm": 1.9152331352233887, "learning_rate": 2.7407334808021924e-05, "loss": 0.5076, "num_input_tokens_seen": 40729488, "step": 70145 }, { "epoch": 10.448316949657432, "grad_norm": 1.592360258102417, "learning_rate": 2.740410046632224e-05, "loss": 0.5389, "num_input_tokens_seen": 40732208, "step": 70150 }, { "epoch": 10.44906166219839, "grad_norm": 1.0544108152389526, "learning_rate": 2.7400866084007732e-05, "loss": 0.6327, "num_input_tokens_seen": 40735216, "step": 70155 }, { "epoch": 10.449806374739351, "grad_norm": 1.8781888484954834, "learning_rate": 2.7397631661133032e-05, "loss": 0.7164, "num_input_tokens_seen": 40738192, "step": 70160 }, { "epoch": 10.45055108728031, "grad_norm": 0.7230468392372131, "learning_rate": 2.7394397197752786e-05, "loss": 0.5467, "num_input_tokens_seen": 40740944, "step": 70165 }, { "epoch": 10.451295799821269, "grad_norm": 1.8510104417800903, "learning_rate": 2.7391162693921624e-05, "loss": 0.9154, "num_input_tokens_seen": 40743632, "step": 70170 }, { "epoch": 10.452040512362228, "grad_norm": 1.6051411628723145, "learning_rate": 2.7387928149694197e-05, "loss": 0.5722, "num_input_tokens_seen": 40746544, "step": 70175 }, { "epoch": 10.452785224903188, "grad_norm": 1.251474380493164, "learning_rate": 2.7384693565125153e-05, "loss": 0.4601, "num_input_tokens_seen": 40749360, "step": 70180 }, { "epoch": 10.453529937444147, "grad_norm": 1.1260241270065308, "learning_rate": 2.7381458940269134e-05, "loss": 0.5775, "num_input_tokens_seen": 40752272, "step": 70185 }, { "epoch": 10.454274649985106, "grad_norm": 1.5552302598953247, "learning_rate": 2.737822427518079e-05, "loss": 0.5757, "num_input_tokens_seen": 40755024, "step": 70190 }, { "epoch": 10.455019362526064, "grad_norm": 1.1863151788711548, "learning_rate": 2.7374989569914766e-05, "loss": 0.5956, "num_input_tokens_seen": 40758064, "step": 70195 }, { "epoch": 10.455764075067025, "grad_norm": 1.0734503269195557, "learning_rate": 2.73717548245257e-05, "loss": 0.4584, "num_input_tokens_seen": 40760784, "step": 70200 }, { "epoch": 10.456508787607984, "grad_norm": 2.3521883487701416, "learning_rate": 2.736852003906826e-05, "loss": 0.5514, "num_input_tokens_seen": 40763504, "step": 70205 }, { "epoch": 10.457253500148942, "grad_norm": 1.3117402791976929, "learning_rate": 2.736528521359707e-05, "loss": 0.7068, "num_input_tokens_seen": 40766224, "step": 70210 }, { "epoch": 10.457998212689901, "grad_norm": 1.151987910270691, "learning_rate": 2.736205034816679e-05, "loss": 0.559, "num_input_tokens_seen": 40769200, "step": 70215 }, { "epoch": 10.458742925230862, "grad_norm": 1.8242579698562622, "learning_rate": 2.735881544283207e-05, "loss": 0.5893, "num_input_tokens_seen": 40772240, "step": 70220 }, { "epoch": 10.45948763777182, "grad_norm": 0.9593362212181091, "learning_rate": 2.735558049764756e-05, "loss": 0.5741, "num_input_tokens_seen": 40775344, "step": 70225 }, { "epoch": 10.46023235031278, "grad_norm": 0.9980390071868896, "learning_rate": 2.735234551266792e-05, "loss": 0.6532, "num_input_tokens_seen": 40778128, "step": 70230 }, { "epoch": 10.460977062853738, "grad_norm": 1.6666890382766724, "learning_rate": 2.734911048794779e-05, "loss": 0.7175, "num_input_tokens_seen": 40780976, "step": 70235 }, { "epoch": 10.461721775394698, "grad_norm": 1.302894949913025, "learning_rate": 2.7345875423541817e-05, "loss": 0.4163, "num_input_tokens_seen": 40783824, "step": 70240 }, { "epoch": 10.462466487935657, "grad_norm": 2.0137734413146973, "learning_rate": 2.7342640319504674e-05, "loss": 0.6683, "num_input_tokens_seen": 40786416, "step": 70245 }, { "epoch": 10.463211200476616, "grad_norm": 1.4168874025344849, "learning_rate": 2.7339405175890998e-05, "loss": 0.6305, "num_input_tokens_seen": 40789200, "step": 70250 }, { "epoch": 10.463955913017575, "grad_norm": 1.2862255573272705, "learning_rate": 2.733616999275545e-05, "loss": 0.6053, "num_input_tokens_seen": 40792240, "step": 70255 }, { "epoch": 10.464700625558535, "grad_norm": 1.6634327173233032, "learning_rate": 2.7332934770152686e-05, "loss": 0.7559, "num_input_tokens_seen": 40795088, "step": 70260 }, { "epoch": 10.465445338099494, "grad_norm": 1.635461449623108, "learning_rate": 2.7329699508137363e-05, "loss": 0.6892, "num_input_tokens_seen": 40798064, "step": 70265 }, { "epoch": 10.466190050640453, "grad_norm": 1.506650447845459, "learning_rate": 2.7326464206764125e-05, "loss": 0.5273, "num_input_tokens_seen": 40801104, "step": 70270 }, { "epoch": 10.466934763181412, "grad_norm": 1.578628659248352, "learning_rate": 2.7323228866087647e-05, "loss": 0.5046, "num_input_tokens_seen": 40803600, "step": 70275 }, { "epoch": 10.46767947572237, "grad_norm": 2.123960018157959, "learning_rate": 2.731999348616257e-05, "loss": 0.6335, "num_input_tokens_seen": 40806320, "step": 70280 }, { "epoch": 10.46842418826333, "grad_norm": 1.245486855506897, "learning_rate": 2.731675806704357e-05, "loss": 0.5258, "num_input_tokens_seen": 40809200, "step": 70285 }, { "epoch": 10.46916890080429, "grad_norm": 1.712630033493042, "learning_rate": 2.7313522608785295e-05, "loss": 0.5011, "num_input_tokens_seen": 40812144, "step": 70290 }, { "epoch": 10.469913613345248, "grad_norm": 1.9469027519226074, "learning_rate": 2.7310287111442407e-05, "loss": 0.8274, "num_input_tokens_seen": 40815152, "step": 70295 }, { "epoch": 10.470658325886207, "grad_norm": 0.7434383034706116, "learning_rate": 2.730705157506957e-05, "loss": 0.641, "num_input_tokens_seen": 40817936, "step": 70300 }, { "epoch": 10.471403038427168, "grad_norm": 2.5496208667755127, "learning_rate": 2.7303815999721433e-05, "loss": 0.7601, "num_input_tokens_seen": 40820688, "step": 70305 }, { "epoch": 10.472147750968126, "grad_norm": 1.8488757610321045, "learning_rate": 2.730058038545267e-05, "loss": 0.7165, "num_input_tokens_seen": 40823728, "step": 70310 }, { "epoch": 10.472892463509085, "grad_norm": 2.985738754272461, "learning_rate": 2.7297344732317938e-05, "loss": 0.6549, "num_input_tokens_seen": 40826544, "step": 70315 }, { "epoch": 10.473637176050044, "grad_norm": 1.6004538536071777, "learning_rate": 2.7294109040371902e-05, "loss": 0.6165, "num_input_tokens_seen": 40829488, "step": 70320 }, { "epoch": 10.474381888591004, "grad_norm": 1.9582198858261108, "learning_rate": 2.729087330966923e-05, "loss": 0.6154, "num_input_tokens_seen": 40832368, "step": 70325 }, { "epoch": 10.475126601131963, "grad_norm": 2.062040328979492, "learning_rate": 2.7287637540264584e-05, "loss": 0.6299, "num_input_tokens_seen": 40835312, "step": 70330 }, { "epoch": 10.475871313672922, "grad_norm": 1.0672787427902222, "learning_rate": 2.7284401732212615e-05, "loss": 0.5146, "num_input_tokens_seen": 40838064, "step": 70335 }, { "epoch": 10.47661602621388, "grad_norm": 2.154031991958618, "learning_rate": 2.7281165885568006e-05, "loss": 0.8109, "num_input_tokens_seen": 40840816, "step": 70340 }, { "epoch": 10.477360738754841, "grad_norm": 2.014209032058716, "learning_rate": 2.7277930000385414e-05, "loss": 0.5244, "num_input_tokens_seen": 40843472, "step": 70345 }, { "epoch": 10.4781054512958, "grad_norm": 1.2554670572280884, "learning_rate": 2.7274694076719513e-05, "loss": 0.5177, "num_input_tokens_seen": 40846320, "step": 70350 }, { "epoch": 10.478850163836759, "grad_norm": 1.3303805589675903, "learning_rate": 2.727145811462497e-05, "loss": 0.7244, "num_input_tokens_seen": 40849296, "step": 70355 }, { "epoch": 10.479594876377718, "grad_norm": 1.2992241382598877, "learning_rate": 2.7268222114156454e-05, "loss": 0.4966, "num_input_tokens_seen": 40852048, "step": 70360 }, { "epoch": 10.480339588918678, "grad_norm": 1.9766730070114136, "learning_rate": 2.7264986075368625e-05, "loss": 0.6719, "num_input_tokens_seen": 40855184, "step": 70365 }, { "epoch": 10.481084301459637, "grad_norm": 1.3261065483093262, "learning_rate": 2.726174999831616e-05, "loss": 0.5462, "num_input_tokens_seen": 40858032, "step": 70370 }, { "epoch": 10.481829014000596, "grad_norm": 1.436824917793274, "learning_rate": 2.7258513883053727e-05, "loss": 0.5355, "num_input_tokens_seen": 40861008, "step": 70375 }, { "epoch": 10.482573726541554, "grad_norm": 1.189238429069519, "learning_rate": 2.7255277729635997e-05, "loss": 0.6816, "num_input_tokens_seen": 40863760, "step": 70380 }, { "epoch": 10.483318439082515, "grad_norm": 1.7293288707733154, "learning_rate": 2.725204153811764e-05, "loss": 0.5626, "num_input_tokens_seen": 40866544, "step": 70385 }, { "epoch": 10.484063151623474, "grad_norm": 1.8936840295791626, "learning_rate": 2.7248805308553333e-05, "loss": 0.5972, "num_input_tokens_seen": 40869232, "step": 70390 }, { "epoch": 10.484807864164432, "grad_norm": 1.3945857286453247, "learning_rate": 2.7245569040997747e-05, "loss": 0.8546, "num_input_tokens_seen": 40872112, "step": 70395 }, { "epoch": 10.485552576705391, "grad_norm": 2.0439388751983643, "learning_rate": 2.7242332735505555e-05, "loss": 0.5421, "num_input_tokens_seen": 40874640, "step": 70400 }, { "epoch": 10.486297289246352, "grad_norm": 1.424246907234192, "learning_rate": 2.7239096392131423e-05, "loss": 0.7597, "num_input_tokens_seen": 40877776, "step": 70405 }, { "epoch": 10.48704200178731, "grad_norm": 1.116052508354187, "learning_rate": 2.723586001093004e-05, "loss": 0.5531, "num_input_tokens_seen": 40880304, "step": 70410 }, { "epoch": 10.48778671432827, "grad_norm": 2.142714023590088, "learning_rate": 2.7232623591956074e-05, "loss": 0.6052, "num_input_tokens_seen": 40883248, "step": 70415 }, { "epoch": 10.488531426869228, "grad_norm": 2.018993616104126, "learning_rate": 2.72293871352642e-05, "loss": 0.5365, "num_input_tokens_seen": 40886576, "step": 70420 }, { "epoch": 10.489276139410187, "grad_norm": 1.2610925436019897, "learning_rate": 2.7226150640909092e-05, "loss": 0.6991, "num_input_tokens_seen": 40889680, "step": 70425 }, { "epoch": 10.490020851951147, "grad_norm": 1.2613445520401, "learning_rate": 2.722291410894544e-05, "loss": 0.593, "num_input_tokens_seen": 40892272, "step": 70430 }, { "epoch": 10.490765564492106, "grad_norm": 1.2853575944900513, "learning_rate": 2.721967753942791e-05, "loss": 0.4968, "num_input_tokens_seen": 40894928, "step": 70435 }, { "epoch": 10.491510277033065, "grad_norm": 1.3453338146209717, "learning_rate": 2.721644093241118e-05, "loss": 0.6336, "num_input_tokens_seen": 40897776, "step": 70440 }, { "epoch": 10.492254989574024, "grad_norm": 2.5223755836486816, "learning_rate": 2.7213204287949938e-05, "loss": 0.7838, "num_input_tokens_seen": 40900560, "step": 70445 }, { "epoch": 10.492999702114984, "grad_norm": 1.3958890438079834, "learning_rate": 2.7209967606098862e-05, "loss": 0.5807, "num_input_tokens_seen": 40903728, "step": 70450 }, { "epoch": 10.493744414655943, "grad_norm": 2.350815534591675, "learning_rate": 2.7206730886912624e-05, "loss": 0.6503, "num_input_tokens_seen": 40906512, "step": 70455 }, { "epoch": 10.494489127196902, "grad_norm": 0.936319887638092, "learning_rate": 2.7203494130445905e-05, "loss": 0.471, "num_input_tokens_seen": 40909680, "step": 70460 }, { "epoch": 10.49523383973786, "grad_norm": 1.0459843873977661, "learning_rate": 2.7200257336753405e-05, "loss": 0.7035, "num_input_tokens_seen": 40912528, "step": 70465 }, { "epoch": 10.495978552278821, "grad_norm": 1.091963529586792, "learning_rate": 2.7197020505889786e-05, "loss": 0.7066, "num_input_tokens_seen": 40915728, "step": 70470 }, { "epoch": 10.49672326481978, "grad_norm": 1.2124569416046143, "learning_rate": 2.7193783637909736e-05, "loss": 0.4396, "num_input_tokens_seen": 40918416, "step": 70475 }, { "epoch": 10.497467977360738, "grad_norm": 1.6622825860977173, "learning_rate": 2.7190546732867945e-05, "loss": 0.6872, "num_input_tokens_seen": 40921680, "step": 70480 }, { "epoch": 10.498212689901697, "grad_norm": 1.1589601039886475, "learning_rate": 2.7187309790819092e-05, "loss": 0.6884, "num_input_tokens_seen": 40924592, "step": 70485 }, { "epoch": 10.498957402442658, "grad_norm": 1.7935833930969238, "learning_rate": 2.7184072811817867e-05, "loss": 0.6806, "num_input_tokens_seen": 40927376, "step": 70490 }, { "epoch": 10.499702114983616, "grad_norm": 1.393592357635498, "learning_rate": 2.7180835795918952e-05, "loss": 0.7408, "num_input_tokens_seen": 40930576, "step": 70495 }, { "epoch": 10.500446827524575, "grad_norm": 0.7625372409820557, "learning_rate": 2.7177598743177028e-05, "loss": 0.3869, "num_input_tokens_seen": 40933264, "step": 70500 }, { "epoch": 10.501191540065534, "grad_norm": 2.0146408081054688, "learning_rate": 2.717436165364679e-05, "loss": 0.548, "num_input_tokens_seen": 40936208, "step": 70505 }, { "epoch": 10.501936252606495, "grad_norm": 0.5921198129653931, "learning_rate": 2.7171124527382917e-05, "loss": 0.4625, "num_input_tokens_seen": 40938960, "step": 70510 }, { "epoch": 10.502680965147453, "grad_norm": 2.3568403720855713, "learning_rate": 2.7167887364440102e-05, "loss": 0.5839, "num_input_tokens_seen": 40941744, "step": 70515 }, { "epoch": 10.503425677688412, "grad_norm": 1.2307634353637695, "learning_rate": 2.7164650164873033e-05, "loss": 0.434, "num_input_tokens_seen": 40944560, "step": 70520 }, { "epoch": 10.50417039022937, "grad_norm": 1.3031847476959229, "learning_rate": 2.7161412928736407e-05, "loss": 0.4982, "num_input_tokens_seen": 40947280, "step": 70525 }, { "epoch": 10.504915102770331, "grad_norm": 2.377011775970459, "learning_rate": 2.7158175656084906e-05, "loss": 0.6747, "num_input_tokens_seen": 40949872, "step": 70530 }, { "epoch": 10.50565981531129, "grad_norm": 1.9778894186019897, "learning_rate": 2.7154938346973214e-05, "loss": 0.49, "num_input_tokens_seen": 40952912, "step": 70535 }, { "epoch": 10.506404527852249, "grad_norm": 0.9708722829818726, "learning_rate": 2.715170100145603e-05, "loss": 0.5271, "num_input_tokens_seen": 40955728, "step": 70540 }, { "epoch": 10.507149240393208, "grad_norm": 1.7565605640411377, "learning_rate": 2.7148463619588045e-05, "loss": 0.65, "num_input_tokens_seen": 40958736, "step": 70545 }, { "epoch": 10.507893952934168, "grad_norm": 0.8464024066925049, "learning_rate": 2.714522620142395e-05, "loss": 0.4974, "num_input_tokens_seen": 40961328, "step": 70550 }, { "epoch": 10.508638665475127, "grad_norm": 2.4577691555023193, "learning_rate": 2.7141988747018437e-05, "loss": 0.5501, "num_input_tokens_seen": 40964016, "step": 70555 }, { "epoch": 10.509383378016086, "grad_norm": 1.0604677200317383, "learning_rate": 2.713875125642621e-05, "loss": 0.5044, "num_input_tokens_seen": 40966928, "step": 70560 }, { "epoch": 10.510128090557044, "grad_norm": 1.8868663311004639, "learning_rate": 2.713551372970195e-05, "loss": 0.5331, "num_input_tokens_seen": 40969904, "step": 70565 }, { "epoch": 10.510872803098005, "grad_norm": 4.578794479370117, "learning_rate": 2.7132276166900357e-05, "loss": 0.7081, "num_input_tokens_seen": 40972976, "step": 70570 }, { "epoch": 10.511617515638964, "grad_norm": 1.0904115438461304, "learning_rate": 2.7129038568076122e-05, "loss": 0.5294, "num_input_tokens_seen": 40975952, "step": 70575 }, { "epoch": 10.512362228179922, "grad_norm": 1.6779462099075317, "learning_rate": 2.712580093328394e-05, "loss": 0.4565, "num_input_tokens_seen": 40978832, "step": 70580 }, { "epoch": 10.513106940720881, "grad_norm": 1.074538230895996, "learning_rate": 2.7122563262578515e-05, "loss": 0.5368, "num_input_tokens_seen": 40981744, "step": 70585 }, { "epoch": 10.513851653261842, "grad_norm": 2.4329705238342285, "learning_rate": 2.7119325556014546e-05, "loss": 0.7574, "num_input_tokens_seen": 40984496, "step": 70590 }, { "epoch": 10.5145963658028, "grad_norm": 1.5444016456604004, "learning_rate": 2.7116087813646724e-05, "loss": 0.735, "num_input_tokens_seen": 40987248, "step": 70595 }, { "epoch": 10.51534107834376, "grad_norm": 1.3065829277038574, "learning_rate": 2.7112850035529748e-05, "loss": 0.7211, "num_input_tokens_seen": 40990032, "step": 70600 }, { "epoch": 10.516085790884718, "grad_norm": 2.345062494277954, "learning_rate": 2.7109612221718316e-05, "loss": 0.7726, "num_input_tokens_seen": 40992976, "step": 70605 }, { "epoch": 10.516830503425677, "grad_norm": 0.9776096343994141, "learning_rate": 2.7106374372267136e-05, "loss": 0.407, "num_input_tokens_seen": 40995952, "step": 70610 }, { "epoch": 10.517575215966637, "grad_norm": 1.0956007242202759, "learning_rate": 2.7103136487230895e-05, "loss": 0.7329, "num_input_tokens_seen": 40998608, "step": 70615 }, { "epoch": 10.518319928507596, "grad_norm": 1.5323437452316284, "learning_rate": 2.70998985666643e-05, "loss": 0.7004, "num_input_tokens_seen": 41001488, "step": 70620 }, { "epoch": 10.519064641048555, "grad_norm": 1.8811839818954468, "learning_rate": 2.7096660610622055e-05, "loss": 0.731, "num_input_tokens_seen": 41004144, "step": 70625 }, { "epoch": 10.519809353589514, "grad_norm": 1.2084206342697144, "learning_rate": 2.7093422619158866e-05, "loss": 0.4807, "num_input_tokens_seen": 41006960, "step": 70630 }, { "epoch": 10.520554066130474, "grad_norm": 1.0310828685760498, "learning_rate": 2.709018459232942e-05, "loss": 0.6987, "num_input_tokens_seen": 41009840, "step": 70635 }, { "epoch": 10.521298778671433, "grad_norm": 1.9164060354232788, "learning_rate": 2.7086946530188434e-05, "loss": 0.4487, "num_input_tokens_seen": 41012688, "step": 70640 }, { "epoch": 10.522043491212392, "grad_norm": 1.140923023223877, "learning_rate": 2.7083708432790605e-05, "loss": 0.5754, "num_input_tokens_seen": 41015472, "step": 70645 }, { "epoch": 10.52278820375335, "grad_norm": 1.0201363563537598, "learning_rate": 2.7080470300190646e-05, "loss": 0.4624, "num_input_tokens_seen": 41018384, "step": 70650 }, { "epoch": 10.523532916294311, "grad_norm": 1.4702537059783936, "learning_rate": 2.7077232132443247e-05, "loss": 0.6242, "num_input_tokens_seen": 41021616, "step": 70655 }, { "epoch": 10.52427762883527, "grad_norm": 1.3054174184799194, "learning_rate": 2.7073993929603138e-05, "loss": 0.6028, "num_input_tokens_seen": 41024432, "step": 70660 }, { "epoch": 10.525022341376228, "grad_norm": 1.6259181499481201, "learning_rate": 2.7070755691724993e-05, "loss": 0.735, "num_input_tokens_seen": 41026928, "step": 70665 }, { "epoch": 10.525767053917187, "grad_norm": 5.113518714904785, "learning_rate": 2.7067517418863543e-05, "loss": 0.6279, "num_input_tokens_seen": 41029744, "step": 70670 }, { "epoch": 10.526511766458148, "grad_norm": 1.788376808166504, "learning_rate": 2.706427911107348e-05, "loss": 0.5107, "num_input_tokens_seen": 41032528, "step": 70675 }, { "epoch": 10.527256478999107, "grad_norm": 1.2813947200775146, "learning_rate": 2.7061040768409523e-05, "loss": 0.58, "num_input_tokens_seen": 41035344, "step": 70680 }, { "epoch": 10.528001191540065, "grad_norm": 1.3753728866577148, "learning_rate": 2.705780239092638e-05, "loss": 0.6085, "num_input_tokens_seen": 41038128, "step": 70685 }, { "epoch": 10.528745904081024, "grad_norm": 1.0880974531173706, "learning_rate": 2.705456397867876e-05, "loss": 0.5974, "num_input_tokens_seen": 41041008, "step": 70690 }, { "epoch": 10.529490616621985, "grad_norm": 1.6798136234283447, "learning_rate": 2.7051325531721366e-05, "loss": 0.6173, "num_input_tokens_seen": 41043984, "step": 70695 }, { "epoch": 10.530235329162943, "grad_norm": 0.9414414763450623, "learning_rate": 2.704808705010891e-05, "loss": 0.6352, "num_input_tokens_seen": 41046960, "step": 70700 }, { "epoch": 10.530980041703902, "grad_norm": 1.2931946516036987, "learning_rate": 2.7044848533896105e-05, "loss": 0.674, "num_input_tokens_seen": 41049872, "step": 70705 }, { "epoch": 10.53172475424486, "grad_norm": 1.2973476648330688, "learning_rate": 2.704160998313766e-05, "loss": 0.6087, "num_input_tokens_seen": 41052592, "step": 70710 }, { "epoch": 10.532469466785821, "grad_norm": 1.617908239364624, "learning_rate": 2.7038371397888295e-05, "loss": 0.6558, "num_input_tokens_seen": 41055952, "step": 70715 }, { "epoch": 10.53321417932678, "grad_norm": 2.059563398361206, "learning_rate": 2.7035132778202717e-05, "loss": 0.6547, "num_input_tokens_seen": 41058992, "step": 70720 }, { "epoch": 10.533958891867739, "grad_norm": 1.6150143146514893, "learning_rate": 2.7031894124135638e-05, "loss": 0.4765, "num_input_tokens_seen": 41061712, "step": 70725 }, { "epoch": 10.534703604408698, "grad_norm": 1.47760009765625, "learning_rate": 2.7028655435741772e-05, "loss": 0.5905, "num_input_tokens_seen": 41064624, "step": 70730 }, { "epoch": 10.535448316949658, "grad_norm": 1.310244083404541, "learning_rate": 2.7025416713075836e-05, "loss": 0.6735, "num_input_tokens_seen": 41067664, "step": 70735 }, { "epoch": 10.536193029490617, "grad_norm": 1.7509737014770508, "learning_rate": 2.702217795619254e-05, "loss": 0.5896, "num_input_tokens_seen": 41070544, "step": 70740 }, { "epoch": 10.536937742031576, "grad_norm": 2.1896040439605713, "learning_rate": 2.7018939165146606e-05, "loss": 0.5051, "num_input_tokens_seen": 41074064, "step": 70745 }, { "epoch": 10.537682454572534, "grad_norm": 3.0402042865753174, "learning_rate": 2.701570033999274e-05, "loss": 0.6283, "num_input_tokens_seen": 41076656, "step": 70750 }, { "epoch": 10.538427167113493, "grad_norm": 1.442718505859375, "learning_rate": 2.7012461480785668e-05, "loss": 0.6203, "num_input_tokens_seen": 41079504, "step": 70755 }, { "epoch": 10.539171879654454, "grad_norm": 1.1672066450119019, "learning_rate": 2.7009222587580114e-05, "loss": 0.4802, "num_input_tokens_seen": 41082320, "step": 70760 }, { "epoch": 10.539916592195413, "grad_norm": 1.305567741394043, "learning_rate": 2.7005983660430778e-05, "loss": 0.702, "num_input_tokens_seen": 41085072, "step": 70765 }, { "epoch": 10.540661304736371, "grad_norm": 1.4579615592956543, "learning_rate": 2.700274469939239e-05, "loss": 0.649, "num_input_tokens_seen": 41087760, "step": 70770 }, { "epoch": 10.541406017277332, "grad_norm": 1.7292180061340332, "learning_rate": 2.6999505704519662e-05, "loss": 0.5829, "num_input_tokens_seen": 41090320, "step": 70775 }, { "epoch": 10.54215072981829, "grad_norm": 0.9588637948036194, "learning_rate": 2.6996266675867322e-05, "loss": 0.4211, "num_input_tokens_seen": 41093456, "step": 70780 }, { "epoch": 10.54289544235925, "grad_norm": 1.549179196357727, "learning_rate": 2.6993027613490078e-05, "loss": 0.8209, "num_input_tokens_seen": 41096272, "step": 70785 }, { "epoch": 10.543640154900208, "grad_norm": 1.5188177824020386, "learning_rate": 2.698978851744266e-05, "loss": 0.6733, "num_input_tokens_seen": 41099408, "step": 70790 }, { "epoch": 10.544384867441167, "grad_norm": 2.7481515407562256, "learning_rate": 2.69865493877798e-05, "loss": 0.6274, "num_input_tokens_seen": 41102256, "step": 70795 }, { "epoch": 10.545129579982127, "grad_norm": 1.4758293628692627, "learning_rate": 2.698331022455619e-05, "loss": 0.5979, "num_input_tokens_seen": 41105040, "step": 70800 }, { "epoch": 10.545874292523086, "grad_norm": 1.6846950054168701, "learning_rate": 2.6980071027826574e-05, "loss": 0.5708, "num_input_tokens_seen": 41107760, "step": 70805 }, { "epoch": 10.546619005064045, "grad_norm": 1.1558014154434204, "learning_rate": 2.697683179764568e-05, "loss": 0.7122, "num_input_tokens_seen": 41110608, "step": 70810 }, { "epoch": 10.547363717605004, "grad_norm": 1.1805992126464844, "learning_rate": 2.6973592534068216e-05, "loss": 0.7277, "num_input_tokens_seen": 41113680, "step": 70815 }, { "epoch": 10.548108430145964, "grad_norm": 1.4012649059295654, "learning_rate": 2.697035323714891e-05, "loss": 0.5361, "num_input_tokens_seen": 41116720, "step": 70820 }, { "epoch": 10.548853142686923, "grad_norm": 0.9488559365272522, "learning_rate": 2.6967113906942494e-05, "loss": 0.5323, "num_input_tokens_seen": 41119600, "step": 70825 }, { "epoch": 10.549597855227882, "grad_norm": 1.046397089958191, "learning_rate": 2.696387454350368e-05, "loss": 0.5733, "num_input_tokens_seen": 41122672, "step": 70830 }, { "epoch": 10.55034256776884, "grad_norm": 1.6790916919708252, "learning_rate": 2.6960635146887202e-05, "loss": 0.5104, "num_input_tokens_seen": 41125840, "step": 70835 }, { "epoch": 10.551087280309801, "grad_norm": 1.5469247102737427, "learning_rate": 2.6957395717147794e-05, "loss": 0.526, "num_input_tokens_seen": 41129136, "step": 70840 }, { "epoch": 10.55183199285076, "grad_norm": 1.0538413524627686, "learning_rate": 2.695415625434017e-05, "loss": 0.5617, "num_input_tokens_seen": 41131664, "step": 70845 }, { "epoch": 10.552576705391719, "grad_norm": 2.2915499210357666, "learning_rate": 2.695091675851906e-05, "loss": 0.5999, "num_input_tokens_seen": 41134256, "step": 70850 }, { "epoch": 10.553321417932677, "grad_norm": 2.145998001098633, "learning_rate": 2.6947677229739198e-05, "loss": 0.4633, "num_input_tokens_seen": 41137200, "step": 70855 }, { "epoch": 10.554066130473638, "grad_norm": 2.0569167137145996, "learning_rate": 2.6944437668055313e-05, "loss": 0.5386, "num_input_tokens_seen": 41140016, "step": 70860 }, { "epoch": 10.554810843014597, "grad_norm": 1.2832547426223755, "learning_rate": 2.6941198073522118e-05, "loss": 0.5711, "num_input_tokens_seen": 41143344, "step": 70865 }, { "epoch": 10.555555555555555, "grad_norm": 1.1037437915802002, "learning_rate": 2.693795844619436e-05, "loss": 0.6256, "num_input_tokens_seen": 41146416, "step": 70870 }, { "epoch": 10.556300268096514, "grad_norm": 0.7541627883911133, "learning_rate": 2.6934718786126763e-05, "loss": 0.5305, "num_input_tokens_seen": 41149264, "step": 70875 }, { "epoch": 10.557044980637475, "grad_norm": 1.0691696405410767, "learning_rate": 2.6931479093374056e-05, "loss": 0.6235, "num_input_tokens_seen": 41152272, "step": 70880 }, { "epoch": 10.557789693178433, "grad_norm": 1.2672923803329468, "learning_rate": 2.6928239367990974e-05, "loss": 0.6467, "num_input_tokens_seen": 41154864, "step": 70885 }, { "epoch": 10.558534405719392, "grad_norm": 1.0203527212142944, "learning_rate": 2.692499961003226e-05, "loss": 0.4726, "num_input_tokens_seen": 41157968, "step": 70890 }, { "epoch": 10.559279118260351, "grad_norm": 2.6025609970092773, "learning_rate": 2.692175981955263e-05, "loss": 0.5117, "num_input_tokens_seen": 41160656, "step": 70895 }, { "epoch": 10.560023830801311, "grad_norm": 1.3848837614059448, "learning_rate": 2.691851999660681e-05, "loss": 0.622, "num_input_tokens_seen": 41163408, "step": 70900 }, { "epoch": 10.56076854334227, "grad_norm": 1.630454421043396, "learning_rate": 2.691528014124955e-05, "loss": 0.7112, "num_input_tokens_seen": 41166448, "step": 70905 }, { "epoch": 10.561513255883229, "grad_norm": 0.8785250782966614, "learning_rate": 2.6912040253535574e-05, "loss": 0.6399, "num_input_tokens_seen": 41169200, "step": 70910 }, { "epoch": 10.562257968424188, "grad_norm": 1.1586989164352417, "learning_rate": 2.6908800333519625e-05, "loss": 0.6826, "num_input_tokens_seen": 41172176, "step": 70915 }, { "epoch": 10.563002680965148, "grad_norm": 2.1949527263641357, "learning_rate": 2.6905560381256434e-05, "loss": 0.7204, "num_input_tokens_seen": 41175024, "step": 70920 }, { "epoch": 10.563747393506107, "grad_norm": 1.6752358675003052, "learning_rate": 2.690232039680074e-05, "loss": 0.5211, "num_input_tokens_seen": 41177936, "step": 70925 }, { "epoch": 10.564492106047066, "grad_norm": 2.3077824115753174, "learning_rate": 2.6899080380207276e-05, "loss": 0.5933, "num_input_tokens_seen": 41181040, "step": 70930 }, { "epoch": 10.565236818588025, "grad_norm": 1.8011163473129272, "learning_rate": 2.689584033153078e-05, "loss": 0.7592, "num_input_tokens_seen": 41184016, "step": 70935 }, { "epoch": 10.565981531128983, "grad_norm": 1.7261850833892822, "learning_rate": 2.6892600250825982e-05, "loss": 0.5798, "num_input_tokens_seen": 41186736, "step": 70940 }, { "epoch": 10.566726243669944, "grad_norm": 1.8711868524551392, "learning_rate": 2.688936013814763e-05, "loss": 0.6611, "num_input_tokens_seen": 41189712, "step": 70945 }, { "epoch": 10.567470956210903, "grad_norm": 1.3169496059417725, "learning_rate": 2.688611999355046e-05, "loss": 0.4906, "num_input_tokens_seen": 41192368, "step": 70950 }, { "epoch": 10.568215668751861, "grad_norm": 1.1636515855789185, "learning_rate": 2.6882879817089207e-05, "loss": 0.5739, "num_input_tokens_seen": 41195088, "step": 70955 }, { "epoch": 10.568960381292822, "grad_norm": 1.196373462677002, "learning_rate": 2.6879639608818618e-05, "loss": 0.8534, "num_input_tokens_seen": 41198000, "step": 70960 }, { "epoch": 10.56970509383378, "grad_norm": 1.0453882217407227, "learning_rate": 2.6876399368793425e-05, "loss": 0.4704, "num_input_tokens_seen": 41200528, "step": 70965 }, { "epoch": 10.57044980637474, "grad_norm": 0.9563398957252502, "learning_rate": 2.6873159097068366e-05, "loss": 0.5626, "num_input_tokens_seen": 41203472, "step": 70970 }, { "epoch": 10.571194518915698, "grad_norm": 1.5093897581100464, "learning_rate": 2.68699187936982e-05, "loss": 0.7694, "num_input_tokens_seen": 41206224, "step": 70975 }, { "epoch": 10.571939231456657, "grad_norm": 1.1427934169769287, "learning_rate": 2.686667845873765e-05, "loss": 0.6367, "num_input_tokens_seen": 41209264, "step": 70980 }, { "epoch": 10.572683943997617, "grad_norm": 1.447585105895996, "learning_rate": 2.686343809224146e-05, "loss": 0.5643, "num_input_tokens_seen": 41211984, "step": 70985 }, { "epoch": 10.573428656538576, "grad_norm": 1.430705189704895, "learning_rate": 2.6860197694264388e-05, "loss": 0.487, "num_input_tokens_seen": 41214928, "step": 70990 }, { "epoch": 10.574173369079535, "grad_norm": 0.5767582058906555, "learning_rate": 2.685695726486116e-05, "loss": 0.6228, "num_input_tokens_seen": 41217872, "step": 70995 }, { "epoch": 10.574918081620494, "grad_norm": 0.9676464796066284, "learning_rate": 2.6853716804086527e-05, "loss": 0.5176, "num_input_tokens_seen": 41220656, "step": 71000 }, { "epoch": 10.575662794161454, "grad_norm": 1.0940760374069214, "learning_rate": 2.6850476311995226e-05, "loss": 0.6938, "num_input_tokens_seen": 41223760, "step": 71005 }, { "epoch": 10.576407506702413, "grad_norm": 1.0025131702423096, "learning_rate": 2.6847235788642018e-05, "loss": 0.5355, "num_input_tokens_seen": 41226544, "step": 71010 }, { "epoch": 10.577152219243372, "grad_norm": 1.2564597129821777, "learning_rate": 2.6843995234081636e-05, "loss": 0.7334, "num_input_tokens_seen": 41229360, "step": 71015 }, { "epoch": 10.57789693178433, "grad_norm": 0.9992018342018127, "learning_rate": 2.6840754648368826e-05, "loss": 0.7018, "num_input_tokens_seen": 41232080, "step": 71020 }, { "epoch": 10.578641644325291, "grad_norm": 2.2671642303466797, "learning_rate": 2.6837514031558347e-05, "loss": 0.7386, "num_input_tokens_seen": 41234640, "step": 71025 }, { "epoch": 10.57938635686625, "grad_norm": 1.3310015201568604, "learning_rate": 2.6834273383704927e-05, "loss": 0.6139, "num_input_tokens_seen": 41237776, "step": 71030 }, { "epoch": 10.580131069407209, "grad_norm": 1.79377019405365, "learning_rate": 2.6831032704863324e-05, "loss": 0.6124, "num_input_tokens_seen": 41240560, "step": 71035 }, { "epoch": 10.580875781948167, "grad_norm": 1.191043496131897, "learning_rate": 2.6827791995088282e-05, "loss": 0.6507, "num_input_tokens_seen": 41243472, "step": 71040 }, { "epoch": 10.581620494489128, "grad_norm": 1.181509017944336, "learning_rate": 2.6824551254434555e-05, "loss": 0.5207, "num_input_tokens_seen": 41246384, "step": 71045 }, { "epoch": 10.582365207030087, "grad_norm": 2.7623748779296875, "learning_rate": 2.6821310482956886e-05, "loss": 0.4669, "num_input_tokens_seen": 41249392, "step": 71050 }, { "epoch": 10.583109919571045, "grad_norm": 0.9432498216629028, "learning_rate": 2.6818069680710034e-05, "loss": 0.6376, "num_input_tokens_seen": 41252112, "step": 71055 }, { "epoch": 10.583854632112004, "grad_norm": 1.0042698383331299, "learning_rate": 2.681482884774874e-05, "loss": 0.6382, "num_input_tokens_seen": 41254960, "step": 71060 }, { "epoch": 10.584599344652965, "grad_norm": 3.2284843921661377, "learning_rate": 2.6811587984127758e-05, "loss": 0.4842, "num_input_tokens_seen": 41257872, "step": 71065 }, { "epoch": 10.585344057193923, "grad_norm": 0.9082568883895874, "learning_rate": 2.680834708990183e-05, "loss": 0.5776, "num_input_tokens_seen": 41261104, "step": 71070 }, { "epoch": 10.586088769734882, "grad_norm": 1.8696880340576172, "learning_rate": 2.680510616512572e-05, "loss": 0.6065, "num_input_tokens_seen": 41264208, "step": 71075 }, { "epoch": 10.586833482275841, "grad_norm": 1.6561307907104492, "learning_rate": 2.6801865209854177e-05, "loss": 0.5625, "num_input_tokens_seen": 41266960, "step": 71080 }, { "epoch": 10.587578194816802, "grad_norm": 1.473976969718933, "learning_rate": 2.6798624224141954e-05, "loss": 0.7537, "num_input_tokens_seen": 41269904, "step": 71085 }, { "epoch": 10.58832290735776, "grad_norm": 0.6153725385665894, "learning_rate": 2.6795383208043805e-05, "loss": 0.5024, "num_input_tokens_seen": 41272752, "step": 71090 }, { "epoch": 10.589067619898719, "grad_norm": 0.9092145562171936, "learning_rate": 2.679214216161448e-05, "loss": 0.4432, "num_input_tokens_seen": 41275472, "step": 71095 }, { "epoch": 10.589812332439678, "grad_norm": 1.1723225116729736, "learning_rate": 2.6788901084908734e-05, "loss": 0.4642, "num_input_tokens_seen": 41278032, "step": 71100 }, { "epoch": 10.590557044980638, "grad_norm": 2.132967472076416, "learning_rate": 2.678565997798132e-05, "loss": 0.4514, "num_input_tokens_seen": 41281136, "step": 71105 }, { "epoch": 10.591301757521597, "grad_norm": 1.4454624652862549, "learning_rate": 2.6782418840886997e-05, "loss": 0.6662, "num_input_tokens_seen": 41284240, "step": 71110 }, { "epoch": 10.592046470062556, "grad_norm": 1.1516737937927246, "learning_rate": 2.6779177673680516e-05, "loss": 0.6339, "num_input_tokens_seen": 41287184, "step": 71115 }, { "epoch": 10.592791182603515, "grad_norm": 1.9836434125900269, "learning_rate": 2.6775936476416636e-05, "loss": 0.5918, "num_input_tokens_seen": 41290128, "step": 71120 }, { "epoch": 10.593535895144473, "grad_norm": 1.1153233051300049, "learning_rate": 2.6772695249150125e-05, "loss": 0.533, "num_input_tokens_seen": 41293040, "step": 71125 }, { "epoch": 10.594280607685434, "grad_norm": 2.185939311981201, "learning_rate": 2.6769453991935717e-05, "loss": 0.7009, "num_input_tokens_seen": 41296112, "step": 71130 }, { "epoch": 10.595025320226393, "grad_norm": 1.7559806108474731, "learning_rate": 2.676621270482819e-05, "loss": 0.4127, "num_input_tokens_seen": 41298800, "step": 71135 }, { "epoch": 10.595770032767351, "grad_norm": 1.2895712852478027, "learning_rate": 2.6762971387882297e-05, "loss": 0.6194, "num_input_tokens_seen": 41301616, "step": 71140 }, { "epoch": 10.59651474530831, "grad_norm": 1.1225078105926514, "learning_rate": 2.6759730041152787e-05, "loss": 0.5888, "num_input_tokens_seen": 41304496, "step": 71145 }, { "epoch": 10.59725945784927, "grad_norm": 3.1314780712127686, "learning_rate": 2.6756488664694422e-05, "loss": 0.7136, "num_input_tokens_seen": 41307408, "step": 71150 }, { "epoch": 10.59800417039023, "grad_norm": 2.230823516845703, "learning_rate": 2.675324725856198e-05, "loss": 0.3447, "num_input_tokens_seen": 41310288, "step": 71155 }, { "epoch": 10.598748882931188, "grad_norm": 1.481252670288086, "learning_rate": 2.6750005822810197e-05, "loss": 0.5602, "num_input_tokens_seen": 41312816, "step": 71160 }, { "epoch": 10.599493595472147, "grad_norm": 1.8643721342086792, "learning_rate": 2.6746764357493848e-05, "loss": 0.5191, "num_input_tokens_seen": 41315952, "step": 71165 }, { "epoch": 10.600238308013108, "grad_norm": 1.0843561887741089, "learning_rate": 2.6743522862667687e-05, "loss": 0.5692, "num_input_tokens_seen": 41318768, "step": 71170 }, { "epoch": 10.600983020554066, "grad_norm": 0.8541988730430603, "learning_rate": 2.6740281338386484e-05, "loss": 0.5285, "num_input_tokens_seen": 41321744, "step": 71175 }, { "epoch": 10.601727733095025, "grad_norm": 1.7465134859085083, "learning_rate": 2.6737039784705e-05, "loss": 0.5263, "num_input_tokens_seen": 41324528, "step": 71180 }, { "epoch": 10.602472445635984, "grad_norm": 1.2196934223175049, "learning_rate": 2.6733798201677985e-05, "loss": 0.7789, "num_input_tokens_seen": 41327440, "step": 71185 }, { "epoch": 10.603217158176944, "grad_norm": 2.0013909339904785, "learning_rate": 2.6730556589360216e-05, "loss": 0.6577, "num_input_tokens_seen": 41330416, "step": 71190 }, { "epoch": 10.603961870717903, "grad_norm": 1.254157304763794, "learning_rate": 2.672731494780645e-05, "loss": 0.4821, "num_input_tokens_seen": 41333296, "step": 71195 }, { "epoch": 10.604706583258862, "grad_norm": 1.309374451637268, "learning_rate": 2.672407327707146e-05, "loss": 0.5444, "num_input_tokens_seen": 41336240, "step": 71200 }, { "epoch": 10.60545129579982, "grad_norm": 1.3168383836746216, "learning_rate": 2.6720831577209997e-05, "loss": 0.5672, "num_input_tokens_seen": 41338832, "step": 71205 }, { "epoch": 10.606196008340781, "grad_norm": 1.064383625984192, "learning_rate": 2.6717589848276835e-05, "loss": 0.5269, "num_input_tokens_seen": 41341552, "step": 71210 }, { "epoch": 10.60694072088174, "grad_norm": 0.8535448312759399, "learning_rate": 2.671434809032674e-05, "loss": 0.4942, "num_input_tokens_seen": 41344560, "step": 71215 }, { "epoch": 10.607685433422699, "grad_norm": 1.3931853771209717, "learning_rate": 2.6711106303414478e-05, "loss": 0.6077, "num_input_tokens_seen": 41347728, "step": 71220 }, { "epoch": 10.608430145963657, "grad_norm": 1.1810479164123535, "learning_rate": 2.6707864487594815e-05, "loss": 0.6034, "num_input_tokens_seen": 41350544, "step": 71225 }, { "epoch": 10.609174858504618, "grad_norm": 1.4570406675338745, "learning_rate": 2.6704622642922512e-05, "loss": 0.6227, "num_input_tokens_seen": 41353488, "step": 71230 }, { "epoch": 10.609919571045577, "grad_norm": 1.206183910369873, "learning_rate": 2.6701380769452346e-05, "loss": 0.6796, "num_input_tokens_seen": 41356816, "step": 71235 }, { "epoch": 10.610664283586535, "grad_norm": 2.153808832168579, "learning_rate": 2.6698138867239076e-05, "loss": 0.652, "num_input_tokens_seen": 41359888, "step": 71240 }, { "epoch": 10.611408996127494, "grad_norm": 2.086911916732788, "learning_rate": 2.6694896936337477e-05, "loss": 0.5439, "num_input_tokens_seen": 41362704, "step": 71245 }, { "epoch": 10.612153708668455, "grad_norm": 1.2054846286773682, "learning_rate": 2.6691654976802316e-05, "loss": 0.4974, "num_input_tokens_seen": 41365552, "step": 71250 }, { "epoch": 10.612898421209414, "grad_norm": 1.4917298555374146, "learning_rate": 2.6688412988688372e-05, "loss": 0.6366, "num_input_tokens_seen": 41368848, "step": 71255 }, { "epoch": 10.613643133750372, "grad_norm": 3.935163736343384, "learning_rate": 2.6685170972050404e-05, "loss": 0.6231, "num_input_tokens_seen": 41371632, "step": 71260 }, { "epoch": 10.614387846291331, "grad_norm": 2.5161385536193848, "learning_rate": 2.6681928926943183e-05, "loss": 0.8215, "num_input_tokens_seen": 41374544, "step": 71265 }, { "epoch": 10.615132558832292, "grad_norm": 1.287965178489685, "learning_rate": 2.667868685342148e-05, "loss": 0.5281, "num_input_tokens_seen": 41377552, "step": 71270 }, { "epoch": 10.61587727137325, "grad_norm": 1.1419477462768555, "learning_rate": 2.6675444751540068e-05, "loss": 0.618, "num_input_tokens_seen": 41380656, "step": 71275 }, { "epoch": 10.616621983914209, "grad_norm": 1.2138265371322632, "learning_rate": 2.667220262135372e-05, "loss": 0.623, "num_input_tokens_seen": 41383984, "step": 71280 }, { "epoch": 10.617366696455168, "grad_norm": 1.0169545412063599, "learning_rate": 2.666896046291721e-05, "loss": 0.671, "num_input_tokens_seen": 41386704, "step": 71285 }, { "epoch": 10.618111408996128, "grad_norm": 1.7602248191833496, "learning_rate": 2.6665718276285312e-05, "loss": 0.6065, "num_input_tokens_seen": 41389712, "step": 71290 }, { "epoch": 10.618856121537087, "grad_norm": 1.3304957151412964, "learning_rate": 2.66624760615128e-05, "loss": 0.6293, "num_input_tokens_seen": 41392912, "step": 71295 }, { "epoch": 10.619600834078046, "grad_norm": 1.50105619430542, "learning_rate": 2.6659233818654434e-05, "loss": 0.551, "num_input_tokens_seen": 41395856, "step": 71300 }, { "epoch": 10.620345546619005, "grad_norm": 1.4386131763458252, "learning_rate": 2.6655991547765e-05, "loss": 0.718, "num_input_tokens_seen": 41398896, "step": 71305 }, { "epoch": 10.621090259159963, "grad_norm": 0.9145070910453796, "learning_rate": 2.6652749248899277e-05, "loss": 0.579, "num_input_tokens_seen": 41401840, "step": 71310 }, { "epoch": 10.621834971700924, "grad_norm": 1.3800445795059204, "learning_rate": 2.6649506922112033e-05, "loss": 0.7751, "num_input_tokens_seen": 41404720, "step": 71315 }, { "epoch": 10.622579684241883, "grad_norm": 1.6300139427185059, "learning_rate": 2.6646264567458052e-05, "loss": 0.4991, "num_input_tokens_seen": 41407888, "step": 71320 }, { "epoch": 10.623324396782841, "grad_norm": 1.5058345794677734, "learning_rate": 2.6643022184992096e-05, "loss": 0.917, "num_input_tokens_seen": 41410736, "step": 71325 }, { "epoch": 10.6240691093238, "grad_norm": 2.4063169956207275, "learning_rate": 2.6639779774768953e-05, "loss": 0.6445, "num_input_tokens_seen": 41413520, "step": 71330 }, { "epoch": 10.62481382186476, "grad_norm": 1.3573907613754272, "learning_rate": 2.6636537336843396e-05, "loss": 0.558, "num_input_tokens_seen": 41416336, "step": 71335 }, { "epoch": 10.62555853440572, "grad_norm": 2.1395585536956787, "learning_rate": 2.663329487127021e-05, "loss": 0.4582, "num_input_tokens_seen": 41419280, "step": 71340 }, { "epoch": 10.626303246946678, "grad_norm": 1.8018410205841064, "learning_rate": 2.663005237810416e-05, "loss": 0.5912, "num_input_tokens_seen": 41422256, "step": 71345 }, { "epoch": 10.627047959487637, "grad_norm": 2.1606879234313965, "learning_rate": 2.6626809857400033e-05, "loss": 0.6295, "num_input_tokens_seen": 41424944, "step": 71350 }, { "epoch": 10.627792672028598, "grad_norm": 1.0816943645477295, "learning_rate": 2.662356730921261e-05, "loss": 0.5841, "num_input_tokens_seen": 41427888, "step": 71355 }, { "epoch": 10.628537384569556, "grad_norm": 1.2831143140792847, "learning_rate": 2.6620324733596664e-05, "loss": 0.6207, "num_input_tokens_seen": 41430864, "step": 71360 }, { "epoch": 10.629282097110515, "grad_norm": 0.9319922924041748, "learning_rate": 2.661708213060698e-05, "loss": 0.545, "num_input_tokens_seen": 41433744, "step": 71365 }, { "epoch": 10.630026809651474, "grad_norm": 2.006199598312378, "learning_rate": 2.661383950029834e-05, "loss": 0.6227, "num_input_tokens_seen": 41436368, "step": 71370 }, { "epoch": 10.630771522192434, "grad_norm": 1.6388307809829712, "learning_rate": 2.6610596842725522e-05, "loss": 0.6947, "num_input_tokens_seen": 41439056, "step": 71375 }, { "epoch": 10.631516234733393, "grad_norm": 1.1665507555007935, "learning_rate": 2.66073541579433e-05, "loss": 0.5825, "num_input_tokens_seen": 41442064, "step": 71380 }, { "epoch": 10.632260947274352, "grad_norm": 1.182491421699524, "learning_rate": 2.6604111446006464e-05, "loss": 0.5521, "num_input_tokens_seen": 41444688, "step": 71385 }, { "epoch": 10.63300565981531, "grad_norm": 1.2096598148345947, "learning_rate": 2.6600868706969806e-05, "loss": 0.6675, "num_input_tokens_seen": 41447632, "step": 71390 }, { "epoch": 10.633750372356271, "grad_norm": 2.7156805992126465, "learning_rate": 2.6597625940888087e-05, "loss": 0.8904, "num_input_tokens_seen": 41450640, "step": 71395 }, { "epoch": 10.63449508489723, "grad_norm": 2.165980100631714, "learning_rate": 2.6594383147816103e-05, "loss": 0.6699, "num_input_tokens_seen": 41453648, "step": 71400 }, { "epoch": 10.635239797438189, "grad_norm": 2.1335575580596924, "learning_rate": 2.659114032780864e-05, "loss": 0.5222, "num_input_tokens_seen": 41456464, "step": 71405 }, { "epoch": 10.635984509979147, "grad_norm": 1.068244457244873, "learning_rate": 2.6587897480920478e-05, "loss": 0.5438, "num_input_tokens_seen": 41459632, "step": 71410 }, { "epoch": 10.636729222520108, "grad_norm": 1.4399775266647339, "learning_rate": 2.6584654607206404e-05, "loss": 0.5005, "num_input_tokens_seen": 41462448, "step": 71415 }, { "epoch": 10.637473935061067, "grad_norm": 2.4482672214508057, "learning_rate": 2.6581411706721194e-05, "loss": 0.679, "num_input_tokens_seen": 41465232, "step": 71420 }, { "epoch": 10.638218647602026, "grad_norm": 1.8322231769561768, "learning_rate": 2.6578168779519652e-05, "loss": 0.56, "num_input_tokens_seen": 41468112, "step": 71425 }, { "epoch": 10.638963360142984, "grad_norm": 0.702318549156189, "learning_rate": 2.657492582565654e-05, "loss": 0.5417, "num_input_tokens_seen": 41470992, "step": 71430 }, { "epoch": 10.639708072683945, "grad_norm": 1.6287654638290405, "learning_rate": 2.6571682845186662e-05, "loss": 0.7576, "num_input_tokens_seen": 41474032, "step": 71435 }, { "epoch": 10.640452785224904, "grad_norm": 1.8930237293243408, "learning_rate": 2.6568439838164798e-05, "loss": 0.7487, "num_input_tokens_seen": 41476848, "step": 71440 }, { "epoch": 10.641197497765862, "grad_norm": 3.961987257003784, "learning_rate": 2.6565196804645738e-05, "loss": 0.76, "num_input_tokens_seen": 41479664, "step": 71445 }, { "epoch": 10.641942210306821, "grad_norm": 1.8292884826660156, "learning_rate": 2.6561953744684264e-05, "loss": 0.566, "num_input_tokens_seen": 41482352, "step": 71450 }, { "epoch": 10.64268692284778, "grad_norm": 2.2016477584838867, "learning_rate": 2.655871065833518e-05, "loss": 0.6955, "num_input_tokens_seen": 41485360, "step": 71455 }, { "epoch": 10.64343163538874, "grad_norm": 1.571434736251831, "learning_rate": 2.655546754565326e-05, "loss": 0.7464, "num_input_tokens_seen": 41488048, "step": 71460 }, { "epoch": 10.6441763479297, "grad_norm": 0.9765572547912598, "learning_rate": 2.6552224406693293e-05, "loss": 0.5832, "num_input_tokens_seen": 41490992, "step": 71465 }, { "epoch": 10.644921060470658, "grad_norm": 1.2200934886932373, "learning_rate": 2.6548981241510073e-05, "loss": 0.5937, "num_input_tokens_seen": 41494128, "step": 71470 }, { "epoch": 10.645665773011618, "grad_norm": 0.9836954474449158, "learning_rate": 2.654573805015839e-05, "loss": 0.4859, "num_input_tokens_seen": 41497072, "step": 71475 }, { "epoch": 10.646410485552577, "grad_norm": 1.2292778491973877, "learning_rate": 2.654249483269303e-05, "loss": 0.5129, "num_input_tokens_seen": 41500016, "step": 71480 }, { "epoch": 10.647155198093536, "grad_norm": 2.872509241104126, "learning_rate": 2.65392515891688e-05, "loss": 0.7157, "num_input_tokens_seen": 41502640, "step": 71485 }, { "epoch": 10.647899910634495, "grad_norm": 1.4581878185272217, "learning_rate": 2.6536008319640466e-05, "loss": 0.5095, "num_input_tokens_seen": 41505584, "step": 71490 }, { "epoch": 10.648644623175453, "grad_norm": 1.4466794729232788, "learning_rate": 2.6532765024162837e-05, "loss": 0.4674, "num_input_tokens_seen": 41508400, "step": 71495 }, { "epoch": 10.649389335716414, "grad_norm": 0.8405512571334839, "learning_rate": 2.6529521702790705e-05, "loss": 0.5272, "num_input_tokens_seen": 41511216, "step": 71500 }, { "epoch": 10.650134048257373, "grad_norm": 3.170696973800659, "learning_rate": 2.6526278355578848e-05, "loss": 0.7627, "num_input_tokens_seen": 41514224, "step": 71505 }, { "epoch": 10.650878760798332, "grad_norm": 1.3036218881607056, "learning_rate": 2.6523034982582078e-05, "loss": 0.6031, "num_input_tokens_seen": 41517200, "step": 71510 }, { "epoch": 10.65162347333929, "grad_norm": 1.6233198642730713, "learning_rate": 2.6519791583855174e-05, "loss": 0.7182, "num_input_tokens_seen": 41520304, "step": 71515 }, { "epoch": 10.65236818588025, "grad_norm": 2.503148078918457, "learning_rate": 2.6516548159452943e-05, "loss": 0.5058, "num_input_tokens_seen": 41523024, "step": 71520 }, { "epoch": 10.65311289842121, "grad_norm": 2.655123233795166, "learning_rate": 2.651330470943017e-05, "loss": 0.6196, "num_input_tokens_seen": 41525744, "step": 71525 }, { "epoch": 10.653857610962168, "grad_norm": 1.0390385389328003, "learning_rate": 2.651006123384165e-05, "loss": 0.5265, "num_input_tokens_seen": 41528400, "step": 71530 }, { "epoch": 10.654602323503127, "grad_norm": 1.3081631660461426, "learning_rate": 2.6506817732742173e-05, "loss": 0.4997, "num_input_tokens_seen": 41531440, "step": 71535 }, { "epoch": 10.655347036044088, "grad_norm": 1.8087913990020752, "learning_rate": 2.6503574206186553e-05, "loss": 0.6911, "num_input_tokens_seen": 41534384, "step": 71540 }, { "epoch": 10.656091748585046, "grad_norm": 1.7147775888442993, "learning_rate": 2.6500330654229573e-05, "loss": 0.6225, "num_input_tokens_seen": 41537552, "step": 71545 }, { "epoch": 10.656836461126005, "grad_norm": 2.0593338012695312, "learning_rate": 2.649708707692603e-05, "loss": 0.5508, "num_input_tokens_seen": 41540304, "step": 71550 }, { "epoch": 10.657581173666964, "grad_norm": 1.4051311016082764, "learning_rate": 2.6493843474330727e-05, "loss": 0.6194, "num_input_tokens_seen": 41543440, "step": 71555 }, { "epoch": 10.658325886207924, "grad_norm": 2.3578176498413086, "learning_rate": 2.649059984649845e-05, "loss": 0.4488, "num_input_tokens_seen": 41546128, "step": 71560 }, { "epoch": 10.659070598748883, "grad_norm": 1.2247965335845947, "learning_rate": 2.6487356193484002e-05, "loss": 0.6977, "num_input_tokens_seen": 41548848, "step": 71565 }, { "epoch": 10.659815311289842, "grad_norm": 2.115583658218384, "learning_rate": 2.6484112515342186e-05, "loss": 0.6999, "num_input_tokens_seen": 41551504, "step": 71570 }, { "epoch": 10.6605600238308, "grad_norm": 0.8501832485198975, "learning_rate": 2.6480868812127795e-05, "loss": 0.4362, "num_input_tokens_seen": 41554192, "step": 71575 }, { "epoch": 10.661304736371761, "grad_norm": 0.9156924486160278, "learning_rate": 2.6477625083895636e-05, "loss": 0.6257, "num_input_tokens_seen": 41557136, "step": 71580 }, { "epoch": 10.66204944891272, "grad_norm": 1.5133308172225952, "learning_rate": 2.6474381330700497e-05, "loss": 0.5649, "num_input_tokens_seen": 41560016, "step": 71585 }, { "epoch": 10.662794161453679, "grad_norm": 2.005070686340332, "learning_rate": 2.6471137552597193e-05, "loss": 0.8369, "num_input_tokens_seen": 41563056, "step": 71590 }, { "epoch": 10.663538873994638, "grad_norm": 2.545464277267456, "learning_rate": 2.646789374964051e-05, "loss": 0.7404, "num_input_tokens_seen": 41565648, "step": 71595 }, { "epoch": 10.664283586535598, "grad_norm": 0.9347237348556519, "learning_rate": 2.6464649921885247e-05, "loss": 0.5697, "num_input_tokens_seen": 41568624, "step": 71600 }, { "epoch": 10.665028299076557, "grad_norm": 1.395941972732544, "learning_rate": 2.646140606938622e-05, "loss": 0.6571, "num_input_tokens_seen": 41571728, "step": 71605 }, { "epoch": 10.665773011617516, "grad_norm": 1.6886931657791138, "learning_rate": 2.6458162192198223e-05, "loss": 0.6789, "num_input_tokens_seen": 41574864, "step": 71610 }, { "epoch": 10.666517724158474, "grad_norm": 2.5123279094696045, "learning_rate": 2.6454918290376053e-05, "loss": 0.7883, "num_input_tokens_seen": 41577840, "step": 71615 }, { "epoch": 10.667262436699435, "grad_norm": 2.1204798221588135, "learning_rate": 2.645167436397452e-05, "loss": 0.5943, "num_input_tokens_seen": 41580848, "step": 71620 }, { "epoch": 10.668007149240394, "grad_norm": 1.0359762907028198, "learning_rate": 2.644843041304843e-05, "loss": 0.6302, "num_input_tokens_seen": 41583824, "step": 71625 }, { "epoch": 10.668751861781352, "grad_norm": 1.4451310634613037, "learning_rate": 2.6445186437652577e-05, "loss": 0.6658, "num_input_tokens_seen": 41586544, "step": 71630 }, { "epoch": 10.669496574322311, "grad_norm": 2.0051887035369873, "learning_rate": 2.644194243784176e-05, "loss": 0.9086, "num_input_tokens_seen": 41589232, "step": 71635 }, { "epoch": 10.67024128686327, "grad_norm": 1.0064187049865723, "learning_rate": 2.64386984136708e-05, "loss": 0.6568, "num_input_tokens_seen": 41592080, "step": 71640 }, { "epoch": 10.67098599940423, "grad_norm": 1.5500729084014893, "learning_rate": 2.6435454365194483e-05, "loss": 0.7986, "num_input_tokens_seen": 41595056, "step": 71645 }, { "epoch": 10.67173071194519, "grad_norm": 1.3506113290786743, "learning_rate": 2.6432210292467634e-05, "loss": 0.5235, "num_input_tokens_seen": 41597776, "step": 71650 }, { "epoch": 10.672475424486148, "grad_norm": 1.1463419198989868, "learning_rate": 2.642896619554504e-05, "loss": 0.5937, "num_input_tokens_seen": 41600496, "step": 71655 }, { "epoch": 10.673220137027108, "grad_norm": 1.830057144165039, "learning_rate": 2.6425722074481525e-05, "loss": 0.6204, "num_input_tokens_seen": 41603472, "step": 71660 }, { "epoch": 10.673964849568067, "grad_norm": 2.7443137168884277, "learning_rate": 2.6422477929331875e-05, "loss": 0.613, "num_input_tokens_seen": 41606352, "step": 71665 }, { "epoch": 10.674709562109026, "grad_norm": 2.563218593597412, "learning_rate": 2.6419233760150907e-05, "loss": 0.6749, "num_input_tokens_seen": 41609296, "step": 71670 }, { "epoch": 10.675454274649985, "grad_norm": 0.7968015670776367, "learning_rate": 2.6415989566993425e-05, "loss": 0.5106, "num_input_tokens_seen": 41612016, "step": 71675 }, { "epoch": 10.676198987190944, "grad_norm": 0.6456744074821472, "learning_rate": 2.6412745349914242e-05, "loss": 0.4866, "num_input_tokens_seen": 41614864, "step": 71680 }, { "epoch": 10.676943699731904, "grad_norm": 0.8576405644416809, "learning_rate": 2.6409501108968164e-05, "loss": 0.6451, "num_input_tokens_seen": 41617712, "step": 71685 }, { "epoch": 10.677688412272863, "grad_norm": 1.6018205881118774, "learning_rate": 2.6406256844209998e-05, "loss": 0.5952, "num_input_tokens_seen": 41620656, "step": 71690 }, { "epoch": 10.678433124813822, "grad_norm": 0.6999890804290771, "learning_rate": 2.6403012555694546e-05, "loss": 0.7447, "num_input_tokens_seen": 41623504, "step": 71695 }, { "epoch": 10.67917783735478, "grad_norm": 1.6740621328353882, "learning_rate": 2.639976824347663e-05, "loss": 0.6166, "num_input_tokens_seen": 41626480, "step": 71700 }, { "epoch": 10.67992254989574, "grad_norm": 1.9885685443878174, "learning_rate": 2.639652390761105e-05, "loss": 0.8289, "num_input_tokens_seen": 41629328, "step": 71705 }, { "epoch": 10.6806672624367, "grad_norm": 2.4272561073303223, "learning_rate": 2.639327954815261e-05, "loss": 0.5942, "num_input_tokens_seen": 41632368, "step": 71710 }, { "epoch": 10.681411974977658, "grad_norm": 0.7586454749107361, "learning_rate": 2.6390035165156136e-05, "loss": 0.4734, "num_input_tokens_seen": 41635216, "step": 71715 }, { "epoch": 10.682156687518617, "grad_norm": 2.1548564434051514, "learning_rate": 2.6386790758676432e-05, "loss": 0.5682, "num_input_tokens_seen": 41638032, "step": 71720 }, { "epoch": 10.682901400059578, "grad_norm": 1.286489725112915, "learning_rate": 2.6383546328768305e-05, "loss": 0.747, "num_input_tokens_seen": 41640848, "step": 71725 }, { "epoch": 10.683646112600536, "grad_norm": 1.5182340145111084, "learning_rate": 2.6380301875486568e-05, "loss": 0.5343, "num_input_tokens_seen": 41643760, "step": 71730 }, { "epoch": 10.684390825141495, "grad_norm": 1.2615633010864258, "learning_rate": 2.637705739888603e-05, "loss": 0.6928, "num_input_tokens_seen": 41646416, "step": 71735 }, { "epoch": 10.685135537682454, "grad_norm": 1.2516632080078125, "learning_rate": 2.6373812899021516e-05, "loss": 0.6278, "num_input_tokens_seen": 41649328, "step": 71740 }, { "epoch": 10.685880250223414, "grad_norm": 1.2518774271011353, "learning_rate": 2.6370568375947825e-05, "loss": 0.5456, "num_input_tokens_seen": 41652272, "step": 71745 }, { "epoch": 10.686624962764373, "grad_norm": 1.1366974115371704, "learning_rate": 2.636732382971977e-05, "loss": 0.5053, "num_input_tokens_seen": 41655344, "step": 71750 }, { "epoch": 10.687369675305332, "grad_norm": 1.5259588956832886, "learning_rate": 2.6364079260392178e-05, "loss": 0.5978, "num_input_tokens_seen": 41658096, "step": 71755 }, { "epoch": 10.68811438784629, "grad_norm": 0.9789321422576904, "learning_rate": 2.6360834668019845e-05, "loss": 0.4729, "num_input_tokens_seen": 41660880, "step": 71760 }, { "epoch": 10.688859100387251, "grad_norm": 1.039425015449524, "learning_rate": 2.6357590052657595e-05, "loss": 0.5882, "num_input_tokens_seen": 41663632, "step": 71765 }, { "epoch": 10.68960381292821, "grad_norm": 0.8247698545455933, "learning_rate": 2.6354345414360236e-05, "loss": 0.5756, "num_input_tokens_seen": 41666448, "step": 71770 }, { "epoch": 10.690348525469169, "grad_norm": 1.4351119995117188, "learning_rate": 2.6351100753182594e-05, "loss": 0.6076, "num_input_tokens_seen": 41669392, "step": 71775 }, { "epoch": 10.691093238010128, "grad_norm": 1.1056045293807983, "learning_rate": 2.6347856069179483e-05, "loss": 0.6137, "num_input_tokens_seen": 41672784, "step": 71780 }, { "epoch": 10.691837950551088, "grad_norm": 0.8178455233573914, "learning_rate": 2.6344611362405708e-05, "loss": 0.5147, "num_input_tokens_seen": 41675536, "step": 71785 }, { "epoch": 10.692582663092047, "grad_norm": 1.4167091846466064, "learning_rate": 2.634136663291609e-05, "loss": 0.7015, "num_input_tokens_seen": 41678352, "step": 71790 }, { "epoch": 10.693327375633006, "grad_norm": 1.2996652126312256, "learning_rate": 2.6338121880765447e-05, "loss": 0.7425, "num_input_tokens_seen": 41680976, "step": 71795 }, { "epoch": 10.694072088173964, "grad_norm": 1.180487036705017, "learning_rate": 2.6334877106008594e-05, "loss": 0.6234, "num_input_tokens_seen": 41683952, "step": 71800 }, { "epoch": 10.694816800714925, "grad_norm": 1.0907107591629028, "learning_rate": 2.633163230870035e-05, "loss": 0.4622, "num_input_tokens_seen": 41686864, "step": 71805 }, { "epoch": 10.695561513255884, "grad_norm": 1.2842843532562256, "learning_rate": 2.632838748889553e-05, "loss": 0.5757, "num_input_tokens_seen": 41689840, "step": 71810 }, { "epoch": 10.696306225796842, "grad_norm": 1.7148312330245972, "learning_rate": 2.6325142646648958e-05, "loss": 0.5059, "num_input_tokens_seen": 41692784, "step": 71815 }, { "epoch": 10.697050938337801, "grad_norm": 1.505416750907898, "learning_rate": 2.632189778201544e-05, "loss": 0.5893, "num_input_tokens_seen": 41695280, "step": 71820 }, { "epoch": 10.69779565087876, "grad_norm": 0.9325193166732788, "learning_rate": 2.631865289504981e-05, "loss": 0.5502, "num_input_tokens_seen": 41698320, "step": 71825 }, { "epoch": 10.69854036341972, "grad_norm": 1.1116598844528198, "learning_rate": 2.631540798580688e-05, "loss": 0.4696, "num_input_tokens_seen": 41701136, "step": 71830 }, { "epoch": 10.69928507596068, "grad_norm": 1.8605377674102783, "learning_rate": 2.6312163054341464e-05, "loss": 0.5562, "num_input_tokens_seen": 41703984, "step": 71835 }, { "epoch": 10.700029788501638, "grad_norm": 2.271165609359741, "learning_rate": 2.6308918100708386e-05, "loss": 0.7301, "num_input_tokens_seen": 41707184, "step": 71840 }, { "epoch": 10.700774501042597, "grad_norm": 0.9682185649871826, "learning_rate": 2.6305673124962466e-05, "loss": 0.6933, "num_input_tokens_seen": 41710096, "step": 71845 }, { "epoch": 10.701519213583557, "grad_norm": 1.6452158689498901, "learning_rate": 2.6302428127158535e-05, "loss": 0.6864, "num_input_tokens_seen": 41713008, "step": 71850 }, { "epoch": 10.702263926124516, "grad_norm": 0.9939602613449097, "learning_rate": 2.6299183107351395e-05, "loss": 0.5257, "num_input_tokens_seen": 41715728, "step": 71855 }, { "epoch": 10.703008638665475, "grad_norm": 1.0306446552276611, "learning_rate": 2.629593806559589e-05, "loss": 0.5468, "num_input_tokens_seen": 41718448, "step": 71860 }, { "epoch": 10.703753351206434, "grad_norm": 1.4967232942581177, "learning_rate": 2.629269300194681e-05, "loss": 0.8293, "num_input_tokens_seen": 41721232, "step": 71865 }, { "epoch": 10.704498063747394, "grad_norm": 2.2846004962921143, "learning_rate": 2.6289447916459005e-05, "loss": 0.606, "num_input_tokens_seen": 41724016, "step": 71870 }, { "epoch": 10.705242776288353, "grad_norm": 2.176814317703247, "learning_rate": 2.628620280918729e-05, "loss": 0.5772, "num_input_tokens_seen": 41726960, "step": 71875 }, { "epoch": 10.705987488829312, "grad_norm": 1.274034857749939, "learning_rate": 2.6282957680186476e-05, "loss": 0.6838, "num_input_tokens_seen": 41730128, "step": 71880 }, { "epoch": 10.70673220137027, "grad_norm": 0.870903491973877, "learning_rate": 2.6279712529511406e-05, "loss": 0.6951, "num_input_tokens_seen": 41733072, "step": 71885 }, { "epoch": 10.707476913911231, "grad_norm": 1.2289854288101196, "learning_rate": 2.6276467357216895e-05, "loss": 0.5557, "num_input_tokens_seen": 41735888, "step": 71890 }, { "epoch": 10.70822162645219, "grad_norm": 1.1652076244354248, "learning_rate": 2.627322216335776e-05, "loss": 0.7314, "num_input_tokens_seen": 41738736, "step": 71895 }, { "epoch": 10.708966338993148, "grad_norm": 1.3597772121429443, "learning_rate": 2.6269976947988834e-05, "loss": 0.5086, "num_input_tokens_seen": 41741584, "step": 71900 }, { "epoch": 10.709711051534107, "grad_norm": 1.1205557584762573, "learning_rate": 2.626673171116493e-05, "loss": 0.7671, "num_input_tokens_seen": 41744880, "step": 71905 }, { "epoch": 10.710455764075068, "grad_norm": 1.712043046951294, "learning_rate": 2.626348645294089e-05, "loss": 0.6933, "num_input_tokens_seen": 41747984, "step": 71910 }, { "epoch": 10.711200476616026, "grad_norm": 1.1866376399993896, "learning_rate": 2.6260241173371525e-05, "loss": 0.6613, "num_input_tokens_seen": 41751440, "step": 71915 }, { "epoch": 10.711945189156985, "grad_norm": 1.0116450786590576, "learning_rate": 2.625699587251167e-05, "loss": 0.5448, "num_input_tokens_seen": 41754512, "step": 71920 }, { "epoch": 10.712689901697944, "grad_norm": 1.5354037284851074, "learning_rate": 2.6253750550416144e-05, "loss": 0.6424, "num_input_tokens_seen": 41757232, "step": 71925 }, { "epoch": 10.713434614238905, "grad_norm": 2.4091105461120605, "learning_rate": 2.6250505207139782e-05, "loss": 0.669, "num_input_tokens_seen": 41760176, "step": 71930 }, { "epoch": 10.714179326779863, "grad_norm": 1.389673113822937, "learning_rate": 2.62472598427374e-05, "loss": 0.5911, "num_input_tokens_seen": 41762992, "step": 71935 }, { "epoch": 10.714924039320822, "grad_norm": 1.3659287691116333, "learning_rate": 2.624401445726383e-05, "loss": 0.5695, "num_input_tokens_seen": 41766224, "step": 71940 }, { "epoch": 10.71566875186178, "grad_norm": 1.3428804874420166, "learning_rate": 2.6240769050773906e-05, "loss": 0.828, "num_input_tokens_seen": 41768944, "step": 71945 }, { "epoch": 10.716413464402741, "grad_norm": 1.5324290990829468, "learning_rate": 2.6237523623322446e-05, "loss": 0.6488, "num_input_tokens_seen": 41772048, "step": 71950 }, { "epoch": 10.7171581769437, "grad_norm": 1.5508452653884888, "learning_rate": 2.6234278174964288e-05, "loss": 0.6799, "num_input_tokens_seen": 41774864, "step": 71955 }, { "epoch": 10.717902889484659, "grad_norm": 1.43190598487854, "learning_rate": 2.6231032705754243e-05, "loss": 0.4559, "num_input_tokens_seen": 41777712, "step": 71960 }, { "epoch": 10.718647602025618, "grad_norm": 1.4129306077957153, "learning_rate": 2.6227787215747156e-05, "loss": 0.5412, "num_input_tokens_seen": 41780752, "step": 71965 }, { "epoch": 10.719392314566576, "grad_norm": 1.4827615022659302, "learning_rate": 2.6224541704997856e-05, "loss": 0.7095, "num_input_tokens_seen": 41783536, "step": 71970 }, { "epoch": 10.720137027107537, "grad_norm": 1.1859606504440308, "learning_rate": 2.6221296173561166e-05, "loss": 0.6374, "num_input_tokens_seen": 41786160, "step": 71975 }, { "epoch": 10.720881739648496, "grad_norm": 2.1883492469787598, "learning_rate": 2.6218050621491925e-05, "loss": 0.6403, "num_input_tokens_seen": 41788816, "step": 71980 }, { "epoch": 10.721626452189454, "grad_norm": 1.2060900926589966, "learning_rate": 2.6214805048844947e-05, "loss": 0.5086, "num_input_tokens_seen": 41791632, "step": 71985 }, { "epoch": 10.722371164730415, "grad_norm": 1.4226078987121582, "learning_rate": 2.621155945567508e-05, "loss": 0.7028, "num_input_tokens_seen": 41794480, "step": 71990 }, { "epoch": 10.723115877271374, "grad_norm": 1.061009407043457, "learning_rate": 2.620831384203714e-05, "loss": 0.5388, "num_input_tokens_seen": 41797424, "step": 71995 }, { "epoch": 10.723860589812332, "grad_norm": 1.0824263095855713, "learning_rate": 2.6205068207985965e-05, "loss": 0.6425, "num_input_tokens_seen": 41800176, "step": 72000 }, { "epoch": 10.724605302353291, "grad_norm": 1.7635732889175415, "learning_rate": 2.6201822553576394e-05, "loss": 0.537, "num_input_tokens_seen": 41802992, "step": 72005 }, { "epoch": 10.72535001489425, "grad_norm": 1.0364774465560913, "learning_rate": 2.619857687886325e-05, "loss": 0.7071, "num_input_tokens_seen": 41805808, "step": 72010 }, { "epoch": 10.72609472743521, "grad_norm": 2.007795572280884, "learning_rate": 2.6195331183901374e-05, "loss": 0.7029, "num_input_tokens_seen": 41808880, "step": 72015 }, { "epoch": 10.72683943997617, "grad_norm": 1.8955100774765015, "learning_rate": 2.6192085468745585e-05, "loss": 0.5727, "num_input_tokens_seen": 41811888, "step": 72020 }, { "epoch": 10.727584152517128, "grad_norm": 1.1388956308364868, "learning_rate": 2.6188839733450727e-05, "loss": 0.6762, "num_input_tokens_seen": 41814640, "step": 72025 }, { "epoch": 10.728328865058087, "grad_norm": 2.208721876144409, "learning_rate": 2.6185593978071627e-05, "loss": 0.6304, "num_input_tokens_seen": 41817232, "step": 72030 }, { "epoch": 10.729073577599047, "grad_norm": 1.3065992593765259, "learning_rate": 2.6182348202663122e-05, "loss": 0.547, "num_input_tokens_seen": 41820016, "step": 72035 }, { "epoch": 10.729818290140006, "grad_norm": 1.3547803163528442, "learning_rate": 2.617910240728004e-05, "loss": 0.6133, "num_input_tokens_seen": 41822864, "step": 72040 }, { "epoch": 10.730563002680965, "grad_norm": 1.568881630897522, "learning_rate": 2.6175856591977226e-05, "loss": 0.3428, "num_input_tokens_seen": 41826064, "step": 72045 }, { "epoch": 10.731307715221924, "grad_norm": 1.0424673557281494, "learning_rate": 2.6172610756809517e-05, "loss": 0.5909, "num_input_tokens_seen": 41828944, "step": 72050 }, { "epoch": 10.732052427762884, "grad_norm": 1.070877194404602, "learning_rate": 2.6169364901831732e-05, "loss": 0.4735, "num_input_tokens_seen": 41831664, "step": 72055 }, { "epoch": 10.732797140303843, "grad_norm": 1.4580917358398438, "learning_rate": 2.6166119027098724e-05, "loss": 0.5978, "num_input_tokens_seen": 41834896, "step": 72060 }, { "epoch": 10.733541852844802, "grad_norm": 2.0926549434661865, "learning_rate": 2.6162873132665315e-05, "loss": 0.6203, "num_input_tokens_seen": 41837680, "step": 72065 }, { "epoch": 10.73428656538576, "grad_norm": 0.9467275738716125, "learning_rate": 2.6159627218586345e-05, "loss": 0.444, "num_input_tokens_seen": 41840496, "step": 72070 }, { "epoch": 10.735031277926721, "grad_norm": 1.4354444742202759, "learning_rate": 2.6156381284916653e-05, "loss": 0.5647, "num_input_tokens_seen": 41843728, "step": 72075 }, { "epoch": 10.73577599046768, "grad_norm": 0.8625853061676025, "learning_rate": 2.615313533171107e-05, "loss": 0.4471, "num_input_tokens_seen": 41846896, "step": 72080 }, { "epoch": 10.736520703008638, "grad_norm": 1.6966404914855957, "learning_rate": 2.6149889359024447e-05, "loss": 0.5508, "num_input_tokens_seen": 41850096, "step": 72085 }, { "epoch": 10.737265415549597, "grad_norm": 1.8500698804855347, "learning_rate": 2.6146643366911612e-05, "loss": 0.7303, "num_input_tokens_seen": 41853040, "step": 72090 }, { "epoch": 10.738010128090558, "grad_norm": 1.389326572418213, "learning_rate": 2.614339735542739e-05, "loss": 0.7795, "num_input_tokens_seen": 41856208, "step": 72095 }, { "epoch": 10.738754840631517, "grad_norm": 2.0434885025024414, "learning_rate": 2.6140151324626644e-05, "loss": 0.4947, "num_input_tokens_seen": 41859408, "step": 72100 }, { "epoch": 10.739499553172475, "grad_norm": 1.7946418523788452, "learning_rate": 2.61369052745642e-05, "loss": 0.6363, "num_input_tokens_seen": 41862448, "step": 72105 }, { "epoch": 10.740244265713434, "grad_norm": 1.0277727842330933, "learning_rate": 2.6133659205294892e-05, "loss": 0.6497, "num_input_tokens_seen": 41865392, "step": 72110 }, { "epoch": 10.740988978254395, "grad_norm": 1.1055315732955933, "learning_rate": 2.6130413116873557e-05, "loss": 0.5064, "num_input_tokens_seen": 41868304, "step": 72115 }, { "epoch": 10.741733690795353, "grad_norm": 3.952690839767456, "learning_rate": 2.6127167009355058e-05, "loss": 0.6522, "num_input_tokens_seen": 41871216, "step": 72120 }, { "epoch": 10.742478403336312, "grad_norm": 1.8934496641159058, "learning_rate": 2.6123920882794208e-05, "loss": 0.6688, "num_input_tokens_seen": 41874064, "step": 72125 }, { "epoch": 10.74322311587727, "grad_norm": 0.9047168493270874, "learning_rate": 2.6120674737245854e-05, "loss": 0.5948, "num_input_tokens_seen": 41876560, "step": 72130 }, { "epoch": 10.743967828418231, "grad_norm": 1.5340920686721802, "learning_rate": 2.611742857276484e-05, "loss": 0.4918, "num_input_tokens_seen": 41879280, "step": 72135 }, { "epoch": 10.74471254095919, "grad_norm": 1.1994439363479614, "learning_rate": 2.6114182389406012e-05, "loss": 0.6385, "num_input_tokens_seen": 41882064, "step": 72140 }, { "epoch": 10.745457253500149, "grad_norm": 1.800662636756897, "learning_rate": 2.6110936187224205e-05, "loss": 0.7488, "num_input_tokens_seen": 41885072, "step": 72145 }, { "epoch": 10.746201966041108, "grad_norm": 2.053961753845215, "learning_rate": 2.610768996627426e-05, "loss": 0.52, "num_input_tokens_seen": 41887952, "step": 72150 }, { "epoch": 10.746946678582066, "grad_norm": 2.0822489261627197, "learning_rate": 2.6104443726611016e-05, "loss": 1.0495, "num_input_tokens_seen": 41890960, "step": 72155 }, { "epoch": 10.747691391123027, "grad_norm": 1.29120934009552, "learning_rate": 2.610119746828932e-05, "loss": 0.4949, "num_input_tokens_seen": 41893840, "step": 72160 }, { "epoch": 10.748436103663986, "grad_norm": 1.0372127294540405, "learning_rate": 2.6097951191364007e-05, "loss": 0.5272, "num_input_tokens_seen": 41896656, "step": 72165 }, { "epoch": 10.749180816204944, "grad_norm": 1.3491384983062744, "learning_rate": 2.6094704895889927e-05, "loss": 0.5683, "num_input_tokens_seen": 41899408, "step": 72170 }, { "epoch": 10.749925528745905, "grad_norm": 1.8808507919311523, "learning_rate": 2.609145858192192e-05, "loss": 0.6532, "num_input_tokens_seen": 41902224, "step": 72175 }, { "epoch": 10.750670241286864, "grad_norm": 1.512399435043335, "learning_rate": 2.608821224951483e-05, "loss": 0.7685, "num_input_tokens_seen": 41905392, "step": 72180 }, { "epoch": 10.751414953827823, "grad_norm": 2.9197425842285156, "learning_rate": 2.608496589872351e-05, "loss": 0.5183, "num_input_tokens_seen": 41908208, "step": 72185 }, { "epoch": 10.752159666368781, "grad_norm": 3.805781126022339, "learning_rate": 2.6081719529602776e-05, "loss": 0.5704, "num_input_tokens_seen": 41910896, "step": 72190 }, { "epoch": 10.75290437890974, "grad_norm": 1.037207007408142, "learning_rate": 2.6078473142207498e-05, "loss": 0.622, "num_input_tokens_seen": 41913744, "step": 72195 }, { "epoch": 10.7536490914507, "grad_norm": 1.813877820968628, "learning_rate": 2.607522673659251e-05, "loss": 0.7123, "num_input_tokens_seen": 41916208, "step": 72200 }, { "epoch": 10.75439380399166, "grad_norm": 2.361605644226074, "learning_rate": 2.6071980312812665e-05, "loss": 0.7271, "num_input_tokens_seen": 41919184, "step": 72205 }, { "epoch": 10.755138516532618, "grad_norm": 2.1070351600646973, "learning_rate": 2.6068733870922797e-05, "loss": 0.7178, "num_input_tokens_seen": 41922096, "step": 72210 }, { "epoch": 10.755883229073577, "grad_norm": 3.3816189765930176, "learning_rate": 2.606548741097776e-05, "loss": 0.8789, "num_input_tokens_seen": 41925072, "step": 72215 }, { "epoch": 10.756627941614537, "grad_norm": 1.9263194799423218, "learning_rate": 2.6062240933032394e-05, "loss": 0.5201, "num_input_tokens_seen": 41927728, "step": 72220 }, { "epoch": 10.757372654155496, "grad_norm": 0.9066363573074341, "learning_rate": 2.6058994437141554e-05, "loss": 0.644, "num_input_tokens_seen": 41930640, "step": 72225 }, { "epoch": 10.758117366696455, "grad_norm": 1.216787576675415, "learning_rate": 2.605574792336007e-05, "loss": 0.7312, "num_input_tokens_seen": 41933680, "step": 72230 }, { "epoch": 10.758862079237414, "grad_norm": 1.62227201461792, "learning_rate": 2.6052501391742802e-05, "loss": 0.4524, "num_input_tokens_seen": 41936496, "step": 72235 }, { "epoch": 10.759606791778374, "grad_norm": 0.7964901328086853, "learning_rate": 2.604925484234459e-05, "loss": 0.4603, "num_input_tokens_seen": 41939472, "step": 72240 }, { "epoch": 10.760351504319333, "grad_norm": 1.7053120136260986, "learning_rate": 2.6046008275220286e-05, "loss": 0.5772, "num_input_tokens_seen": 41942320, "step": 72245 }, { "epoch": 10.761096216860292, "grad_norm": 0.9914297461509705, "learning_rate": 2.604276169042473e-05, "loss": 0.5857, "num_input_tokens_seen": 41945232, "step": 72250 }, { "epoch": 10.76184092940125, "grad_norm": 0.8180022835731506, "learning_rate": 2.6039515088012783e-05, "loss": 0.6773, "num_input_tokens_seen": 41948432, "step": 72255 }, { "epoch": 10.762585641942211, "grad_norm": 1.9818493127822876, "learning_rate": 2.6036268468039282e-05, "loss": 0.6766, "num_input_tokens_seen": 41951248, "step": 72260 }, { "epoch": 10.76333035448317, "grad_norm": 1.4918304681777954, "learning_rate": 2.603302183055908e-05, "loss": 0.6546, "num_input_tokens_seen": 41954032, "step": 72265 }, { "epoch": 10.764075067024129, "grad_norm": 0.9900903701782227, "learning_rate": 2.6029775175627024e-05, "loss": 0.5805, "num_input_tokens_seen": 41956848, "step": 72270 }, { "epoch": 10.764819779565087, "grad_norm": 2.0104410648345947, "learning_rate": 2.602652850329796e-05, "loss": 0.7567, "num_input_tokens_seen": 41959536, "step": 72275 }, { "epoch": 10.765564492106048, "grad_norm": 1.3619269132614136, "learning_rate": 2.6023281813626737e-05, "loss": 0.5064, "num_input_tokens_seen": 41962416, "step": 72280 }, { "epoch": 10.766309204647007, "grad_norm": 1.2062501907348633, "learning_rate": 2.602003510666822e-05, "loss": 0.5313, "num_input_tokens_seen": 41965200, "step": 72285 }, { "epoch": 10.767053917187965, "grad_norm": 1.3659260272979736, "learning_rate": 2.6016788382477238e-05, "loss": 0.7017, "num_input_tokens_seen": 41968176, "step": 72290 }, { "epoch": 10.767798629728924, "grad_norm": 1.2909384965896606, "learning_rate": 2.6013541641108646e-05, "loss": 0.443, "num_input_tokens_seen": 41971056, "step": 72295 }, { "epoch": 10.768543342269885, "grad_norm": 2.0462770462036133, "learning_rate": 2.6010294882617304e-05, "loss": 0.8112, "num_input_tokens_seen": 41973872, "step": 72300 }, { "epoch": 10.769288054810843, "grad_norm": 2.624981641769409, "learning_rate": 2.6007048107058058e-05, "loss": 0.6815, "num_input_tokens_seen": 41976848, "step": 72305 }, { "epoch": 10.770032767351802, "grad_norm": 0.9227169156074524, "learning_rate": 2.6003801314485755e-05, "loss": 0.5827, "num_input_tokens_seen": 41979568, "step": 72310 }, { "epoch": 10.770777479892761, "grad_norm": 2.040109872817993, "learning_rate": 2.600055450495525e-05, "loss": 0.6031, "num_input_tokens_seen": 41982576, "step": 72315 }, { "epoch": 10.771522192433721, "grad_norm": 0.968670129776001, "learning_rate": 2.5997307678521392e-05, "loss": 0.714, "num_input_tokens_seen": 41985584, "step": 72320 }, { "epoch": 10.77226690497468, "grad_norm": 1.3493744134902954, "learning_rate": 2.5994060835239036e-05, "loss": 0.5418, "num_input_tokens_seen": 41989168, "step": 72325 }, { "epoch": 10.773011617515639, "grad_norm": 1.0447496175765991, "learning_rate": 2.5990813975163032e-05, "loss": 0.7145, "num_input_tokens_seen": 41992080, "step": 72330 }, { "epoch": 10.773756330056598, "grad_norm": 1.2590590715408325, "learning_rate": 2.598756709834823e-05, "loss": 0.6728, "num_input_tokens_seen": 41994928, "step": 72335 }, { "epoch": 10.774501042597556, "grad_norm": 1.5250861644744873, "learning_rate": 2.598432020484949e-05, "loss": 0.7203, "num_input_tokens_seen": 41997744, "step": 72340 }, { "epoch": 10.775245755138517, "grad_norm": 1.882092833518982, "learning_rate": 2.598107329472166e-05, "loss": 0.5455, "num_input_tokens_seen": 42000400, "step": 72345 }, { "epoch": 10.775990467679476, "grad_norm": 2.0492196083068848, "learning_rate": 2.5977826368019598e-05, "loss": 0.6639, "num_input_tokens_seen": 42003120, "step": 72350 }, { "epoch": 10.776735180220435, "grad_norm": 0.9351598620414734, "learning_rate": 2.5974579424798146e-05, "loss": 0.7192, "num_input_tokens_seen": 42006032, "step": 72355 }, { "epoch": 10.777479892761393, "grad_norm": 1.7777458429336548, "learning_rate": 2.5971332465112165e-05, "loss": 0.7925, "num_input_tokens_seen": 42009232, "step": 72360 }, { "epoch": 10.778224605302354, "grad_norm": 1.2188918590545654, "learning_rate": 2.5968085489016507e-05, "loss": 0.7709, "num_input_tokens_seen": 42012464, "step": 72365 }, { "epoch": 10.778969317843313, "grad_norm": 1.2648341655731201, "learning_rate": 2.5964838496566035e-05, "loss": 0.6239, "num_input_tokens_seen": 42015440, "step": 72370 }, { "epoch": 10.779714030384271, "grad_norm": 2.2130203247070312, "learning_rate": 2.596159148781559e-05, "loss": 0.5194, "num_input_tokens_seen": 42018320, "step": 72375 }, { "epoch": 10.78045874292523, "grad_norm": 1.0100609064102173, "learning_rate": 2.5958344462820045e-05, "loss": 0.6288, "num_input_tokens_seen": 42021136, "step": 72380 }, { "epoch": 10.78120345546619, "grad_norm": 1.2529362440109253, "learning_rate": 2.5955097421634244e-05, "loss": 0.687, "num_input_tokens_seen": 42024272, "step": 72385 }, { "epoch": 10.78194816800715, "grad_norm": 1.2494010925292969, "learning_rate": 2.5951850364313036e-05, "loss": 0.5227, "num_input_tokens_seen": 42026960, "step": 72390 }, { "epoch": 10.782692880548108, "grad_norm": 0.9628458619117737, "learning_rate": 2.5948603290911283e-05, "loss": 0.5528, "num_input_tokens_seen": 42030096, "step": 72395 }, { "epoch": 10.783437593089067, "grad_norm": 1.284579873085022, "learning_rate": 2.594535620148384e-05, "loss": 0.5697, "num_input_tokens_seen": 42032880, "step": 72400 }, { "epoch": 10.784182305630027, "grad_norm": 1.105674147605896, "learning_rate": 2.5942109096085566e-05, "loss": 0.5719, "num_input_tokens_seen": 42035696, "step": 72405 }, { "epoch": 10.784927018170986, "grad_norm": 0.8535409569740295, "learning_rate": 2.5938861974771316e-05, "loss": 0.6708, "num_input_tokens_seen": 42038352, "step": 72410 }, { "epoch": 10.785671730711945, "grad_norm": 1.693847417831421, "learning_rate": 2.5935614837595958e-05, "loss": 0.6714, "num_input_tokens_seen": 42041168, "step": 72415 }, { "epoch": 10.786416443252904, "grad_norm": 1.3738622665405273, "learning_rate": 2.5932367684614328e-05, "loss": 0.4991, "num_input_tokens_seen": 42044016, "step": 72420 }, { "epoch": 10.787161155793864, "grad_norm": 1.722522497177124, "learning_rate": 2.59291205158813e-05, "loss": 0.5216, "num_input_tokens_seen": 42046800, "step": 72425 }, { "epoch": 10.787905868334823, "grad_norm": 1.3920185565948486, "learning_rate": 2.5925873331451715e-05, "loss": 0.4967, "num_input_tokens_seen": 42049456, "step": 72430 }, { "epoch": 10.788650580875782, "grad_norm": 1.536606788635254, "learning_rate": 2.5922626131380444e-05, "loss": 0.4082, "num_input_tokens_seen": 42052240, "step": 72435 }, { "epoch": 10.78939529341674, "grad_norm": 2.248798131942749, "learning_rate": 2.5919378915722347e-05, "loss": 0.7583, "num_input_tokens_seen": 42055024, "step": 72440 }, { "epoch": 10.790140005957701, "grad_norm": 1.466657280921936, "learning_rate": 2.5916131684532274e-05, "loss": 0.5538, "num_input_tokens_seen": 42057648, "step": 72445 }, { "epoch": 10.79088471849866, "grad_norm": 1.7212779521942139, "learning_rate": 2.5912884437865093e-05, "loss": 0.622, "num_input_tokens_seen": 42060752, "step": 72450 }, { "epoch": 10.791629431039619, "grad_norm": 1.929505705833435, "learning_rate": 2.5909637175775652e-05, "loss": 0.524, "num_input_tokens_seen": 42063728, "step": 72455 }, { "epoch": 10.792374143580577, "grad_norm": 1.6440796852111816, "learning_rate": 2.5906389898318817e-05, "loss": 0.612, "num_input_tokens_seen": 42066352, "step": 72460 }, { "epoch": 10.793118856121538, "grad_norm": 0.9684421420097351, "learning_rate": 2.5903142605549445e-05, "loss": 0.4998, "num_input_tokens_seen": 42069328, "step": 72465 }, { "epoch": 10.793863568662497, "grad_norm": 1.3105831146240234, "learning_rate": 2.58998952975224e-05, "loss": 0.5158, "num_input_tokens_seen": 42071984, "step": 72470 }, { "epoch": 10.794608281203455, "grad_norm": 1.0493756532669067, "learning_rate": 2.5896647974292533e-05, "loss": 0.5933, "num_input_tokens_seen": 42075280, "step": 72475 }, { "epoch": 10.795352993744414, "grad_norm": 1.5699959993362427, "learning_rate": 2.589340063591471e-05, "loss": 0.6404, "num_input_tokens_seen": 42078288, "step": 72480 }, { "epoch": 10.796097706285373, "grad_norm": 2.3371243476867676, "learning_rate": 2.5890153282443797e-05, "loss": 0.8322, "num_input_tokens_seen": 42081328, "step": 72485 }, { "epoch": 10.796842418826333, "grad_norm": 2.246865749359131, "learning_rate": 2.5886905913934643e-05, "loss": 0.6396, "num_input_tokens_seen": 42084400, "step": 72490 }, { "epoch": 10.797587131367292, "grad_norm": 0.9701111316680908, "learning_rate": 2.5883658530442117e-05, "loss": 0.6009, "num_input_tokens_seen": 42087440, "step": 72495 }, { "epoch": 10.798331843908251, "grad_norm": 1.786817193031311, "learning_rate": 2.5880411132021083e-05, "loss": 0.6246, "num_input_tokens_seen": 42090416, "step": 72500 }, { "epoch": 10.799076556449211, "grad_norm": 1.5790265798568726, "learning_rate": 2.5877163718726394e-05, "loss": 0.685, "num_input_tokens_seen": 42093200, "step": 72505 }, { "epoch": 10.79982126899017, "grad_norm": 1.7040003538131714, "learning_rate": 2.5873916290612915e-05, "loss": 0.5969, "num_input_tokens_seen": 42096304, "step": 72510 }, { "epoch": 10.800565981531129, "grad_norm": 1.7769440412521362, "learning_rate": 2.5870668847735512e-05, "loss": 0.6101, "num_input_tokens_seen": 42099472, "step": 72515 }, { "epoch": 10.801310694072088, "grad_norm": 1.2335612773895264, "learning_rate": 2.5867421390149037e-05, "loss": 0.696, "num_input_tokens_seen": 42102352, "step": 72520 }, { "epoch": 10.802055406613047, "grad_norm": 1.6704812049865723, "learning_rate": 2.5864173917908363e-05, "loss": 0.5302, "num_input_tokens_seen": 42105200, "step": 72525 }, { "epoch": 10.802800119154007, "grad_norm": 1.372450590133667, "learning_rate": 2.5860926431068344e-05, "loss": 0.6555, "num_input_tokens_seen": 42108240, "step": 72530 }, { "epoch": 10.803544831694966, "grad_norm": 1.677795648574829, "learning_rate": 2.5857678929683855e-05, "loss": 0.6311, "num_input_tokens_seen": 42110928, "step": 72535 }, { "epoch": 10.804289544235925, "grad_norm": 0.9277697205543518, "learning_rate": 2.585443141380975e-05, "loss": 0.6195, "num_input_tokens_seen": 42113776, "step": 72540 }, { "epoch": 10.805034256776883, "grad_norm": 1.117274284362793, "learning_rate": 2.5851183883500895e-05, "loss": 0.8279, "num_input_tokens_seen": 42116592, "step": 72545 }, { "epoch": 10.805778969317844, "grad_norm": 2.2095096111297607, "learning_rate": 2.5847936338812158e-05, "loss": 0.6077, "num_input_tokens_seen": 42119248, "step": 72550 }, { "epoch": 10.806523681858803, "grad_norm": 1.3520911931991577, "learning_rate": 2.5844688779798393e-05, "loss": 0.6295, "num_input_tokens_seen": 42122256, "step": 72555 }, { "epoch": 10.807268394399761, "grad_norm": 1.2111420631408691, "learning_rate": 2.5841441206514468e-05, "loss": 0.743, "num_input_tokens_seen": 42125168, "step": 72560 }, { "epoch": 10.80801310694072, "grad_norm": 2.850306987762451, "learning_rate": 2.583819361901525e-05, "loss": 0.8575, "num_input_tokens_seen": 42128016, "step": 72565 }, { "epoch": 10.80875781948168, "grad_norm": 1.212087869644165, "learning_rate": 2.5834946017355598e-05, "loss": 0.8426, "num_input_tokens_seen": 42131152, "step": 72570 }, { "epoch": 10.80950253202264, "grad_norm": 1.217397928237915, "learning_rate": 2.583169840159039e-05, "loss": 0.5783, "num_input_tokens_seen": 42134128, "step": 72575 }, { "epoch": 10.810247244563598, "grad_norm": 2.056764841079712, "learning_rate": 2.582845077177448e-05, "loss": 0.7292, "num_input_tokens_seen": 42136976, "step": 72580 }, { "epoch": 10.810991957104557, "grad_norm": 1.0272459983825684, "learning_rate": 2.5825203127962737e-05, "loss": 0.6394, "num_input_tokens_seen": 42140080, "step": 72585 }, { "epoch": 10.811736669645517, "grad_norm": 0.7893958687782288, "learning_rate": 2.582195547021003e-05, "loss": 0.5761, "num_input_tokens_seen": 42142608, "step": 72590 }, { "epoch": 10.812481382186476, "grad_norm": 1.2240797281265259, "learning_rate": 2.581870779857121e-05, "loss": 0.5415, "num_input_tokens_seen": 42145392, "step": 72595 }, { "epoch": 10.813226094727435, "grad_norm": 1.7831276655197144, "learning_rate": 2.581546011310116e-05, "loss": 0.623, "num_input_tokens_seen": 42148560, "step": 72600 }, { "epoch": 10.813970807268394, "grad_norm": 1.2608472108840942, "learning_rate": 2.5812212413854738e-05, "loss": 0.5089, "num_input_tokens_seen": 42151280, "step": 72605 }, { "epoch": 10.814715519809354, "grad_norm": 0.7278404235839844, "learning_rate": 2.5808964700886812e-05, "loss": 0.5097, "num_input_tokens_seen": 42154320, "step": 72610 }, { "epoch": 10.815460232350313, "grad_norm": 2.014909505844116, "learning_rate": 2.5805716974252257e-05, "loss": 0.7199, "num_input_tokens_seen": 42157264, "step": 72615 }, { "epoch": 10.816204944891272, "grad_norm": 2.015336275100708, "learning_rate": 2.5802469234005927e-05, "loss": 0.6379, "num_input_tokens_seen": 42160016, "step": 72620 }, { "epoch": 10.81694965743223, "grad_norm": 1.7842423915863037, "learning_rate": 2.57992214802027e-05, "loss": 0.5836, "num_input_tokens_seen": 42163088, "step": 72625 }, { "epoch": 10.817694369973191, "grad_norm": 1.5052601099014282, "learning_rate": 2.579597371289743e-05, "loss": 0.6799, "num_input_tokens_seen": 42166000, "step": 72630 }, { "epoch": 10.81843908251415, "grad_norm": 3.1290910243988037, "learning_rate": 2.5792725932144996e-05, "loss": 0.8462, "num_input_tokens_seen": 42168656, "step": 72635 }, { "epoch": 10.819183795055109, "grad_norm": 1.488186240196228, "learning_rate": 2.5789478138000262e-05, "loss": 0.6312, "num_input_tokens_seen": 42171472, "step": 72640 }, { "epoch": 10.819928507596067, "grad_norm": 1.321858286857605, "learning_rate": 2.5786230330518096e-05, "loss": 0.7358, "num_input_tokens_seen": 42174288, "step": 72645 }, { "epoch": 10.820673220137028, "grad_norm": 1.1078439950942993, "learning_rate": 2.5782982509753377e-05, "loss": 0.6608, "num_input_tokens_seen": 42177328, "step": 72650 }, { "epoch": 10.821417932677987, "grad_norm": 3.199718713760376, "learning_rate": 2.5779734675760957e-05, "loss": 0.5501, "num_input_tokens_seen": 42180080, "step": 72655 }, { "epoch": 10.822162645218945, "grad_norm": 1.335128664970398, "learning_rate": 2.5776486828595715e-05, "loss": 0.5259, "num_input_tokens_seen": 42182992, "step": 72660 }, { "epoch": 10.822907357759904, "grad_norm": 1.340855360031128, "learning_rate": 2.5773238968312514e-05, "loss": 0.4866, "num_input_tokens_seen": 42185968, "step": 72665 }, { "epoch": 10.823652070300863, "grad_norm": 2.150486946105957, "learning_rate": 2.5769991094966228e-05, "loss": 0.7297, "num_input_tokens_seen": 42188656, "step": 72670 }, { "epoch": 10.824396782841823, "grad_norm": 1.2706350088119507, "learning_rate": 2.5766743208611726e-05, "loss": 0.5674, "num_input_tokens_seen": 42191696, "step": 72675 }, { "epoch": 10.825141495382782, "grad_norm": 1.3514846563339233, "learning_rate": 2.576349530930388e-05, "loss": 0.4631, "num_input_tokens_seen": 42194352, "step": 72680 }, { "epoch": 10.825886207923741, "grad_norm": 1.5474600791931152, "learning_rate": 2.5760247397097553e-05, "loss": 0.611, "num_input_tokens_seen": 42197520, "step": 72685 }, { "epoch": 10.826630920464702, "grad_norm": 1.4138072729110718, "learning_rate": 2.5756999472047617e-05, "loss": 0.5803, "num_input_tokens_seen": 42200176, "step": 72690 }, { "epoch": 10.82737563300566, "grad_norm": 1.795235276222229, "learning_rate": 2.575375153420894e-05, "loss": 0.7473, "num_input_tokens_seen": 42203024, "step": 72695 }, { "epoch": 10.828120345546619, "grad_norm": 1.163072943687439, "learning_rate": 2.5750503583636402e-05, "loss": 0.6259, "num_input_tokens_seen": 42205840, "step": 72700 }, { "epoch": 10.828865058087578, "grad_norm": 1.7463724613189697, "learning_rate": 2.5747255620384868e-05, "loss": 0.6117, "num_input_tokens_seen": 42208720, "step": 72705 }, { "epoch": 10.829609770628537, "grad_norm": 0.634519100189209, "learning_rate": 2.5744007644509215e-05, "loss": 0.5187, "num_input_tokens_seen": 42211600, "step": 72710 }, { "epoch": 10.830354483169497, "grad_norm": 1.6051356792449951, "learning_rate": 2.574075965606431e-05, "loss": 0.6635, "num_input_tokens_seen": 42214640, "step": 72715 }, { "epoch": 10.831099195710456, "grad_norm": 2.2842655181884766, "learning_rate": 2.5737511655105018e-05, "loss": 0.7661, "num_input_tokens_seen": 42217520, "step": 72720 }, { "epoch": 10.831843908251415, "grad_norm": 1.012678861618042, "learning_rate": 2.573426364168622e-05, "loss": 0.5424, "num_input_tokens_seen": 42220464, "step": 72725 }, { "epoch": 10.832588620792373, "grad_norm": 1.6829441785812378, "learning_rate": 2.5731015615862774e-05, "loss": 0.5716, "num_input_tokens_seen": 42223568, "step": 72730 }, { "epoch": 10.833333333333334, "grad_norm": 3.464604616165161, "learning_rate": 2.572776757768957e-05, "loss": 0.609, "num_input_tokens_seen": 42226288, "step": 72735 }, { "epoch": 10.834078045874293, "grad_norm": 0.6182753443717957, "learning_rate": 2.5724519527221468e-05, "loss": 0.4142, "num_input_tokens_seen": 42228880, "step": 72740 }, { "epoch": 10.834822758415251, "grad_norm": 1.5434149503707886, "learning_rate": 2.5721271464513354e-05, "loss": 0.5222, "num_input_tokens_seen": 42232016, "step": 72745 }, { "epoch": 10.83556747095621, "grad_norm": 1.159896731376648, "learning_rate": 2.571802338962009e-05, "loss": 0.6994, "num_input_tokens_seen": 42235024, "step": 72750 }, { "epoch": 10.83631218349717, "grad_norm": 0.8644822835922241, "learning_rate": 2.5714775302596545e-05, "loss": 0.4946, "num_input_tokens_seen": 42237936, "step": 72755 }, { "epoch": 10.83705689603813, "grad_norm": 1.6426678895950317, "learning_rate": 2.57115272034976e-05, "loss": 0.5362, "num_input_tokens_seen": 42240976, "step": 72760 }, { "epoch": 10.837801608579088, "grad_norm": 1.6092426776885986, "learning_rate": 2.5708279092378123e-05, "loss": 0.5703, "num_input_tokens_seen": 42243824, "step": 72765 }, { "epoch": 10.838546321120047, "grad_norm": 1.1009244918823242, "learning_rate": 2.5705030969292992e-05, "loss": 0.5556, "num_input_tokens_seen": 42246544, "step": 72770 }, { "epoch": 10.839291033661008, "grad_norm": 1.6385207176208496, "learning_rate": 2.5701782834297078e-05, "loss": 0.6504, "num_input_tokens_seen": 42249488, "step": 72775 }, { "epoch": 10.840035746201966, "grad_norm": 1.4596383571624756, "learning_rate": 2.5698534687445263e-05, "loss": 0.6374, "num_input_tokens_seen": 42252656, "step": 72780 }, { "epoch": 10.840780458742925, "grad_norm": 3.896718978881836, "learning_rate": 2.5695286528792413e-05, "loss": 0.5649, "num_input_tokens_seen": 42255408, "step": 72785 }, { "epoch": 10.841525171283884, "grad_norm": 1.2492024898529053, "learning_rate": 2.56920383583934e-05, "loss": 0.6893, "num_input_tokens_seen": 42258288, "step": 72790 }, { "epoch": 10.842269883824844, "grad_norm": 1.7215921878814697, "learning_rate": 2.56887901763031e-05, "loss": 0.6588, "num_input_tokens_seen": 42260880, "step": 72795 }, { "epoch": 10.843014596365803, "grad_norm": 2.4005074501037598, "learning_rate": 2.5685541982576395e-05, "loss": 0.5049, "num_input_tokens_seen": 42263664, "step": 72800 }, { "epoch": 10.843759308906762, "grad_norm": 2.3341593742370605, "learning_rate": 2.5682293777268153e-05, "loss": 0.6805, "num_input_tokens_seen": 42266672, "step": 72805 }, { "epoch": 10.84450402144772, "grad_norm": 2.2694199085235596, "learning_rate": 2.567904556043325e-05, "loss": 0.7766, "num_input_tokens_seen": 42269392, "step": 72810 }, { "epoch": 10.845248733988681, "grad_norm": 1.5986900329589844, "learning_rate": 2.5675797332126566e-05, "loss": 0.7703, "num_input_tokens_seen": 42272240, "step": 72815 }, { "epoch": 10.84599344652964, "grad_norm": 1.9160763025283813, "learning_rate": 2.567254909240297e-05, "loss": 0.7371, "num_input_tokens_seen": 42274992, "step": 72820 }, { "epoch": 10.846738159070599, "grad_norm": 0.7410143613815308, "learning_rate": 2.566930084131734e-05, "loss": 0.5356, "num_input_tokens_seen": 42278160, "step": 72825 }, { "epoch": 10.847482871611557, "grad_norm": 2.4906325340270996, "learning_rate": 2.566605257892456e-05, "loss": 0.5886, "num_input_tokens_seen": 42280880, "step": 72830 }, { "epoch": 10.848227584152518, "grad_norm": 1.4500577449798584, "learning_rate": 2.5662804305279485e-05, "loss": 0.6336, "num_input_tokens_seen": 42283728, "step": 72835 }, { "epoch": 10.848972296693477, "grad_norm": 1.6238961219787598, "learning_rate": 2.5659556020437015e-05, "loss": 0.6836, "num_input_tokens_seen": 42286416, "step": 72840 }, { "epoch": 10.849717009234435, "grad_norm": 1.5453977584838867, "learning_rate": 2.5656307724452016e-05, "loss": 0.76, "num_input_tokens_seen": 42289264, "step": 72845 }, { "epoch": 10.850461721775394, "grad_norm": 1.1297258138656616, "learning_rate": 2.565305941737936e-05, "loss": 0.5952, "num_input_tokens_seen": 42291824, "step": 72850 }, { "epoch": 10.851206434316353, "grad_norm": 0.7885382175445557, "learning_rate": 2.5649811099273935e-05, "loss": 0.6802, "num_input_tokens_seen": 42294448, "step": 72855 }, { "epoch": 10.851951146857314, "grad_norm": 0.8284671902656555, "learning_rate": 2.564656277019061e-05, "loss": 0.5865, "num_input_tokens_seen": 42297424, "step": 72860 }, { "epoch": 10.852695859398272, "grad_norm": 1.237326741218567, "learning_rate": 2.5643314430184257e-05, "loss": 0.5477, "num_input_tokens_seen": 42300272, "step": 72865 }, { "epoch": 10.853440571939231, "grad_norm": 1.2380752563476562, "learning_rate": 2.564006607930977e-05, "loss": 0.6586, "num_input_tokens_seen": 42303120, "step": 72870 }, { "epoch": 10.85418528448019, "grad_norm": 0.7764873504638672, "learning_rate": 2.5636817717622015e-05, "loss": 0.4892, "num_input_tokens_seen": 42306128, "step": 72875 }, { "epoch": 10.85492999702115, "grad_norm": 0.8786134123802185, "learning_rate": 2.5633569345175873e-05, "loss": 0.3263, "num_input_tokens_seen": 42309104, "step": 72880 }, { "epoch": 10.85567470956211, "grad_norm": 1.5649470090866089, "learning_rate": 2.5630320962026217e-05, "loss": 0.5743, "num_input_tokens_seen": 42312336, "step": 72885 }, { "epoch": 10.856419422103068, "grad_norm": 1.7608534097671509, "learning_rate": 2.5627072568227927e-05, "loss": 0.8846, "num_input_tokens_seen": 42315312, "step": 72890 }, { "epoch": 10.857164134644027, "grad_norm": 1.1377310752868652, "learning_rate": 2.5623824163835887e-05, "loss": 0.6402, "num_input_tokens_seen": 42318512, "step": 72895 }, { "epoch": 10.857908847184987, "grad_norm": 1.1774592399597168, "learning_rate": 2.5620575748904968e-05, "loss": 0.6454, "num_input_tokens_seen": 42321584, "step": 72900 }, { "epoch": 10.858653559725946, "grad_norm": 0.9050168395042419, "learning_rate": 2.5617327323490055e-05, "loss": 0.7078, "num_input_tokens_seen": 42324400, "step": 72905 }, { "epoch": 10.859398272266905, "grad_norm": 1.2406864166259766, "learning_rate": 2.5614078887646025e-05, "loss": 0.6536, "num_input_tokens_seen": 42327312, "step": 72910 }, { "epoch": 10.860142984807863, "grad_norm": 2.2371714115142822, "learning_rate": 2.5610830441427762e-05, "loss": 0.6176, "num_input_tokens_seen": 42330384, "step": 72915 }, { "epoch": 10.860887697348824, "grad_norm": 1.5512452125549316, "learning_rate": 2.5607581984890134e-05, "loss": 0.5676, "num_input_tokens_seen": 42333328, "step": 72920 }, { "epoch": 10.861632409889783, "grad_norm": 1.2921268939971924, "learning_rate": 2.5604333518088026e-05, "loss": 0.5234, "num_input_tokens_seen": 42336400, "step": 72925 }, { "epoch": 10.862377122430741, "grad_norm": 1.456870198249817, "learning_rate": 2.560108504107631e-05, "loss": 0.6258, "num_input_tokens_seen": 42339312, "step": 72930 }, { "epoch": 10.8631218349717, "grad_norm": 0.9946088194847107, "learning_rate": 2.5597836553909884e-05, "loss": 0.6205, "num_input_tokens_seen": 42342032, "step": 72935 }, { "epoch": 10.86386654751266, "grad_norm": 1.022814154624939, "learning_rate": 2.5594588056643608e-05, "loss": 0.6365, "num_input_tokens_seen": 42345072, "step": 72940 }, { "epoch": 10.86461126005362, "grad_norm": 1.925611138343811, "learning_rate": 2.5591339549332383e-05, "loss": 0.6333, "num_input_tokens_seen": 42347760, "step": 72945 }, { "epoch": 10.865355972594578, "grad_norm": 1.6828056573867798, "learning_rate": 2.5588091032031075e-05, "loss": 0.712, "num_input_tokens_seen": 42350608, "step": 72950 }, { "epoch": 10.866100685135537, "grad_norm": 1.8816879987716675, "learning_rate": 2.5584842504794558e-05, "loss": 0.5011, "num_input_tokens_seen": 42353968, "step": 72955 }, { "epoch": 10.866845397676498, "grad_norm": 1.9567190408706665, "learning_rate": 2.5581593967677724e-05, "loss": 0.6064, "num_input_tokens_seen": 42356720, "step": 72960 }, { "epoch": 10.867590110217456, "grad_norm": 1.142397403717041, "learning_rate": 2.557834542073545e-05, "loss": 0.4786, "num_input_tokens_seen": 42359440, "step": 72965 }, { "epoch": 10.868334822758415, "grad_norm": 1.6662322282791138, "learning_rate": 2.557509686402262e-05, "loss": 0.6292, "num_input_tokens_seen": 42362224, "step": 72970 }, { "epoch": 10.869079535299374, "grad_norm": 1.6233842372894287, "learning_rate": 2.5571848297594116e-05, "loss": 0.4887, "num_input_tokens_seen": 42365040, "step": 72975 }, { "epoch": 10.869824247840334, "grad_norm": 2.285116195678711, "learning_rate": 2.5568599721504814e-05, "loss": 0.5827, "num_input_tokens_seen": 42367952, "step": 72980 }, { "epoch": 10.870568960381293, "grad_norm": 0.9321961998939514, "learning_rate": 2.5565351135809597e-05, "loss": 0.509, "num_input_tokens_seen": 42370960, "step": 72985 }, { "epoch": 10.871313672922252, "grad_norm": 1.005430817604065, "learning_rate": 2.5562102540563355e-05, "loss": 0.4376, "num_input_tokens_seen": 42373968, "step": 72990 }, { "epoch": 10.87205838546321, "grad_norm": 2.4916670322418213, "learning_rate": 2.5558853935820948e-05, "loss": 0.7506, "num_input_tokens_seen": 42376752, "step": 72995 }, { "epoch": 10.872803098004171, "grad_norm": 1.1133701801300049, "learning_rate": 2.555560532163728e-05, "loss": 0.5448, "num_input_tokens_seen": 42379280, "step": 73000 }, { "epoch": 10.87354781054513, "grad_norm": 1.4576064348220825, "learning_rate": 2.555235669806722e-05, "loss": 0.767, "num_input_tokens_seen": 42381936, "step": 73005 }, { "epoch": 10.874292523086089, "grad_norm": 0.7874130010604858, "learning_rate": 2.554910806516566e-05, "loss": 0.5808, "num_input_tokens_seen": 42384624, "step": 73010 }, { "epoch": 10.875037235627047, "grad_norm": 1.8778576850891113, "learning_rate": 2.5545859422987478e-05, "loss": 0.6937, "num_input_tokens_seen": 42387376, "step": 73015 }, { "epoch": 10.875781948168008, "grad_norm": 0.8044260740280151, "learning_rate": 2.554261077158755e-05, "loss": 0.7964, "num_input_tokens_seen": 42389904, "step": 73020 }, { "epoch": 10.876526660708967, "grad_norm": 1.0693951845169067, "learning_rate": 2.5539362111020765e-05, "loss": 0.6095, "num_input_tokens_seen": 42392912, "step": 73025 }, { "epoch": 10.877271373249926, "grad_norm": 1.3535606861114502, "learning_rate": 2.5536113441342014e-05, "loss": 0.5367, "num_input_tokens_seen": 42395856, "step": 73030 }, { "epoch": 10.878016085790884, "grad_norm": 0.9606912136077881, "learning_rate": 2.5532864762606164e-05, "loss": 0.5132, "num_input_tokens_seen": 42398608, "step": 73035 }, { "epoch": 10.878760798331843, "grad_norm": 2.192082405090332, "learning_rate": 2.55296160748681e-05, "loss": 0.6956, "num_input_tokens_seen": 42401328, "step": 73040 }, { "epoch": 10.879505510872804, "grad_norm": 1.7790707349777222, "learning_rate": 2.5526367378182725e-05, "loss": 0.5617, "num_input_tokens_seen": 42404432, "step": 73045 }, { "epoch": 10.880250223413762, "grad_norm": 1.4779696464538574, "learning_rate": 2.55231186726049e-05, "loss": 0.5896, "num_input_tokens_seen": 42407280, "step": 73050 }, { "epoch": 10.880994935954721, "grad_norm": 1.783181071281433, "learning_rate": 2.5519869958189513e-05, "loss": 0.7398, "num_input_tokens_seen": 42410160, "step": 73055 }, { "epoch": 10.88173964849568, "grad_norm": 0.8626022934913635, "learning_rate": 2.5516621234991456e-05, "loss": 0.8449, "num_input_tokens_seen": 42413232, "step": 73060 }, { "epoch": 10.88248436103664, "grad_norm": 1.4360010623931885, "learning_rate": 2.551337250306561e-05, "loss": 0.5275, "num_input_tokens_seen": 42416080, "step": 73065 }, { "epoch": 10.8832290735776, "grad_norm": 1.1139546632766724, "learning_rate": 2.5510123762466853e-05, "loss": 0.4431, "num_input_tokens_seen": 42418928, "step": 73070 }, { "epoch": 10.883973786118558, "grad_norm": 2.257347345352173, "learning_rate": 2.5506875013250075e-05, "loss": 0.5705, "num_input_tokens_seen": 42421776, "step": 73075 }, { "epoch": 10.884718498659517, "grad_norm": 1.1585662364959717, "learning_rate": 2.5503626255470164e-05, "loss": 0.5956, "num_input_tokens_seen": 42424624, "step": 73080 }, { "epoch": 10.885463211200477, "grad_norm": 1.5363856554031372, "learning_rate": 2.5500377489181992e-05, "loss": 0.6376, "num_input_tokens_seen": 42427920, "step": 73085 }, { "epoch": 10.886207923741436, "grad_norm": 1.772926688194275, "learning_rate": 2.5497128714440456e-05, "loss": 0.5545, "num_input_tokens_seen": 42430768, "step": 73090 }, { "epoch": 10.886952636282395, "grad_norm": 1.4740239381790161, "learning_rate": 2.549387993130043e-05, "loss": 0.5141, "num_input_tokens_seen": 42433616, "step": 73095 }, { "epoch": 10.887697348823353, "grad_norm": 1.4474542140960693, "learning_rate": 2.5490631139816806e-05, "loss": 0.4656, "num_input_tokens_seen": 42436560, "step": 73100 }, { "epoch": 10.888442061364314, "grad_norm": 1.7950588464736938, "learning_rate": 2.548738234004447e-05, "loss": 0.6576, "num_input_tokens_seen": 42439216, "step": 73105 }, { "epoch": 10.889186773905273, "grad_norm": 2.0338141918182373, "learning_rate": 2.5484133532038307e-05, "loss": 0.7222, "num_input_tokens_seen": 42442704, "step": 73110 }, { "epoch": 10.889931486446232, "grad_norm": 1.5096898078918457, "learning_rate": 2.5480884715853197e-05, "loss": 0.5748, "num_input_tokens_seen": 42445680, "step": 73115 }, { "epoch": 10.89067619898719, "grad_norm": 1.518705129623413, "learning_rate": 2.547763589154403e-05, "loss": 0.5191, "num_input_tokens_seen": 42448432, "step": 73120 }, { "epoch": 10.89142091152815, "grad_norm": 1.1251492500305176, "learning_rate": 2.5474387059165687e-05, "loss": 0.6265, "num_input_tokens_seen": 42451248, "step": 73125 }, { "epoch": 10.89216562406911, "grad_norm": 1.3049297332763672, "learning_rate": 2.547113821877306e-05, "loss": 0.5797, "num_input_tokens_seen": 42454000, "step": 73130 }, { "epoch": 10.892910336610068, "grad_norm": 2.7153775691986084, "learning_rate": 2.5467889370421027e-05, "loss": 0.6347, "num_input_tokens_seen": 42456816, "step": 73135 }, { "epoch": 10.893655049151027, "grad_norm": 1.2965924739837646, "learning_rate": 2.546464051416448e-05, "loss": 0.5699, "num_input_tokens_seen": 42459536, "step": 73140 }, { "epoch": 10.894399761691988, "grad_norm": 1.468912959098816, "learning_rate": 2.5461391650058307e-05, "loss": 0.6135, "num_input_tokens_seen": 42462416, "step": 73145 }, { "epoch": 10.895144474232946, "grad_norm": 0.5375168919563293, "learning_rate": 2.5458142778157396e-05, "loss": 0.5424, "num_input_tokens_seen": 42465328, "step": 73150 }, { "epoch": 10.895889186773905, "grad_norm": 2.361314296722412, "learning_rate": 2.545489389851662e-05, "loss": 0.64, "num_input_tokens_seen": 42468080, "step": 73155 }, { "epoch": 10.896633899314864, "grad_norm": 1.6257662773132324, "learning_rate": 2.5451645011190872e-05, "loss": 0.6925, "num_input_tokens_seen": 42470864, "step": 73160 }, { "epoch": 10.897378611855824, "grad_norm": 1.7613279819488525, "learning_rate": 2.5448396116235046e-05, "loss": 0.7125, "num_input_tokens_seen": 42473968, "step": 73165 }, { "epoch": 10.898123324396783, "grad_norm": 0.8871791362762451, "learning_rate": 2.5445147213704017e-05, "loss": 0.595, "num_input_tokens_seen": 42476656, "step": 73170 }, { "epoch": 10.898868036937742, "grad_norm": 1.987434983253479, "learning_rate": 2.5441898303652688e-05, "loss": 0.595, "num_input_tokens_seen": 42479472, "step": 73175 }, { "epoch": 10.8996127494787, "grad_norm": 1.176340937614441, "learning_rate": 2.5438649386135932e-05, "loss": 0.678, "num_input_tokens_seen": 42482096, "step": 73180 }, { "epoch": 10.90035746201966, "grad_norm": 3.6155591011047363, "learning_rate": 2.5435400461208637e-05, "loss": 0.6971, "num_input_tokens_seen": 42485104, "step": 73185 }, { "epoch": 10.90110217456062, "grad_norm": 1.2292757034301758, "learning_rate": 2.5432151528925702e-05, "loss": 0.7034, "num_input_tokens_seen": 42488240, "step": 73190 }, { "epoch": 10.901846887101579, "grad_norm": 1.5178680419921875, "learning_rate": 2.5428902589341996e-05, "loss": 0.7457, "num_input_tokens_seen": 42491312, "step": 73195 }, { "epoch": 10.902591599642538, "grad_norm": 1.3935117721557617, "learning_rate": 2.542565364251242e-05, "loss": 0.6965, "num_input_tokens_seen": 42494288, "step": 73200 }, { "epoch": 10.903336312183498, "grad_norm": 2.6881959438323975, "learning_rate": 2.542240468849186e-05, "loss": 0.6554, "num_input_tokens_seen": 42497040, "step": 73205 }, { "epoch": 10.904081024724457, "grad_norm": 0.9934018850326538, "learning_rate": 2.5419155727335204e-05, "loss": 0.3959, "num_input_tokens_seen": 42499952, "step": 73210 }, { "epoch": 10.904825737265416, "grad_norm": 3.0750672817230225, "learning_rate": 2.5415906759097336e-05, "loss": 0.6518, "num_input_tokens_seen": 42502640, "step": 73215 }, { "epoch": 10.905570449806374, "grad_norm": 1.8372713327407837, "learning_rate": 2.5412657783833143e-05, "loss": 0.623, "num_input_tokens_seen": 42505584, "step": 73220 }, { "epoch": 10.906315162347333, "grad_norm": 2.5269899368286133, "learning_rate": 2.5409408801597517e-05, "loss": 0.6138, "num_input_tokens_seen": 42508432, "step": 73225 }, { "epoch": 10.907059874888294, "grad_norm": 2.6553688049316406, "learning_rate": 2.540615981244535e-05, "loss": 0.588, "num_input_tokens_seen": 42511440, "step": 73230 }, { "epoch": 10.907804587429252, "grad_norm": 1.286481499671936, "learning_rate": 2.5402910816431525e-05, "loss": 0.4782, "num_input_tokens_seen": 42514192, "step": 73235 }, { "epoch": 10.908549299970211, "grad_norm": 1.0254480838775635, "learning_rate": 2.5399661813610925e-05, "loss": 0.5683, "num_input_tokens_seen": 42516848, "step": 73240 }, { "epoch": 10.90929401251117, "grad_norm": 1.2758021354675293, "learning_rate": 2.5396412804038455e-05, "loss": 0.6195, "num_input_tokens_seen": 42519920, "step": 73245 }, { "epoch": 10.91003872505213, "grad_norm": 1.3863693475723267, "learning_rate": 2.5393163787768988e-05, "loss": 0.5742, "num_input_tokens_seen": 42522576, "step": 73250 }, { "epoch": 10.91078343759309, "grad_norm": 1.1464594602584839, "learning_rate": 2.5389914764857413e-05, "loss": 0.6517, "num_input_tokens_seen": 42525424, "step": 73255 }, { "epoch": 10.911528150134048, "grad_norm": 1.2804279327392578, "learning_rate": 2.538666573535863e-05, "loss": 0.6112, "num_input_tokens_seen": 42528240, "step": 73260 }, { "epoch": 10.912272862675007, "grad_norm": 1.5699013471603394, "learning_rate": 2.5383416699327524e-05, "loss": 0.5569, "num_input_tokens_seen": 42531152, "step": 73265 }, { "epoch": 10.913017575215967, "grad_norm": 1.4888144731521606, "learning_rate": 2.5380167656818978e-05, "loss": 0.5901, "num_input_tokens_seen": 42533808, "step": 73270 }, { "epoch": 10.913762287756926, "grad_norm": 4.078337669372559, "learning_rate": 2.537691860788789e-05, "loss": 0.7972, "num_input_tokens_seen": 42536624, "step": 73275 }, { "epoch": 10.914507000297885, "grad_norm": 1.6830605268478394, "learning_rate": 2.5373669552589146e-05, "loss": 0.6251, "num_input_tokens_seen": 42539472, "step": 73280 }, { "epoch": 10.915251712838844, "grad_norm": 1.1340856552124023, "learning_rate": 2.537042049097763e-05, "loss": 0.6804, "num_input_tokens_seen": 42542320, "step": 73285 }, { "epoch": 10.915996425379804, "grad_norm": 1.096993088722229, "learning_rate": 2.5367171423108238e-05, "loss": 0.5072, "num_input_tokens_seen": 42545040, "step": 73290 }, { "epoch": 10.916741137920763, "grad_norm": 1.3440767526626587, "learning_rate": 2.5363922349035857e-05, "loss": 0.5127, "num_input_tokens_seen": 42548016, "step": 73295 }, { "epoch": 10.917485850461722, "grad_norm": 2.1806368827819824, "learning_rate": 2.5360673268815378e-05, "loss": 0.6343, "num_input_tokens_seen": 42550736, "step": 73300 }, { "epoch": 10.91823056300268, "grad_norm": 0.9916164875030518, "learning_rate": 2.535742418250169e-05, "loss": 0.5454, "num_input_tokens_seen": 42553520, "step": 73305 }, { "epoch": 10.918975275543641, "grad_norm": 0.956791341304779, "learning_rate": 2.535417509014969e-05, "loss": 0.4344, "num_input_tokens_seen": 42556528, "step": 73310 }, { "epoch": 10.9197199880846, "grad_norm": 1.159846544265747, "learning_rate": 2.5350925991814263e-05, "loss": 0.6955, "num_input_tokens_seen": 42559440, "step": 73315 }, { "epoch": 10.920464700625558, "grad_norm": 1.219255805015564, "learning_rate": 2.5347676887550286e-05, "loss": 0.8435, "num_input_tokens_seen": 42562320, "step": 73320 }, { "epoch": 10.921209413166517, "grad_norm": 1.591773509979248, "learning_rate": 2.534442777741267e-05, "loss": 0.6451, "num_input_tokens_seen": 42564976, "step": 73325 }, { "epoch": 10.921954125707478, "grad_norm": 1.1754608154296875, "learning_rate": 2.5341178661456293e-05, "loss": 0.7484, "num_input_tokens_seen": 42568112, "step": 73330 }, { "epoch": 10.922698838248436, "grad_norm": 1.134918212890625, "learning_rate": 2.533792953973605e-05, "loss": 0.6997, "num_input_tokens_seen": 42571088, "step": 73335 }, { "epoch": 10.923443550789395, "grad_norm": 1.474452257156372, "learning_rate": 2.533468041230683e-05, "loss": 0.467, "num_input_tokens_seen": 42573904, "step": 73340 }, { "epoch": 10.924188263330354, "grad_norm": 1.0625066757202148, "learning_rate": 2.5331431279223528e-05, "loss": 0.4113, "num_input_tokens_seen": 42577360, "step": 73345 }, { "epoch": 10.924932975871315, "grad_norm": 1.5204874277114868, "learning_rate": 2.5328182140541028e-05, "loss": 0.5429, "num_input_tokens_seen": 42580464, "step": 73350 }, { "epoch": 10.925677688412273, "grad_norm": 1.1549437046051025, "learning_rate": 2.5324932996314233e-05, "loss": 0.6836, "num_input_tokens_seen": 42583376, "step": 73355 }, { "epoch": 10.926422400953232, "grad_norm": 0.7498301267623901, "learning_rate": 2.5321683846598015e-05, "loss": 0.5398, "num_input_tokens_seen": 42586672, "step": 73360 }, { "epoch": 10.92716711349419, "grad_norm": 1.303409457206726, "learning_rate": 2.531843469144728e-05, "loss": 0.6415, "num_input_tokens_seen": 42589648, "step": 73365 }, { "epoch": 10.92791182603515, "grad_norm": 0.9829881191253662, "learning_rate": 2.5315185530916907e-05, "loss": 0.4584, "num_input_tokens_seen": 42592496, "step": 73370 }, { "epoch": 10.92865653857611, "grad_norm": 1.2578024864196777, "learning_rate": 2.5311936365061804e-05, "loss": 0.4967, "num_input_tokens_seen": 42595504, "step": 73375 }, { "epoch": 10.929401251117069, "grad_norm": 1.0358995199203491, "learning_rate": 2.530868719393685e-05, "loss": 0.5623, "num_input_tokens_seen": 42598640, "step": 73380 }, { "epoch": 10.930145963658028, "grad_norm": 1.3083845376968384, "learning_rate": 2.5305438017596937e-05, "loss": 0.721, "num_input_tokens_seen": 42601424, "step": 73385 }, { "epoch": 10.930890676198988, "grad_norm": 1.7135063409805298, "learning_rate": 2.5302188836096963e-05, "loss": 0.6704, "num_input_tokens_seen": 42603952, "step": 73390 }, { "epoch": 10.931635388739947, "grad_norm": 3.6780998706817627, "learning_rate": 2.5298939649491816e-05, "loss": 0.7533, "num_input_tokens_seen": 42606672, "step": 73395 }, { "epoch": 10.932380101280906, "grad_norm": 1.6930429935455322, "learning_rate": 2.5295690457836384e-05, "loss": 0.564, "num_input_tokens_seen": 42609968, "step": 73400 }, { "epoch": 10.933124813821864, "grad_norm": 1.1986358165740967, "learning_rate": 2.529244126118556e-05, "loss": 0.6271, "num_input_tokens_seen": 42612752, "step": 73405 }, { "epoch": 10.933869526362823, "grad_norm": 0.991054356098175, "learning_rate": 2.5289192059594253e-05, "loss": 0.5694, "num_input_tokens_seen": 42615536, "step": 73410 }, { "epoch": 10.934614238903784, "grad_norm": 0.9920824766159058, "learning_rate": 2.5285942853117327e-05, "loss": 0.5977, "num_input_tokens_seen": 42618224, "step": 73415 }, { "epoch": 10.935358951444742, "grad_norm": 2.281421422958374, "learning_rate": 2.5282693641809683e-05, "loss": 0.6676, "num_input_tokens_seen": 42621168, "step": 73420 }, { "epoch": 10.936103663985701, "grad_norm": 1.702487826347351, "learning_rate": 2.5279444425726228e-05, "loss": 0.3753, "num_input_tokens_seen": 42623856, "step": 73425 }, { "epoch": 10.93684837652666, "grad_norm": 1.2551591396331787, "learning_rate": 2.5276195204921837e-05, "loss": 0.7744, "num_input_tokens_seen": 42626576, "step": 73430 }, { "epoch": 10.93759308906762, "grad_norm": 1.474900484085083, "learning_rate": 2.5272945979451413e-05, "loss": 0.5545, "num_input_tokens_seen": 42629680, "step": 73435 }, { "epoch": 10.93833780160858, "grad_norm": 1.6034610271453857, "learning_rate": 2.5269696749369844e-05, "loss": 0.4815, "num_input_tokens_seen": 42632400, "step": 73440 }, { "epoch": 10.939082514149538, "grad_norm": 1.2503526210784912, "learning_rate": 2.5266447514732023e-05, "loss": 0.4306, "num_input_tokens_seen": 42635184, "step": 73445 }, { "epoch": 10.939827226690497, "grad_norm": 1.7347705364227295, "learning_rate": 2.5263198275592835e-05, "loss": 0.6129, "num_input_tokens_seen": 42638256, "step": 73450 }, { "epoch": 10.940571939231457, "grad_norm": 1.6101441383361816, "learning_rate": 2.5259949032007186e-05, "loss": 0.6188, "num_input_tokens_seen": 42640976, "step": 73455 }, { "epoch": 10.941316651772416, "grad_norm": 1.0577458143234253, "learning_rate": 2.5256699784029958e-05, "loss": 0.5425, "num_input_tokens_seen": 42643792, "step": 73460 }, { "epoch": 10.942061364313375, "grad_norm": 1.5871018171310425, "learning_rate": 2.525345053171605e-05, "loss": 0.7252, "num_input_tokens_seen": 42646768, "step": 73465 }, { "epoch": 10.942806076854334, "grad_norm": 1.3287808895111084, "learning_rate": 2.525020127512035e-05, "loss": 0.5598, "num_input_tokens_seen": 42649744, "step": 73470 }, { "epoch": 10.943550789395294, "grad_norm": 1.3630924224853516, "learning_rate": 2.524695201429776e-05, "loss": 0.631, "num_input_tokens_seen": 42652336, "step": 73475 }, { "epoch": 10.944295501936253, "grad_norm": 1.6728979349136353, "learning_rate": 2.5243702749303173e-05, "loss": 0.6964, "num_input_tokens_seen": 42655280, "step": 73480 }, { "epoch": 10.945040214477212, "grad_norm": 2.8710827827453613, "learning_rate": 2.5240453480191463e-05, "loss": 0.5786, "num_input_tokens_seen": 42657840, "step": 73485 }, { "epoch": 10.94578492701817, "grad_norm": 1.5360172986984253, "learning_rate": 2.5237204207017533e-05, "loss": 0.8127, "num_input_tokens_seen": 42660912, "step": 73490 }, { "epoch": 10.946529639559131, "grad_norm": 1.3367794752120972, "learning_rate": 2.523395492983629e-05, "loss": 0.6003, "num_input_tokens_seen": 42664112, "step": 73495 }, { "epoch": 10.94727435210009, "grad_norm": 1.4015555381774902, "learning_rate": 2.5230705648702608e-05, "loss": 0.4197, "num_input_tokens_seen": 42666832, "step": 73500 }, { "epoch": 10.948019064641048, "grad_norm": 1.3513944149017334, "learning_rate": 2.52274563636714e-05, "loss": 0.5668, "num_input_tokens_seen": 42669712, "step": 73505 }, { "epoch": 10.948763777182007, "grad_norm": 1.1206145286560059, "learning_rate": 2.5224207074797533e-05, "loss": 0.5984, "num_input_tokens_seen": 42672656, "step": 73510 }, { "epoch": 10.949508489722968, "grad_norm": 0.9183616638183594, "learning_rate": 2.522095778213593e-05, "loss": 0.4366, "num_input_tokens_seen": 42675408, "step": 73515 }, { "epoch": 10.950253202263927, "grad_norm": 2.7088797092437744, "learning_rate": 2.5217708485741458e-05, "loss": 0.5137, "num_input_tokens_seen": 42678512, "step": 73520 }, { "epoch": 10.950997914804885, "grad_norm": 0.9425503015518188, "learning_rate": 2.5214459185669028e-05, "loss": 0.5867, "num_input_tokens_seen": 42681200, "step": 73525 }, { "epoch": 10.951742627345844, "grad_norm": 1.6698743104934692, "learning_rate": 2.5211209881973525e-05, "loss": 0.5851, "num_input_tokens_seen": 42684880, "step": 73530 }, { "epoch": 10.952487339886805, "grad_norm": 3.1834657192230225, "learning_rate": 2.5207960574709843e-05, "loss": 0.605, "num_input_tokens_seen": 42687888, "step": 73535 }, { "epoch": 10.953232052427763, "grad_norm": 1.7007976770401, "learning_rate": 2.520471126393289e-05, "loss": 0.6248, "num_input_tokens_seen": 42690768, "step": 73540 }, { "epoch": 10.953976764968722, "grad_norm": 1.2642604112625122, "learning_rate": 2.5201461949697534e-05, "loss": 0.5777, "num_input_tokens_seen": 42693616, "step": 73545 }, { "epoch": 10.95472147750968, "grad_norm": 0.9958037734031677, "learning_rate": 2.5198212632058694e-05, "loss": 0.4916, "num_input_tokens_seen": 42696592, "step": 73550 }, { "epoch": 10.95546619005064, "grad_norm": 1.7766488790512085, "learning_rate": 2.519496331107125e-05, "loss": 0.6109, "num_input_tokens_seen": 42699472, "step": 73555 }, { "epoch": 10.9562109025916, "grad_norm": 1.0514419078826904, "learning_rate": 2.51917139867901e-05, "loss": 0.6821, "num_input_tokens_seen": 42702416, "step": 73560 }, { "epoch": 10.956955615132559, "grad_norm": 1.7997660636901855, "learning_rate": 2.5188464659270133e-05, "loss": 0.7251, "num_input_tokens_seen": 42705520, "step": 73565 }, { "epoch": 10.957700327673518, "grad_norm": 1.4179953336715698, "learning_rate": 2.5185215328566247e-05, "loss": 0.5833, "num_input_tokens_seen": 42708368, "step": 73570 }, { "epoch": 10.958445040214476, "grad_norm": 1.6062370538711548, "learning_rate": 2.5181965994733343e-05, "loss": 0.8081, "num_input_tokens_seen": 42711248, "step": 73575 }, { "epoch": 10.959189752755437, "grad_norm": 0.9989087581634521, "learning_rate": 2.5178716657826302e-05, "loss": 0.6213, "num_input_tokens_seen": 42714256, "step": 73580 }, { "epoch": 10.959934465296396, "grad_norm": 1.7468976974487305, "learning_rate": 2.5175467317900026e-05, "loss": 0.8443, "num_input_tokens_seen": 42717328, "step": 73585 }, { "epoch": 10.960679177837354, "grad_norm": 1.2861558198928833, "learning_rate": 2.517221797500941e-05, "loss": 0.571, "num_input_tokens_seen": 42720528, "step": 73590 }, { "epoch": 10.961423890378313, "grad_norm": 1.0807594060897827, "learning_rate": 2.516896862920935e-05, "loss": 0.5384, "num_input_tokens_seen": 42723472, "step": 73595 }, { "epoch": 10.962168602919274, "grad_norm": 1.0475225448608398, "learning_rate": 2.5165719280554728e-05, "loss": 0.5014, "num_input_tokens_seen": 42726576, "step": 73600 }, { "epoch": 10.962913315460233, "grad_norm": 1.1465719938278198, "learning_rate": 2.5162469929100452e-05, "loss": 0.5145, "num_input_tokens_seen": 42729872, "step": 73605 }, { "epoch": 10.963658028001191, "grad_norm": 2.2864787578582764, "learning_rate": 2.5159220574901417e-05, "loss": 0.799, "num_input_tokens_seen": 42732720, "step": 73610 }, { "epoch": 10.96440274054215, "grad_norm": 1.8320499658584595, "learning_rate": 2.5155971218012503e-05, "loss": 0.5995, "num_input_tokens_seen": 42735600, "step": 73615 }, { "epoch": 10.96514745308311, "grad_norm": 1.0755711793899536, "learning_rate": 2.5152721858488615e-05, "loss": 0.6841, "num_input_tokens_seen": 42738576, "step": 73620 }, { "epoch": 10.96589216562407, "grad_norm": 1.6724885702133179, "learning_rate": 2.5149472496384645e-05, "loss": 0.5006, "num_input_tokens_seen": 42741328, "step": 73625 }, { "epoch": 10.966636878165028, "grad_norm": 1.3177191019058228, "learning_rate": 2.5146223131755493e-05, "loss": 0.5486, "num_input_tokens_seen": 42744336, "step": 73630 }, { "epoch": 10.967381590705987, "grad_norm": 0.8657515645027161, "learning_rate": 2.514297376465605e-05, "loss": 0.5357, "num_input_tokens_seen": 42747280, "step": 73635 }, { "epoch": 10.968126303246947, "grad_norm": 0.8453801870346069, "learning_rate": 2.5139724395141207e-05, "loss": 0.6838, "num_input_tokens_seen": 42750512, "step": 73640 }, { "epoch": 10.968871015787906, "grad_norm": 1.8248142004013062, "learning_rate": 2.513647502326587e-05, "loss": 0.7132, "num_input_tokens_seen": 42753296, "step": 73645 }, { "epoch": 10.969615728328865, "grad_norm": 2.0376434326171875, "learning_rate": 2.513322564908492e-05, "loss": 0.8039, "num_input_tokens_seen": 42756016, "step": 73650 }, { "epoch": 10.970360440869824, "grad_norm": 1.5825016498565674, "learning_rate": 2.512997627265326e-05, "loss": 0.6382, "num_input_tokens_seen": 42759088, "step": 73655 }, { "epoch": 10.971105153410784, "grad_norm": 1.7878695726394653, "learning_rate": 2.5126726894025782e-05, "loss": 0.5835, "num_input_tokens_seen": 42761968, "step": 73660 }, { "epoch": 10.971849865951743, "grad_norm": 1.379188895225525, "learning_rate": 2.5123477513257376e-05, "loss": 0.644, "num_input_tokens_seen": 42764624, "step": 73665 }, { "epoch": 10.972594578492702, "grad_norm": 1.3157416582107544, "learning_rate": 2.5120228130402955e-05, "loss": 0.6007, "num_input_tokens_seen": 42767536, "step": 73670 }, { "epoch": 10.97333929103366, "grad_norm": 1.588383436203003, "learning_rate": 2.5116978745517394e-05, "loss": 0.6588, "num_input_tokens_seen": 42770480, "step": 73675 }, { "epoch": 10.974084003574621, "grad_norm": 1.0273997783660889, "learning_rate": 2.5113729358655602e-05, "loss": 0.5158, "num_input_tokens_seen": 42773360, "step": 73680 }, { "epoch": 10.97482871611558, "grad_norm": 1.6279423236846924, "learning_rate": 2.5110479969872463e-05, "loss": 0.6157, "num_input_tokens_seen": 42776112, "step": 73685 }, { "epoch": 10.975573428656539, "grad_norm": 0.9950658082962036, "learning_rate": 2.510723057922288e-05, "loss": 0.5335, "num_input_tokens_seen": 42779216, "step": 73690 }, { "epoch": 10.976318141197497, "grad_norm": 1.4604277610778809, "learning_rate": 2.510398118676174e-05, "loss": 0.5607, "num_input_tokens_seen": 42782288, "step": 73695 }, { "epoch": 10.977062853738456, "grad_norm": 1.1723731756210327, "learning_rate": 2.5100731792543948e-05, "loss": 0.6493, "num_input_tokens_seen": 42785072, "step": 73700 }, { "epoch": 10.977807566279417, "grad_norm": 1.7629626989364624, "learning_rate": 2.5097482396624393e-05, "loss": 0.7901, "num_input_tokens_seen": 42787888, "step": 73705 }, { "epoch": 10.978552278820375, "grad_norm": 1.270757794380188, "learning_rate": 2.5094232999057975e-05, "loss": 0.824, "num_input_tokens_seen": 42791024, "step": 73710 }, { "epoch": 10.979296991361334, "grad_norm": 1.045031189918518, "learning_rate": 2.5090983599899587e-05, "loss": 0.545, "num_input_tokens_seen": 42794032, "step": 73715 }, { "epoch": 10.980041703902295, "grad_norm": 1.371235728263855, "learning_rate": 2.508773419920412e-05, "loss": 0.6685, "num_input_tokens_seen": 42796688, "step": 73720 }, { "epoch": 10.980786416443253, "grad_norm": 2.17189884185791, "learning_rate": 2.508448479702647e-05, "loss": 0.5634, "num_input_tokens_seen": 42799760, "step": 73725 }, { "epoch": 10.981531128984212, "grad_norm": 0.967147946357727, "learning_rate": 2.5081235393421537e-05, "loss": 0.4326, "num_input_tokens_seen": 42802576, "step": 73730 }, { "epoch": 10.982275841525171, "grad_norm": 2.011202335357666, "learning_rate": 2.507798598844422e-05, "loss": 0.6098, "num_input_tokens_seen": 42805456, "step": 73735 }, { "epoch": 10.98302055406613, "grad_norm": 1.0006153583526611, "learning_rate": 2.5074736582149405e-05, "loss": 0.5365, "num_input_tokens_seen": 42808336, "step": 73740 }, { "epoch": 10.98376526660709, "grad_norm": 1.1177377700805664, "learning_rate": 2.507148717459199e-05, "loss": 0.5147, "num_input_tokens_seen": 42811376, "step": 73745 }, { "epoch": 10.984509979148049, "grad_norm": 1.1321017742156982, "learning_rate": 2.5068237765826875e-05, "loss": 0.4642, "num_input_tokens_seen": 42814352, "step": 73750 }, { "epoch": 10.985254691689008, "grad_norm": 1.7910408973693848, "learning_rate": 2.5064988355908952e-05, "loss": 0.6725, "num_input_tokens_seen": 42817168, "step": 73755 }, { "epoch": 10.985999404229966, "grad_norm": 2.634780168533325, "learning_rate": 2.5061738944893115e-05, "loss": 0.8149, "num_input_tokens_seen": 42819952, "step": 73760 }, { "epoch": 10.986744116770927, "grad_norm": 1.6095885038375854, "learning_rate": 2.5058489532834262e-05, "loss": 0.6224, "num_input_tokens_seen": 42823056, "step": 73765 }, { "epoch": 10.987488829311886, "grad_norm": 0.7894454002380371, "learning_rate": 2.5055240119787287e-05, "loss": 0.5285, "num_input_tokens_seen": 42826000, "step": 73770 }, { "epoch": 10.988233541852845, "grad_norm": 1.4128366708755493, "learning_rate": 2.5051990705807092e-05, "loss": 0.577, "num_input_tokens_seen": 42828912, "step": 73775 }, { "epoch": 10.988978254393803, "grad_norm": 1.6207867860794067, "learning_rate": 2.504874129094856e-05, "loss": 0.541, "num_input_tokens_seen": 42832016, "step": 73780 }, { "epoch": 10.989722966934764, "grad_norm": 2.531301498413086, "learning_rate": 2.504549187526659e-05, "loss": 0.8708, "num_input_tokens_seen": 42834736, "step": 73785 }, { "epoch": 10.990467679475723, "grad_norm": 1.1207162141799927, "learning_rate": 2.504224245881609e-05, "loss": 0.5829, "num_input_tokens_seen": 42837776, "step": 73790 }, { "epoch": 10.991212392016681, "grad_norm": 1.245693564414978, "learning_rate": 2.5038993041651947e-05, "loss": 0.5124, "num_input_tokens_seen": 42840400, "step": 73795 }, { "epoch": 10.99195710455764, "grad_norm": 1.61470365524292, "learning_rate": 2.503574362382905e-05, "loss": 0.6003, "num_input_tokens_seen": 42843408, "step": 73800 }, { "epoch": 10.9927018170986, "grad_norm": 1.3334745168685913, "learning_rate": 2.5032494205402303e-05, "loss": 0.6462, "num_input_tokens_seen": 42846384, "step": 73805 }, { "epoch": 10.99344652963956, "grad_norm": 1.569512128829956, "learning_rate": 2.5029244786426603e-05, "loss": 0.5674, "num_input_tokens_seen": 42849264, "step": 73810 }, { "epoch": 10.994191242180518, "grad_norm": 1.898802399635315, "learning_rate": 2.5025995366956835e-05, "loss": 0.6646, "num_input_tokens_seen": 42852080, "step": 73815 }, { "epoch": 10.994935954721477, "grad_norm": 0.8296959400177002, "learning_rate": 2.5022745947047904e-05, "loss": 0.5881, "num_input_tokens_seen": 42855504, "step": 73820 }, { "epoch": 10.995680667262437, "grad_norm": 1.2038456201553345, "learning_rate": 2.5019496526754705e-05, "loss": 0.7222, "num_input_tokens_seen": 42858832, "step": 73825 }, { "epoch": 10.996425379803396, "grad_norm": 1.0401538610458374, "learning_rate": 2.501624710613213e-05, "loss": 0.5879, "num_input_tokens_seen": 42861712, "step": 73830 }, { "epoch": 10.997170092344355, "grad_norm": 1.9633623361587524, "learning_rate": 2.501299768523508e-05, "loss": 0.7328, "num_input_tokens_seen": 42864368, "step": 73835 }, { "epoch": 10.997914804885314, "grad_norm": 1.939515233039856, "learning_rate": 2.5009748264118442e-05, "loss": 0.6142, "num_input_tokens_seen": 42867184, "step": 73840 }, { "epoch": 10.998659517426274, "grad_norm": 1.6644967794418335, "learning_rate": 2.500649884283713e-05, "loss": 0.5521, "num_input_tokens_seen": 42870352, "step": 73845 }, { "epoch": 10.999404229967233, "grad_norm": 1.218471646308899, "learning_rate": 2.5003249421446012e-05, "loss": 0.5661, "num_input_tokens_seen": 42873520, "step": 73850 }, { "epoch": 11.0, "eval_loss": 0.6528652310371399, "eval_runtime": 46.968, "eval_samples_per_second": 63.533, "eval_steps_per_second": 15.883, "num_input_tokens_seen": 42875264, "step": 73854 }, { "epoch": 11.000148942508192, "grad_norm": 0.9741465449333191, "learning_rate": 2.5e-05, "loss": 0.608, "num_input_tokens_seen": 42875808, "step": 73855 }, { "epoch": 11.00089365504915, "grad_norm": 1.1860252618789673, "learning_rate": 2.4996750578553997e-05, "loss": 0.4027, "num_input_tokens_seen": 42878944, "step": 73860 }, { "epoch": 11.001638367590111, "grad_norm": 1.5507651567459106, "learning_rate": 2.499350115716288e-05, "loss": 0.7155, "num_input_tokens_seen": 42881696, "step": 73865 }, { "epoch": 11.00238308013107, "grad_norm": 2.011387825012207, "learning_rate": 2.4990251735881563e-05, "loss": 0.5721, "num_input_tokens_seen": 42884480, "step": 73870 }, { "epoch": 11.003127792672029, "grad_norm": 1.6578062772750854, "learning_rate": 2.4987002314764926e-05, "loss": 0.5846, "num_input_tokens_seen": 42887488, "step": 73875 }, { "epoch": 11.003872505212987, "grad_norm": 1.5973210334777832, "learning_rate": 2.4983752893867877e-05, "loss": 0.6703, "num_input_tokens_seen": 42890112, "step": 73880 }, { "epoch": 11.004617217753948, "grad_norm": 1.3926918506622314, "learning_rate": 2.4980503473245298e-05, "loss": 0.6656, "num_input_tokens_seen": 42893248, "step": 73885 }, { "epoch": 11.005361930294907, "grad_norm": 2.074385404586792, "learning_rate": 2.4977254052952102e-05, "loss": 0.8969, "num_input_tokens_seen": 42895744, "step": 73890 }, { "epoch": 11.006106642835865, "grad_norm": 1.71793532371521, "learning_rate": 2.4974004633043168e-05, "loss": 0.5595, "num_input_tokens_seen": 42898528, "step": 73895 }, { "epoch": 11.006851355376824, "grad_norm": 1.2168550491333008, "learning_rate": 2.4970755213573403e-05, "loss": 0.5468, "num_input_tokens_seen": 42901600, "step": 73900 }, { "epoch": 11.007596067917783, "grad_norm": 1.3078850507736206, "learning_rate": 2.4967505794597703e-05, "loss": 0.6355, "num_input_tokens_seen": 42904768, "step": 73905 }, { "epoch": 11.008340780458743, "grad_norm": 0.7544187903404236, "learning_rate": 2.4964256376170954e-05, "loss": 0.5625, "num_input_tokens_seen": 42907584, "step": 73910 }, { "epoch": 11.009085492999702, "grad_norm": 1.6232893466949463, "learning_rate": 2.4961006958348066e-05, "loss": 0.6213, "num_input_tokens_seen": 42910848, "step": 73915 }, { "epoch": 11.009830205540661, "grad_norm": 1.346933364868164, "learning_rate": 2.495775754118391e-05, "loss": 0.453, "num_input_tokens_seen": 42913984, "step": 73920 }, { "epoch": 11.01057491808162, "grad_norm": 0.9541075825691223, "learning_rate": 2.4954508124733413e-05, "loss": 0.6711, "num_input_tokens_seen": 42917376, "step": 73925 }, { "epoch": 11.01131963062258, "grad_norm": 1.3998204469680786, "learning_rate": 2.495125870905144e-05, "loss": 0.5567, "num_input_tokens_seen": 42920448, "step": 73930 }, { "epoch": 11.012064343163539, "grad_norm": 1.4335052967071533, "learning_rate": 2.4948009294192913e-05, "loss": 0.5193, "num_input_tokens_seen": 42923584, "step": 73935 }, { "epoch": 11.012809055704498, "grad_norm": 1.8057479858398438, "learning_rate": 2.494475988021272e-05, "loss": 0.7324, "num_input_tokens_seen": 42926144, "step": 73940 }, { "epoch": 11.013553768245457, "grad_norm": 1.4766714572906494, "learning_rate": 2.4941510467165744e-05, "loss": 0.6654, "num_input_tokens_seen": 42929216, "step": 73945 }, { "epoch": 11.014298480786417, "grad_norm": 1.9539321660995483, "learning_rate": 2.4938261055106894e-05, "loss": 0.5257, "num_input_tokens_seen": 42932192, "step": 73950 }, { "epoch": 11.015043193327376, "grad_norm": 1.3756294250488281, "learning_rate": 2.493501164409105e-05, "loss": 0.5038, "num_input_tokens_seen": 42935104, "step": 73955 }, { "epoch": 11.015787905868335, "grad_norm": 2.3689026832580566, "learning_rate": 2.493176223417313e-05, "loss": 0.5282, "num_input_tokens_seen": 42937920, "step": 73960 }, { "epoch": 11.016532618409293, "grad_norm": 1.1931949853897095, "learning_rate": 2.4928512825408006e-05, "loss": 0.4775, "num_input_tokens_seen": 42940608, "step": 73965 }, { "epoch": 11.017277330950254, "grad_norm": 1.9593381881713867, "learning_rate": 2.4925263417850598e-05, "loss": 0.5658, "num_input_tokens_seen": 42943840, "step": 73970 }, { "epoch": 11.018022043491213, "grad_norm": 2.0164005756378174, "learning_rate": 2.4922014011555784e-05, "loss": 0.5148, "num_input_tokens_seen": 42946592, "step": 73975 }, { "epoch": 11.018766756032171, "grad_norm": 0.964975118637085, "learning_rate": 2.4918764606578465e-05, "loss": 0.6388, "num_input_tokens_seen": 42949760, "step": 73980 }, { "epoch": 11.01951146857313, "grad_norm": 2.8681390285491943, "learning_rate": 2.491551520297354e-05, "loss": 0.691, "num_input_tokens_seen": 42952864, "step": 73985 }, { "epoch": 11.02025618111409, "grad_norm": 1.401566505432129, "learning_rate": 2.4912265800795885e-05, "loss": 0.4638, "num_input_tokens_seen": 42955616, "step": 73990 }, { "epoch": 11.02100089365505, "grad_norm": 1.2014720439910889, "learning_rate": 2.4909016400100423e-05, "loss": 0.5145, "num_input_tokens_seen": 42958560, "step": 73995 }, { "epoch": 11.021745606196008, "grad_norm": 1.2079520225524902, "learning_rate": 2.490576700094203e-05, "loss": 0.5718, "num_input_tokens_seen": 42961344, "step": 74000 }, { "epoch": 11.022490318736967, "grad_norm": 2.02494740486145, "learning_rate": 2.490251760337561e-05, "loss": 0.6465, "num_input_tokens_seen": 42964192, "step": 74005 }, { "epoch": 11.023235031277927, "grad_norm": 1.864249348640442, "learning_rate": 2.4899268207456055e-05, "loss": 0.6909, "num_input_tokens_seen": 42967040, "step": 74010 }, { "epoch": 11.023979743818886, "grad_norm": 1.038955569267273, "learning_rate": 2.4896018813238263e-05, "loss": 0.489, "num_input_tokens_seen": 42970208, "step": 74015 }, { "epoch": 11.024724456359845, "grad_norm": 1.8041691780090332, "learning_rate": 2.4892769420777134e-05, "loss": 0.6731, "num_input_tokens_seen": 42973024, "step": 74020 }, { "epoch": 11.025469168900804, "grad_norm": 2.3010857105255127, "learning_rate": 2.4889520030127543e-05, "loss": 0.6119, "num_input_tokens_seen": 42976000, "step": 74025 }, { "epoch": 11.026213881441764, "grad_norm": 0.7979864478111267, "learning_rate": 2.488627064134441e-05, "loss": 0.5506, "num_input_tokens_seen": 42979040, "step": 74030 }, { "epoch": 11.026958593982723, "grad_norm": 1.153939962387085, "learning_rate": 2.4883021254482612e-05, "loss": 0.7044, "num_input_tokens_seen": 42981888, "step": 74035 }, { "epoch": 11.027703306523682, "grad_norm": 2.0173308849334717, "learning_rate": 2.487977186959705e-05, "loss": 0.6252, "num_input_tokens_seen": 42984672, "step": 74040 }, { "epoch": 11.02844801906464, "grad_norm": 1.5045578479766846, "learning_rate": 2.487652248674262e-05, "loss": 0.7436, "num_input_tokens_seen": 42987616, "step": 74045 }, { "epoch": 11.029192731605601, "grad_norm": 2.0360660552978516, "learning_rate": 2.4873273105974227e-05, "loss": 0.5409, "num_input_tokens_seen": 42990496, "step": 74050 }, { "epoch": 11.02993744414656, "grad_norm": 1.4580295085906982, "learning_rate": 2.487002372734674e-05, "loss": 0.6218, "num_input_tokens_seen": 42993280, "step": 74055 }, { "epoch": 11.030682156687519, "grad_norm": 1.3294544219970703, "learning_rate": 2.4866774350915084e-05, "loss": 0.4621, "num_input_tokens_seen": 42996352, "step": 74060 }, { "epoch": 11.031426869228477, "grad_norm": 1.3596736192703247, "learning_rate": 2.486352497673414e-05, "loss": 0.5707, "num_input_tokens_seen": 42999072, "step": 74065 }, { "epoch": 11.032171581769436, "grad_norm": 1.55498468875885, "learning_rate": 2.4860275604858796e-05, "loss": 0.5255, "num_input_tokens_seen": 43001824, "step": 74070 }, { "epoch": 11.032916294310397, "grad_norm": 1.4206748008728027, "learning_rate": 2.485702623534396e-05, "loss": 0.4781, "num_input_tokens_seen": 43004544, "step": 74075 }, { "epoch": 11.033661006851355, "grad_norm": 1.1343919038772583, "learning_rate": 2.485377686824451e-05, "loss": 0.7, "num_input_tokens_seen": 43007584, "step": 74080 }, { "epoch": 11.034405719392314, "grad_norm": 1.9865237474441528, "learning_rate": 2.485052750361536e-05, "loss": 0.6569, "num_input_tokens_seen": 43010208, "step": 74085 }, { "epoch": 11.035150431933273, "grad_norm": 1.1713696718215942, "learning_rate": 2.4847278141511387e-05, "loss": 0.5693, "num_input_tokens_seen": 43013216, "step": 74090 }, { "epoch": 11.035895144474233, "grad_norm": 1.5534616708755493, "learning_rate": 2.4844028781987506e-05, "loss": 0.5453, "num_input_tokens_seen": 43016096, "step": 74095 }, { "epoch": 11.036639857015192, "grad_norm": 1.3168916702270508, "learning_rate": 2.48407794250986e-05, "loss": 0.5829, "num_input_tokens_seen": 43018528, "step": 74100 }, { "epoch": 11.037384569556151, "grad_norm": 1.1177196502685547, "learning_rate": 2.4837530070899557e-05, "loss": 0.5472, "num_input_tokens_seen": 43021408, "step": 74105 }, { "epoch": 11.03812928209711, "grad_norm": 1.0103323459625244, "learning_rate": 2.483428071944528e-05, "loss": 0.5921, "num_input_tokens_seen": 43024032, "step": 74110 }, { "epoch": 11.03887399463807, "grad_norm": 1.9017033576965332, "learning_rate": 2.483103137079066e-05, "loss": 0.7629, "num_input_tokens_seen": 43026880, "step": 74115 }, { "epoch": 11.039618707179029, "grad_norm": 1.84364652633667, "learning_rate": 2.4827782024990596e-05, "loss": 0.6946, "num_input_tokens_seen": 43029952, "step": 74120 }, { "epoch": 11.040363419719988, "grad_norm": 1.743022084236145, "learning_rate": 2.4824532682099973e-05, "loss": 0.6842, "num_input_tokens_seen": 43032800, "step": 74125 }, { "epoch": 11.041108132260947, "grad_norm": 1.1785500049591064, "learning_rate": 2.48212833421737e-05, "loss": 0.5846, "num_input_tokens_seen": 43035648, "step": 74130 }, { "epoch": 11.041852844801907, "grad_norm": 1.6318873167037964, "learning_rate": 2.4818034005266663e-05, "loss": 0.8856, "num_input_tokens_seen": 43038368, "step": 74135 }, { "epoch": 11.042597557342866, "grad_norm": 1.0797384977340698, "learning_rate": 2.481478467143376e-05, "loss": 0.5803, "num_input_tokens_seen": 43041056, "step": 74140 }, { "epoch": 11.043342269883825, "grad_norm": 2.0493109226226807, "learning_rate": 2.4811535340729876e-05, "loss": 0.5569, "num_input_tokens_seen": 43043872, "step": 74145 }, { "epoch": 11.044086982424783, "grad_norm": 1.3837991952896118, "learning_rate": 2.4808286013209905e-05, "loss": 0.6496, "num_input_tokens_seen": 43046912, "step": 74150 }, { "epoch": 11.044831694965744, "grad_norm": 0.6303198337554932, "learning_rate": 2.4805036688928758e-05, "loss": 0.6846, "num_input_tokens_seen": 43049600, "step": 74155 }, { "epoch": 11.045576407506703, "grad_norm": 1.8642749786376953, "learning_rate": 2.4801787367941305e-05, "loss": 0.649, "num_input_tokens_seen": 43052512, "step": 74160 }, { "epoch": 11.046321120047661, "grad_norm": 1.3733491897583008, "learning_rate": 2.4798538050302468e-05, "loss": 0.5682, "num_input_tokens_seen": 43055744, "step": 74165 }, { "epoch": 11.04706583258862, "grad_norm": 3.2694814205169678, "learning_rate": 2.4795288736067118e-05, "loss": 0.676, "num_input_tokens_seen": 43058976, "step": 74170 }, { "epoch": 11.04781054512958, "grad_norm": 2.3698549270629883, "learning_rate": 2.4792039425290163e-05, "loss": 0.5681, "num_input_tokens_seen": 43062112, "step": 74175 }, { "epoch": 11.04855525767054, "grad_norm": 1.3185811042785645, "learning_rate": 2.4788790118026487e-05, "loss": 0.5446, "num_input_tokens_seen": 43064992, "step": 74180 }, { "epoch": 11.049299970211498, "grad_norm": 1.5360075235366821, "learning_rate": 2.4785540814330978e-05, "loss": 0.6277, "num_input_tokens_seen": 43067872, "step": 74185 }, { "epoch": 11.050044682752457, "grad_norm": 1.7654187679290771, "learning_rate": 2.478229151425855e-05, "loss": 0.5772, "num_input_tokens_seen": 43070752, "step": 74190 }, { "epoch": 11.050789395293418, "grad_norm": 1.3317290544509888, "learning_rate": 2.4779042217864077e-05, "loss": 0.5026, "num_input_tokens_seen": 43073568, "step": 74195 }, { "epoch": 11.051534107834376, "grad_norm": 1.4353924989700317, "learning_rate": 2.477579292520247e-05, "loss": 0.6781, "num_input_tokens_seen": 43076320, "step": 74200 }, { "epoch": 11.052278820375335, "grad_norm": 1.2709107398986816, "learning_rate": 2.477254363632861e-05, "loss": 0.4821, "num_input_tokens_seen": 43078976, "step": 74205 }, { "epoch": 11.053023532916294, "grad_norm": 1.316003441810608, "learning_rate": 2.4769294351297398e-05, "loss": 0.5587, "num_input_tokens_seen": 43081696, "step": 74210 }, { "epoch": 11.053768245457254, "grad_norm": 2.665255308151245, "learning_rate": 2.4766045070163713e-05, "loss": 0.6486, "num_input_tokens_seen": 43084768, "step": 74215 }, { "epoch": 11.054512957998213, "grad_norm": 1.1369630098342896, "learning_rate": 2.476279579298247e-05, "loss": 0.604, "num_input_tokens_seen": 43087520, "step": 74220 }, { "epoch": 11.055257670539172, "grad_norm": 0.8554942607879639, "learning_rate": 2.475954651980855e-05, "loss": 0.6019, "num_input_tokens_seen": 43090080, "step": 74225 }, { "epoch": 11.05600238308013, "grad_norm": 1.070935845375061, "learning_rate": 2.4756297250696837e-05, "loss": 0.6969, "num_input_tokens_seen": 43092960, "step": 74230 }, { "epoch": 11.056747095621091, "grad_norm": 1.450115442276001, "learning_rate": 2.4753047985702243e-05, "loss": 0.5957, "num_input_tokens_seen": 43095584, "step": 74235 }, { "epoch": 11.05749180816205, "grad_norm": 1.2926855087280273, "learning_rate": 2.474979872487965e-05, "loss": 0.5478, "num_input_tokens_seen": 43098336, "step": 74240 }, { "epoch": 11.058236520703009, "grad_norm": 1.5003901720046997, "learning_rate": 2.474654946828396e-05, "loss": 0.5981, "num_input_tokens_seen": 43101088, "step": 74245 }, { "epoch": 11.058981233243967, "grad_norm": 2.157846689224243, "learning_rate": 2.474330021597004e-05, "loss": 0.5363, "num_input_tokens_seen": 43103968, "step": 74250 }, { "epoch": 11.059725945784926, "grad_norm": 2.0672848224639893, "learning_rate": 2.474005096799282e-05, "loss": 0.4508, "num_input_tokens_seen": 43106592, "step": 74255 }, { "epoch": 11.060470658325887, "grad_norm": 1.1891578435897827, "learning_rate": 2.4736801724407174e-05, "loss": 0.7411, "num_input_tokens_seen": 43109280, "step": 74260 }, { "epoch": 11.061215370866845, "grad_norm": 2.7610175609588623, "learning_rate": 2.4733552485267983e-05, "loss": 0.8796, "num_input_tokens_seen": 43112000, "step": 74265 }, { "epoch": 11.061960083407804, "grad_norm": 2.0327234268188477, "learning_rate": 2.4730303250630165e-05, "loss": 0.6289, "num_input_tokens_seen": 43114848, "step": 74270 }, { "epoch": 11.062704795948763, "grad_norm": 2.19053053855896, "learning_rate": 2.4727054020548592e-05, "loss": 0.6426, "num_input_tokens_seen": 43117920, "step": 74275 }, { "epoch": 11.063449508489724, "grad_norm": 1.737709641456604, "learning_rate": 2.4723804795078172e-05, "loss": 0.6577, "num_input_tokens_seen": 43120544, "step": 74280 }, { "epoch": 11.064194221030682, "grad_norm": 1.2226080894470215, "learning_rate": 2.4720555574273775e-05, "loss": 0.6299, "num_input_tokens_seen": 43123520, "step": 74285 }, { "epoch": 11.064938933571641, "grad_norm": 1.345332145690918, "learning_rate": 2.471730635819032e-05, "loss": 0.6702, "num_input_tokens_seen": 43126592, "step": 74290 }, { "epoch": 11.0656836461126, "grad_norm": 1.2864186763763428, "learning_rate": 2.4714057146882676e-05, "loss": 0.5936, "num_input_tokens_seen": 43129600, "step": 74295 }, { "epoch": 11.06642835865356, "grad_norm": 1.1598306894302368, "learning_rate": 2.4710807940405756e-05, "loss": 0.5783, "num_input_tokens_seen": 43132384, "step": 74300 }, { "epoch": 11.067173071194519, "grad_norm": 1.2969897985458374, "learning_rate": 2.470755873881444e-05, "loss": 0.5612, "num_input_tokens_seen": 43134944, "step": 74305 }, { "epoch": 11.067917783735478, "grad_norm": 1.616378903388977, "learning_rate": 2.470430954216362e-05, "loss": 0.6362, "num_input_tokens_seen": 43137856, "step": 74310 }, { "epoch": 11.068662496276437, "grad_norm": 1.7746254205703735, "learning_rate": 2.4701060350508194e-05, "loss": 0.5547, "num_input_tokens_seen": 43140512, "step": 74315 }, { "epoch": 11.069407208817397, "grad_norm": 2.2092292308807373, "learning_rate": 2.4697811163903036e-05, "loss": 0.8216, "num_input_tokens_seen": 43143200, "step": 74320 }, { "epoch": 11.070151921358356, "grad_norm": 2.793271064758301, "learning_rate": 2.4694561982403065e-05, "loss": 0.6881, "num_input_tokens_seen": 43146208, "step": 74325 }, { "epoch": 11.070896633899315, "grad_norm": 1.1685569286346436, "learning_rate": 2.4691312806063154e-05, "loss": 0.7275, "num_input_tokens_seen": 43149056, "step": 74330 }, { "epoch": 11.071641346440273, "grad_norm": 1.4373854398727417, "learning_rate": 2.4688063634938198e-05, "loss": 0.5995, "num_input_tokens_seen": 43151744, "step": 74335 }, { "epoch": 11.072386058981234, "grad_norm": 2.239945888519287, "learning_rate": 2.46848144690831e-05, "loss": 0.5597, "num_input_tokens_seen": 43155040, "step": 74340 }, { "epoch": 11.073130771522193, "grad_norm": 1.9921252727508545, "learning_rate": 2.468156530855273e-05, "loss": 0.6396, "num_input_tokens_seen": 43158144, "step": 74345 }, { "epoch": 11.073875484063151, "grad_norm": 1.6298165321350098, "learning_rate": 2.4678316153401994e-05, "loss": 0.499, "num_input_tokens_seen": 43161056, "step": 74350 }, { "epoch": 11.07462019660411, "grad_norm": 2.522714138031006, "learning_rate": 2.4675067003685776e-05, "loss": 0.604, "num_input_tokens_seen": 43164288, "step": 74355 }, { "epoch": 11.07536490914507, "grad_norm": 2.22727108001709, "learning_rate": 2.4671817859458974e-05, "loss": 0.6827, "num_input_tokens_seen": 43166848, "step": 74360 }, { "epoch": 11.07610962168603, "grad_norm": 1.7743209600448608, "learning_rate": 2.4668568720776478e-05, "loss": 0.6433, "num_input_tokens_seen": 43169600, "step": 74365 }, { "epoch": 11.076854334226988, "grad_norm": 2.0363903045654297, "learning_rate": 2.466531958769317e-05, "loss": 0.6605, "num_input_tokens_seen": 43172288, "step": 74370 }, { "epoch": 11.077599046767947, "grad_norm": 1.3034026622772217, "learning_rate": 2.466207046026395e-05, "loss": 0.5225, "num_input_tokens_seen": 43175168, "step": 74375 }, { "epoch": 11.078343759308908, "grad_norm": 0.8292586207389832, "learning_rate": 2.4658821338543713e-05, "loss": 0.4625, "num_input_tokens_seen": 43178304, "step": 74380 }, { "epoch": 11.079088471849866, "grad_norm": 1.7402969598770142, "learning_rate": 2.465557222258734e-05, "loss": 0.6845, "num_input_tokens_seen": 43181184, "step": 74385 }, { "epoch": 11.079833184390825, "grad_norm": 1.2851637601852417, "learning_rate": 2.4652323112449716e-05, "loss": 0.5199, "num_input_tokens_seen": 43184064, "step": 74390 }, { "epoch": 11.080577896931784, "grad_norm": 1.0380443334579468, "learning_rate": 2.464907400818575e-05, "loss": 0.6164, "num_input_tokens_seen": 43186976, "step": 74395 }, { "epoch": 11.081322609472744, "grad_norm": 1.368019461631775, "learning_rate": 2.4645824909850316e-05, "loss": 0.4974, "num_input_tokens_seen": 43189888, "step": 74400 }, { "epoch": 11.082067322013703, "grad_norm": 1.6105031967163086, "learning_rate": 2.4642575817498313e-05, "loss": 0.6106, "num_input_tokens_seen": 43192896, "step": 74405 }, { "epoch": 11.082812034554662, "grad_norm": 2.300555467605591, "learning_rate": 2.463932673118462e-05, "loss": 0.6831, "num_input_tokens_seen": 43195936, "step": 74410 }, { "epoch": 11.08355674709562, "grad_norm": 1.3608973026275635, "learning_rate": 2.463607765096415e-05, "loss": 0.7294, "num_input_tokens_seen": 43198496, "step": 74415 }, { "epoch": 11.08430145963658, "grad_norm": 1.2215052843093872, "learning_rate": 2.4632828576891774e-05, "loss": 0.6085, "num_input_tokens_seen": 43201344, "step": 74420 }, { "epoch": 11.08504617217754, "grad_norm": 2.662788152694702, "learning_rate": 2.4629579509022374e-05, "loss": 0.5672, "num_input_tokens_seen": 43204192, "step": 74425 }, { "epoch": 11.085790884718499, "grad_norm": 0.7007113099098206, "learning_rate": 2.4626330447410864e-05, "loss": 0.6549, "num_input_tokens_seen": 43207168, "step": 74430 }, { "epoch": 11.086535597259457, "grad_norm": 1.3843941688537598, "learning_rate": 2.4623081392112117e-05, "loss": 0.439, "num_input_tokens_seen": 43210464, "step": 74435 }, { "epoch": 11.087280309800416, "grad_norm": 1.0241090059280396, "learning_rate": 2.4619832343181028e-05, "loss": 0.5983, "num_input_tokens_seen": 43213344, "step": 74440 }, { "epoch": 11.088025022341377, "grad_norm": 1.2576265335083008, "learning_rate": 2.461658330067248e-05, "loss": 0.588, "num_input_tokens_seen": 43216096, "step": 74445 }, { "epoch": 11.088769734882336, "grad_norm": 3.057762384414673, "learning_rate": 2.4613334264641373e-05, "loss": 0.6843, "num_input_tokens_seen": 43219136, "step": 74450 }, { "epoch": 11.089514447423294, "grad_norm": 3.1165220737457275, "learning_rate": 2.4610085235142586e-05, "loss": 0.6881, "num_input_tokens_seen": 43222304, "step": 74455 }, { "epoch": 11.090259159964253, "grad_norm": 1.3024914264678955, "learning_rate": 2.4606836212231018e-05, "loss": 0.6255, "num_input_tokens_seen": 43224960, "step": 74460 }, { "epoch": 11.091003872505214, "grad_norm": 3.271467447280884, "learning_rate": 2.4603587195961554e-05, "loss": 0.9784, "num_input_tokens_seen": 43227936, "step": 74465 }, { "epoch": 11.091748585046172, "grad_norm": 0.8141810894012451, "learning_rate": 2.460033818638908e-05, "loss": 0.5831, "num_input_tokens_seen": 43230816, "step": 74470 }, { "epoch": 11.092493297587131, "grad_norm": 1.3727115392684937, "learning_rate": 2.4597089183568488e-05, "loss": 0.7462, "num_input_tokens_seen": 43233504, "step": 74475 }, { "epoch": 11.09323801012809, "grad_norm": 1.6172221899032593, "learning_rate": 2.4593840187554654e-05, "loss": 0.6157, "num_input_tokens_seen": 43236096, "step": 74480 }, { "epoch": 11.09398272266905, "grad_norm": 0.821042001247406, "learning_rate": 2.459059119840249e-05, "loss": 0.7701, "num_input_tokens_seen": 43239104, "step": 74485 }, { "epoch": 11.09472743521001, "grad_norm": 1.4375946521759033, "learning_rate": 2.4587342216166856e-05, "loss": 0.6346, "num_input_tokens_seen": 43242016, "step": 74490 }, { "epoch": 11.095472147750968, "grad_norm": 0.9267085194587708, "learning_rate": 2.458409324090267e-05, "loss": 0.5381, "num_input_tokens_seen": 43244864, "step": 74495 }, { "epoch": 11.096216860291927, "grad_norm": 0.9855858683586121, "learning_rate": 2.45808442726648e-05, "loss": 0.7674, "num_input_tokens_seen": 43247904, "step": 74500 }, { "epoch": 11.096961572832887, "grad_norm": 2.366316318511963, "learning_rate": 2.4577595311508143e-05, "loss": 0.6382, "num_input_tokens_seen": 43250848, "step": 74505 }, { "epoch": 11.097706285373846, "grad_norm": 1.2338916063308716, "learning_rate": 2.4574346357487588e-05, "loss": 0.44, "num_input_tokens_seen": 43253504, "step": 74510 }, { "epoch": 11.098450997914805, "grad_norm": 1.7538886070251465, "learning_rate": 2.4571097410658006e-05, "loss": 0.7217, "num_input_tokens_seen": 43256384, "step": 74515 }, { "epoch": 11.099195710455763, "grad_norm": 1.5558003187179565, "learning_rate": 2.4567848471074307e-05, "loss": 0.5797, "num_input_tokens_seen": 43259040, "step": 74520 }, { "epoch": 11.099940422996724, "grad_norm": 1.2869654893875122, "learning_rate": 2.4564599538791362e-05, "loss": 0.5111, "num_input_tokens_seen": 43263104, "step": 74525 }, { "epoch": 11.100685135537683, "grad_norm": 1.3357458114624023, "learning_rate": 2.4561350613864074e-05, "loss": 0.6437, "num_input_tokens_seen": 43265920, "step": 74530 }, { "epoch": 11.101429848078642, "grad_norm": 1.5370829105377197, "learning_rate": 2.4558101696347315e-05, "loss": 0.5548, "num_input_tokens_seen": 43269024, "step": 74535 }, { "epoch": 11.1021745606196, "grad_norm": 1.8681719303131104, "learning_rate": 2.4554852786295985e-05, "loss": 0.7924, "num_input_tokens_seen": 43271744, "step": 74540 }, { "epoch": 11.10291927316056, "grad_norm": 1.8747048377990723, "learning_rate": 2.4551603883764963e-05, "loss": 0.5731, "num_input_tokens_seen": 43274624, "step": 74545 }, { "epoch": 11.10366398570152, "grad_norm": 1.1297404766082764, "learning_rate": 2.454835498880913e-05, "loss": 0.5595, "num_input_tokens_seen": 43277472, "step": 74550 }, { "epoch": 11.104408698242478, "grad_norm": 1.3971847295761108, "learning_rate": 2.454510610148339e-05, "loss": 0.6508, "num_input_tokens_seen": 43280704, "step": 74555 }, { "epoch": 11.105153410783437, "grad_norm": 1.821987271308899, "learning_rate": 2.454185722184261e-05, "loss": 0.5696, "num_input_tokens_seen": 43283456, "step": 74560 }, { "epoch": 11.105898123324398, "grad_norm": 1.0189882516860962, "learning_rate": 2.4538608349941695e-05, "loss": 0.4643, "num_input_tokens_seen": 43286208, "step": 74565 }, { "epoch": 11.106642835865356, "grad_norm": 1.6948559284210205, "learning_rate": 2.453535948583552e-05, "loss": 0.6507, "num_input_tokens_seen": 43288992, "step": 74570 }, { "epoch": 11.107387548406315, "grad_norm": 1.1874322891235352, "learning_rate": 2.453211062957898e-05, "loss": 0.5868, "num_input_tokens_seen": 43292032, "step": 74575 }, { "epoch": 11.108132260947274, "grad_norm": 1.5456920862197876, "learning_rate": 2.4528861781226942e-05, "loss": 0.5961, "num_input_tokens_seen": 43295264, "step": 74580 }, { "epoch": 11.108876973488233, "grad_norm": 1.5103514194488525, "learning_rate": 2.4525612940834315e-05, "loss": 0.5214, "num_input_tokens_seen": 43297824, "step": 74585 }, { "epoch": 11.109621686029193, "grad_norm": 1.2694295644760132, "learning_rate": 2.452236410845598e-05, "loss": 0.5473, "num_input_tokens_seen": 43300480, "step": 74590 }, { "epoch": 11.110366398570152, "grad_norm": 1.8226608037948608, "learning_rate": 2.4519115284146806e-05, "loss": 0.5449, "num_input_tokens_seen": 43303360, "step": 74595 }, { "epoch": 11.11111111111111, "grad_norm": 1.2357516288757324, "learning_rate": 2.45158664679617e-05, "loss": 0.4537, "num_input_tokens_seen": 43306176, "step": 74600 }, { "epoch": 11.11185582365207, "grad_norm": 2.881133556365967, "learning_rate": 2.4512617659955532e-05, "loss": 0.6361, "num_input_tokens_seen": 43308992, "step": 74605 }, { "epoch": 11.11260053619303, "grad_norm": 1.0748975276947021, "learning_rate": 2.45093688601832e-05, "loss": 0.6035, "num_input_tokens_seen": 43311808, "step": 74610 }, { "epoch": 11.113345248733989, "grad_norm": 1.4329560995101929, "learning_rate": 2.450612006869957e-05, "loss": 0.731, "num_input_tokens_seen": 43314368, "step": 74615 }, { "epoch": 11.114089961274948, "grad_norm": 1.8325893878936768, "learning_rate": 2.450287128555955e-05, "loss": 0.6167, "num_input_tokens_seen": 43317568, "step": 74620 }, { "epoch": 11.114834673815906, "grad_norm": 1.0072163343429565, "learning_rate": 2.4499622510818017e-05, "loss": 0.6828, "num_input_tokens_seen": 43320416, "step": 74625 }, { "epoch": 11.115579386356867, "grad_norm": 1.4746919870376587, "learning_rate": 2.4496373744529842e-05, "loss": 0.5843, "num_input_tokens_seen": 43323296, "step": 74630 }, { "epoch": 11.116324098897826, "grad_norm": 1.0819029808044434, "learning_rate": 2.449312498674993e-05, "loss": 0.3515, "num_input_tokens_seen": 43326400, "step": 74635 }, { "epoch": 11.117068811438784, "grad_norm": 1.3213565349578857, "learning_rate": 2.448987623753315e-05, "loss": 0.4563, "num_input_tokens_seen": 43329248, "step": 74640 }, { "epoch": 11.117813523979743, "grad_norm": 0.9573512673377991, "learning_rate": 2.44866274969344e-05, "loss": 0.5981, "num_input_tokens_seen": 43332320, "step": 74645 }, { "epoch": 11.118558236520704, "grad_norm": 1.0818843841552734, "learning_rate": 2.4483378765008543e-05, "loss": 0.5444, "num_input_tokens_seen": 43335392, "step": 74650 }, { "epoch": 11.119302949061662, "grad_norm": 1.0319185256958008, "learning_rate": 2.4480130041810493e-05, "loss": 0.6746, "num_input_tokens_seen": 43338464, "step": 74655 }, { "epoch": 11.120047661602621, "grad_norm": 1.3750369548797607, "learning_rate": 2.4476881327395108e-05, "loss": 0.5678, "num_input_tokens_seen": 43341152, "step": 74660 }, { "epoch": 11.12079237414358, "grad_norm": 0.9586120247840881, "learning_rate": 2.447363262181728e-05, "loss": 0.5314, "num_input_tokens_seen": 43343968, "step": 74665 }, { "epoch": 11.12153708668454, "grad_norm": 1.581737756729126, "learning_rate": 2.44703839251319e-05, "loss": 0.4565, "num_input_tokens_seen": 43346752, "step": 74670 }, { "epoch": 11.1222817992255, "grad_norm": 1.1128371953964233, "learning_rate": 2.4467135237393842e-05, "loss": 0.6716, "num_input_tokens_seen": 43349600, "step": 74675 }, { "epoch": 11.123026511766458, "grad_norm": 1.1731925010681152, "learning_rate": 2.4463886558658e-05, "loss": 0.5949, "num_input_tokens_seen": 43352576, "step": 74680 }, { "epoch": 11.123771224307417, "grad_norm": 1.2828292846679688, "learning_rate": 2.4460637888979234e-05, "loss": 0.4654, "num_input_tokens_seen": 43355712, "step": 74685 }, { "epoch": 11.124515936848377, "grad_norm": 1.1928654909133911, "learning_rate": 2.4457389228412457e-05, "loss": 0.6245, "num_input_tokens_seen": 43358368, "step": 74690 }, { "epoch": 11.125260649389336, "grad_norm": 1.1889830827713013, "learning_rate": 2.445414057701253e-05, "loss": 0.5823, "num_input_tokens_seen": 43361088, "step": 74695 }, { "epoch": 11.126005361930295, "grad_norm": 1.3453233242034912, "learning_rate": 2.4450891934834345e-05, "loss": 0.5715, "num_input_tokens_seen": 43364352, "step": 74700 }, { "epoch": 11.126750074471254, "grad_norm": 1.9435858726501465, "learning_rate": 2.4447643301932785e-05, "loss": 0.5054, "num_input_tokens_seen": 43367264, "step": 74705 }, { "epoch": 11.127494787012214, "grad_norm": 1.8257945775985718, "learning_rate": 2.4444394678362727e-05, "loss": 0.5736, "num_input_tokens_seen": 43370464, "step": 74710 }, { "epoch": 11.128239499553173, "grad_norm": 1.3902415037155151, "learning_rate": 2.444114606417906e-05, "loss": 0.6252, "num_input_tokens_seen": 43373344, "step": 74715 }, { "epoch": 11.128984212094132, "grad_norm": 1.1983695030212402, "learning_rate": 2.4437897459436654e-05, "loss": 0.538, "num_input_tokens_seen": 43376224, "step": 74720 }, { "epoch": 11.12972892463509, "grad_norm": 1.90363609790802, "learning_rate": 2.4434648864190405e-05, "loss": 0.5309, "num_input_tokens_seen": 43379232, "step": 74725 }, { "epoch": 11.13047363717605, "grad_norm": 1.7269378900527954, "learning_rate": 2.4431400278495188e-05, "loss": 0.5999, "num_input_tokens_seen": 43381984, "step": 74730 }, { "epoch": 11.13121834971701, "grad_norm": 1.0357389450073242, "learning_rate": 2.442815170240589e-05, "loss": 0.5528, "num_input_tokens_seen": 43384928, "step": 74735 }, { "epoch": 11.131963062257968, "grad_norm": 1.6144202947616577, "learning_rate": 2.442490313597738e-05, "loss": 0.7934, "num_input_tokens_seen": 43387840, "step": 74740 }, { "epoch": 11.132707774798927, "grad_norm": 1.2804150581359863, "learning_rate": 2.4421654579264553e-05, "loss": 0.5637, "num_input_tokens_seen": 43390656, "step": 74745 }, { "epoch": 11.133452487339888, "grad_norm": 1.2619023323059082, "learning_rate": 2.4418406032322286e-05, "loss": 0.648, "num_input_tokens_seen": 43393856, "step": 74750 }, { "epoch": 11.134197199880846, "grad_norm": 1.7125725746154785, "learning_rate": 2.4415157495205445e-05, "loss": 0.5945, "num_input_tokens_seen": 43396800, "step": 74755 }, { "epoch": 11.134941912421805, "grad_norm": 1.7798813581466675, "learning_rate": 2.4411908967968938e-05, "loss": 0.6241, "num_input_tokens_seen": 43399488, "step": 74760 }, { "epoch": 11.135686624962764, "grad_norm": 1.527451515197754, "learning_rate": 2.4408660450667626e-05, "loss": 0.5753, "num_input_tokens_seen": 43402464, "step": 74765 }, { "epoch": 11.136431337503723, "grad_norm": 2.392793893814087, "learning_rate": 2.4405411943356398e-05, "loss": 0.7136, "num_input_tokens_seen": 43405504, "step": 74770 }, { "epoch": 11.137176050044683, "grad_norm": 3.295736789703369, "learning_rate": 2.440216344609012e-05, "loss": 0.8717, "num_input_tokens_seen": 43408384, "step": 74775 }, { "epoch": 11.137920762585642, "grad_norm": 1.3107805252075195, "learning_rate": 2.4398914958923695e-05, "loss": 0.6052, "num_input_tokens_seen": 43411392, "step": 74780 }, { "epoch": 11.1386654751266, "grad_norm": 2.4568376541137695, "learning_rate": 2.439566648191199e-05, "loss": 0.7023, "num_input_tokens_seen": 43414336, "step": 74785 }, { "epoch": 11.13941018766756, "grad_norm": 1.5065354108810425, "learning_rate": 2.439241801510987e-05, "loss": 0.5379, "num_input_tokens_seen": 43417024, "step": 74790 }, { "epoch": 11.14015490020852, "grad_norm": 1.2256501913070679, "learning_rate": 2.4389169558572247e-05, "loss": 0.6345, "num_input_tokens_seen": 43419648, "step": 74795 }, { "epoch": 11.140899612749479, "grad_norm": 1.9575539827346802, "learning_rate": 2.4385921112353978e-05, "loss": 0.6399, "num_input_tokens_seen": 43422560, "step": 74800 }, { "epoch": 11.141644325290438, "grad_norm": 1.5288612842559814, "learning_rate": 2.438267267650995e-05, "loss": 0.4981, "num_input_tokens_seen": 43425760, "step": 74805 }, { "epoch": 11.142389037831396, "grad_norm": 1.328460693359375, "learning_rate": 2.4379424251095034e-05, "loss": 0.6689, "num_input_tokens_seen": 43428544, "step": 74810 }, { "epoch": 11.143133750372357, "grad_norm": 1.6939034461975098, "learning_rate": 2.4376175836164122e-05, "loss": 0.6116, "num_input_tokens_seen": 43431392, "step": 74815 }, { "epoch": 11.143878462913316, "grad_norm": 0.9062399864196777, "learning_rate": 2.4372927431772076e-05, "loss": 0.6624, "num_input_tokens_seen": 43434496, "step": 74820 }, { "epoch": 11.144623175454274, "grad_norm": 0.9786755442619324, "learning_rate": 2.4369679037973792e-05, "loss": 0.6902, "num_input_tokens_seen": 43437472, "step": 74825 }, { "epoch": 11.145367887995233, "grad_norm": 0.9996261596679688, "learning_rate": 2.436643065482414e-05, "loss": 0.3766, "num_input_tokens_seen": 43440288, "step": 74830 }, { "epoch": 11.146112600536194, "grad_norm": 2.7090728282928467, "learning_rate": 2.4363182282377994e-05, "loss": 0.6958, "num_input_tokens_seen": 43443008, "step": 74835 }, { "epoch": 11.146857313077152, "grad_norm": 1.7480701208114624, "learning_rate": 2.4359933920690242e-05, "loss": 0.6428, "num_input_tokens_seen": 43446112, "step": 74840 }, { "epoch": 11.147602025618111, "grad_norm": 1.718691110610962, "learning_rate": 2.4356685569815742e-05, "loss": 0.592, "num_input_tokens_seen": 43448992, "step": 74845 }, { "epoch": 11.14834673815907, "grad_norm": 1.528153419494629, "learning_rate": 2.43534372298094e-05, "loss": 0.4755, "num_input_tokens_seen": 43451936, "step": 74850 }, { "epoch": 11.14909145070003, "grad_norm": 1.2307188510894775, "learning_rate": 2.4350188900726068e-05, "loss": 0.676, "num_input_tokens_seen": 43454560, "step": 74855 }, { "epoch": 11.14983616324099, "grad_norm": 1.3090406656265259, "learning_rate": 2.4346940582620644e-05, "loss": 0.599, "num_input_tokens_seen": 43457760, "step": 74860 }, { "epoch": 11.150580875781948, "grad_norm": 0.8652274012565613, "learning_rate": 2.4343692275547993e-05, "loss": 0.5531, "num_input_tokens_seen": 43460992, "step": 74865 }, { "epoch": 11.151325588322907, "grad_norm": 3.5013389587402344, "learning_rate": 2.434044397956299e-05, "loss": 0.5264, "num_input_tokens_seen": 43463904, "step": 74870 }, { "epoch": 11.152070300863867, "grad_norm": 1.8511584997177124, "learning_rate": 2.433719569472052e-05, "loss": 0.8387, "num_input_tokens_seen": 43466816, "step": 74875 }, { "epoch": 11.152815013404826, "grad_norm": 1.1733765602111816, "learning_rate": 2.433394742107545e-05, "loss": 0.5092, "num_input_tokens_seen": 43469568, "step": 74880 }, { "epoch": 11.153559725945785, "grad_norm": 1.6275182962417603, "learning_rate": 2.4330699158682666e-05, "loss": 0.5354, "num_input_tokens_seen": 43472576, "step": 74885 }, { "epoch": 11.154304438486744, "grad_norm": 1.6610523462295532, "learning_rate": 2.432745090759703e-05, "loss": 0.5859, "num_input_tokens_seen": 43475392, "step": 74890 }, { "epoch": 11.155049151027704, "grad_norm": 2.5894076824188232, "learning_rate": 2.432420266787344e-05, "loss": 0.5981, "num_input_tokens_seen": 43478336, "step": 74895 }, { "epoch": 11.155793863568663, "grad_norm": 1.4443243741989136, "learning_rate": 2.4320954439566752e-05, "loss": 0.5603, "num_input_tokens_seen": 43481216, "step": 74900 }, { "epoch": 11.156538576109622, "grad_norm": 0.9247133135795593, "learning_rate": 2.4317706222731853e-05, "loss": 0.5725, "num_input_tokens_seen": 43484416, "step": 74905 }, { "epoch": 11.15728328865058, "grad_norm": 1.0047476291656494, "learning_rate": 2.4314458017423618e-05, "loss": 0.5191, "num_input_tokens_seen": 43487040, "step": 74910 }, { "epoch": 11.158028001191541, "grad_norm": 1.7625616788864136, "learning_rate": 2.43112098236969e-05, "loss": 0.7163, "num_input_tokens_seen": 43489952, "step": 74915 }, { "epoch": 11.1587727137325, "grad_norm": 1.7155545949935913, "learning_rate": 2.430796164160661e-05, "loss": 0.6433, "num_input_tokens_seen": 43492832, "step": 74920 }, { "epoch": 11.159517426273458, "grad_norm": 1.9128767251968384, "learning_rate": 2.430471347120759e-05, "loss": 0.6604, "num_input_tokens_seen": 43495584, "step": 74925 }, { "epoch": 11.160262138814417, "grad_norm": 2.447683334350586, "learning_rate": 2.4301465312554743e-05, "loss": 0.5996, "num_input_tokens_seen": 43498496, "step": 74930 }, { "epoch": 11.161006851355376, "grad_norm": 1.6190053224563599, "learning_rate": 2.429821716570292e-05, "loss": 0.6152, "num_input_tokens_seen": 43501632, "step": 74935 }, { "epoch": 11.161751563896336, "grad_norm": 0.9598563313484192, "learning_rate": 2.4294969030707013e-05, "loss": 0.4512, "num_input_tokens_seen": 43504544, "step": 74940 }, { "epoch": 11.162496276437295, "grad_norm": 1.0692766904830933, "learning_rate": 2.4291720907621886e-05, "loss": 0.6097, "num_input_tokens_seen": 43507392, "step": 74945 }, { "epoch": 11.163240988978254, "grad_norm": 1.2015149593353271, "learning_rate": 2.4288472796502407e-05, "loss": 0.5722, "num_input_tokens_seen": 43510176, "step": 74950 }, { "epoch": 11.163985701519213, "grad_norm": 1.5046385526657104, "learning_rate": 2.4285224697403464e-05, "loss": 0.5796, "num_input_tokens_seen": 43512960, "step": 74955 }, { "epoch": 11.164730414060173, "grad_norm": 2.5293097496032715, "learning_rate": 2.4281976610379914e-05, "loss": 0.5671, "num_input_tokens_seen": 43515712, "step": 74960 }, { "epoch": 11.165475126601132, "grad_norm": 1.8556854724884033, "learning_rate": 2.427872853548665e-05, "loss": 0.6305, "num_input_tokens_seen": 43518528, "step": 74965 }, { "epoch": 11.16621983914209, "grad_norm": 1.3297232389450073, "learning_rate": 2.427548047277853e-05, "loss": 0.4946, "num_input_tokens_seen": 43521344, "step": 74970 }, { "epoch": 11.16696455168305, "grad_norm": 1.5536915063858032, "learning_rate": 2.4272232422310436e-05, "loss": 0.6465, "num_input_tokens_seen": 43524000, "step": 74975 }, { "epoch": 11.16770926422401, "grad_norm": 0.8485996723175049, "learning_rate": 2.4268984384137225e-05, "loss": 0.4011, "num_input_tokens_seen": 43526752, "step": 74980 }, { "epoch": 11.168453976764969, "grad_norm": 1.557455062866211, "learning_rate": 2.426573635831379e-05, "loss": 0.5723, "num_input_tokens_seen": 43529696, "step": 74985 }, { "epoch": 11.169198689305928, "grad_norm": 1.830913782119751, "learning_rate": 2.426248834489499e-05, "loss": 0.7508, "num_input_tokens_seen": 43532352, "step": 74990 }, { "epoch": 11.169943401846886, "grad_norm": 1.2560158967971802, "learning_rate": 2.4259240343935695e-05, "loss": 0.5136, "num_input_tokens_seen": 43535296, "step": 74995 }, { "epoch": 11.170688114387847, "grad_norm": 1.2013649940490723, "learning_rate": 2.4255992355490788e-05, "loss": 0.5079, "num_input_tokens_seen": 43538496, "step": 75000 }, { "epoch": 11.171432826928806, "grad_norm": 2.4215409755706787, "learning_rate": 2.425274437961513e-05, "loss": 0.7094, "num_input_tokens_seen": 43541408, "step": 75005 }, { "epoch": 11.172177539469764, "grad_norm": 1.1781326532363892, "learning_rate": 2.42494964163636e-05, "loss": 0.5332, "num_input_tokens_seen": 43544672, "step": 75010 }, { "epoch": 11.172922252010723, "grad_norm": 1.2561607360839844, "learning_rate": 2.4246248465791058e-05, "loss": 0.6725, "num_input_tokens_seen": 43547680, "step": 75015 }, { "epoch": 11.173666964551684, "grad_norm": 0.8872179388999939, "learning_rate": 2.4243000527952388e-05, "loss": 0.5646, "num_input_tokens_seen": 43550656, "step": 75020 }, { "epoch": 11.174411677092642, "grad_norm": 2.7692999839782715, "learning_rate": 2.423975260290246e-05, "loss": 0.7765, "num_input_tokens_seen": 43553664, "step": 75025 }, { "epoch": 11.175156389633601, "grad_norm": 1.946197748184204, "learning_rate": 2.4236504690696125e-05, "loss": 0.6057, "num_input_tokens_seen": 43556576, "step": 75030 }, { "epoch": 11.17590110217456, "grad_norm": 1.034716248512268, "learning_rate": 2.423325679138828e-05, "loss": 0.608, "num_input_tokens_seen": 43559168, "step": 75035 }, { "epoch": 11.17664581471552, "grad_norm": 1.5133984088897705, "learning_rate": 2.4230008905033774e-05, "loss": 0.491, "num_input_tokens_seen": 43561792, "step": 75040 }, { "epoch": 11.17739052725648, "grad_norm": 1.7903224229812622, "learning_rate": 2.4226761031687496e-05, "loss": 0.6998, "num_input_tokens_seen": 43564768, "step": 75045 }, { "epoch": 11.178135239797438, "grad_norm": 1.3759357929229736, "learning_rate": 2.4223513171404288e-05, "loss": 0.6696, "num_input_tokens_seen": 43567648, "step": 75050 }, { "epoch": 11.178879952338397, "grad_norm": 1.6997212171554565, "learning_rate": 2.4220265324239045e-05, "loss": 0.6111, "num_input_tokens_seen": 43570528, "step": 75055 }, { "epoch": 11.179624664879357, "grad_norm": 1.5661401748657227, "learning_rate": 2.4217017490246626e-05, "loss": 0.493, "num_input_tokens_seen": 43573440, "step": 75060 }, { "epoch": 11.180369377420316, "grad_norm": 2.017347812652588, "learning_rate": 2.4213769669481906e-05, "loss": 0.6058, "num_input_tokens_seen": 43576448, "step": 75065 }, { "epoch": 11.181114089961275, "grad_norm": 1.0221518278121948, "learning_rate": 2.421052186199974e-05, "loss": 0.5538, "num_input_tokens_seen": 43579264, "step": 75070 }, { "epoch": 11.181858802502234, "grad_norm": 1.1803460121154785, "learning_rate": 2.420727406785501e-05, "loss": 0.6394, "num_input_tokens_seen": 43582176, "step": 75075 }, { "epoch": 11.182603515043194, "grad_norm": 1.3505113124847412, "learning_rate": 2.4204026287102578e-05, "loss": 0.7436, "num_input_tokens_seen": 43585088, "step": 75080 }, { "epoch": 11.183348227584153, "grad_norm": 1.6879138946533203, "learning_rate": 2.4200778519797306e-05, "loss": 0.5583, "num_input_tokens_seen": 43587744, "step": 75085 }, { "epoch": 11.184092940125112, "grad_norm": 1.0046268701553345, "learning_rate": 2.4197530765994082e-05, "loss": 0.4515, "num_input_tokens_seen": 43590528, "step": 75090 }, { "epoch": 11.18483765266607, "grad_norm": 1.941097617149353, "learning_rate": 2.4194283025747748e-05, "loss": 0.7393, "num_input_tokens_seen": 43593248, "step": 75095 }, { "epoch": 11.18558236520703, "grad_norm": 1.2188681364059448, "learning_rate": 2.4191035299113194e-05, "loss": 0.7015, "num_input_tokens_seen": 43596160, "step": 75100 }, { "epoch": 11.18632707774799, "grad_norm": 1.9863674640655518, "learning_rate": 2.418778758614526e-05, "loss": 0.5713, "num_input_tokens_seen": 43598880, "step": 75105 }, { "epoch": 11.187071790288948, "grad_norm": 0.8124751448631287, "learning_rate": 2.4184539886898846e-05, "loss": 0.5841, "num_input_tokens_seen": 43601696, "step": 75110 }, { "epoch": 11.187816502829907, "grad_norm": 2.0004117488861084, "learning_rate": 2.41812922014288e-05, "loss": 0.4948, "num_input_tokens_seen": 43604672, "step": 75115 }, { "epoch": 11.188561215370866, "grad_norm": 1.134127140045166, "learning_rate": 2.417804452978998e-05, "loss": 0.5251, "num_input_tokens_seen": 43607424, "step": 75120 }, { "epoch": 11.189305927911827, "grad_norm": 1.3686163425445557, "learning_rate": 2.417479687203727e-05, "loss": 0.6813, "num_input_tokens_seen": 43610528, "step": 75125 }, { "epoch": 11.190050640452785, "grad_norm": 1.668561577796936, "learning_rate": 2.4171549228225524e-05, "loss": 0.8235, "num_input_tokens_seen": 43613376, "step": 75130 }, { "epoch": 11.190795352993744, "grad_norm": 1.2282195091247559, "learning_rate": 2.4168301598409617e-05, "loss": 0.4849, "num_input_tokens_seen": 43616384, "step": 75135 }, { "epoch": 11.191540065534703, "grad_norm": 1.5908335447311401, "learning_rate": 2.41650539826444e-05, "loss": 0.5552, "num_input_tokens_seen": 43619104, "step": 75140 }, { "epoch": 11.192284778075663, "grad_norm": 1.0154038667678833, "learning_rate": 2.416180638098476e-05, "loss": 0.5351, "num_input_tokens_seen": 43622080, "step": 75145 }, { "epoch": 11.193029490616622, "grad_norm": 1.251978874206543, "learning_rate": 2.4158558793485545e-05, "loss": 0.6538, "num_input_tokens_seen": 43625056, "step": 75150 }, { "epoch": 11.19377420315758, "grad_norm": 0.9258641004562378, "learning_rate": 2.4155311220201616e-05, "loss": 0.4799, "num_input_tokens_seen": 43628096, "step": 75155 }, { "epoch": 11.19451891569854, "grad_norm": 0.7956051230430603, "learning_rate": 2.4152063661187855e-05, "loss": 0.5092, "num_input_tokens_seen": 43630912, "step": 75160 }, { "epoch": 11.1952636282395, "grad_norm": 1.3199788331985474, "learning_rate": 2.414881611649911e-05, "loss": 0.5574, "num_input_tokens_seen": 43633664, "step": 75165 }, { "epoch": 11.196008340780459, "grad_norm": 1.2404779195785522, "learning_rate": 2.414556858619026e-05, "loss": 0.7287, "num_input_tokens_seen": 43636512, "step": 75170 }, { "epoch": 11.196753053321418, "grad_norm": 1.2446067333221436, "learning_rate": 2.4142321070316147e-05, "loss": 0.608, "num_input_tokens_seen": 43639488, "step": 75175 }, { "epoch": 11.197497765862376, "grad_norm": 1.292365550994873, "learning_rate": 2.4139073568931658e-05, "loss": 0.413, "num_input_tokens_seen": 43642240, "step": 75180 }, { "epoch": 11.198242478403337, "grad_norm": 1.435534954071045, "learning_rate": 2.4135826082091636e-05, "loss": 0.5698, "num_input_tokens_seen": 43644992, "step": 75185 }, { "epoch": 11.198987190944296, "grad_norm": 1.8283029794692993, "learning_rate": 2.4132578609850966e-05, "loss": 0.6332, "num_input_tokens_seen": 43647872, "step": 75190 }, { "epoch": 11.199731903485254, "grad_norm": 1.8440070152282715, "learning_rate": 2.41293311522645e-05, "loss": 0.5705, "num_input_tokens_seen": 43650720, "step": 75195 }, { "epoch": 11.200476616026213, "grad_norm": 1.1388083696365356, "learning_rate": 2.4126083709387094e-05, "loss": 0.7132, "num_input_tokens_seen": 43653312, "step": 75200 }, { "epoch": 11.201221328567174, "grad_norm": 2.5283913612365723, "learning_rate": 2.4122836281273618e-05, "loss": 0.6545, "num_input_tokens_seen": 43656000, "step": 75205 }, { "epoch": 11.201966041108133, "grad_norm": 1.4512379169464111, "learning_rate": 2.411958886797892e-05, "loss": 0.7002, "num_input_tokens_seen": 43658784, "step": 75210 }, { "epoch": 11.202710753649091, "grad_norm": 1.359575867652893, "learning_rate": 2.4116341469557888e-05, "loss": 0.6536, "num_input_tokens_seen": 43661664, "step": 75215 }, { "epoch": 11.20345546619005, "grad_norm": 1.726646900177002, "learning_rate": 2.4113094086065356e-05, "loss": 0.4634, "num_input_tokens_seen": 43664544, "step": 75220 }, { "epoch": 11.20420017873101, "grad_norm": 2.6753005981445312, "learning_rate": 2.4109846717556206e-05, "loss": 0.8132, "num_input_tokens_seen": 43667776, "step": 75225 }, { "epoch": 11.20494489127197, "grad_norm": 3.293696403503418, "learning_rate": 2.4106599364085296e-05, "loss": 0.7813, "num_input_tokens_seen": 43670688, "step": 75230 }, { "epoch": 11.205689603812928, "grad_norm": 0.8185096979141235, "learning_rate": 2.4103352025707473e-05, "loss": 0.4996, "num_input_tokens_seen": 43673792, "step": 75235 }, { "epoch": 11.206434316353887, "grad_norm": 1.9699519872665405, "learning_rate": 2.4100104702477614e-05, "loss": 0.618, "num_input_tokens_seen": 43676800, "step": 75240 }, { "epoch": 11.207179028894847, "grad_norm": 0.9850800633430481, "learning_rate": 2.4096857394450557e-05, "loss": 0.6153, "num_input_tokens_seen": 43679392, "step": 75245 }, { "epoch": 11.207923741435806, "grad_norm": 2.269103765487671, "learning_rate": 2.4093610101681192e-05, "loss": 0.684, "num_input_tokens_seen": 43681984, "step": 75250 }, { "epoch": 11.208668453976765, "grad_norm": 1.2389960289001465, "learning_rate": 2.4090362824224347e-05, "loss": 0.7261, "num_input_tokens_seen": 43684864, "step": 75255 }, { "epoch": 11.209413166517724, "grad_norm": 1.9900468587875366, "learning_rate": 2.4087115562134913e-05, "loss": 0.7635, "num_input_tokens_seen": 43687776, "step": 75260 }, { "epoch": 11.210157879058684, "grad_norm": 1.6063671112060547, "learning_rate": 2.4083868315467725e-05, "loss": 0.7398, "num_input_tokens_seen": 43690720, "step": 75265 }, { "epoch": 11.210902591599643, "grad_norm": 1.2800925970077515, "learning_rate": 2.4080621084277656e-05, "loss": 0.727, "num_input_tokens_seen": 43693696, "step": 75270 }, { "epoch": 11.211647304140602, "grad_norm": 1.6907333135604858, "learning_rate": 2.4077373868619562e-05, "loss": 0.6066, "num_input_tokens_seen": 43696512, "step": 75275 }, { "epoch": 11.21239201668156, "grad_norm": 1.2494044303894043, "learning_rate": 2.4074126668548287e-05, "loss": 0.6074, "num_input_tokens_seen": 43699424, "step": 75280 }, { "epoch": 11.21313672922252, "grad_norm": 1.708064079284668, "learning_rate": 2.4070879484118712e-05, "loss": 0.6695, "num_input_tokens_seen": 43702176, "step": 75285 }, { "epoch": 11.21388144176348, "grad_norm": 1.1026887893676758, "learning_rate": 2.4067632315385675e-05, "loss": 0.4309, "num_input_tokens_seen": 43705216, "step": 75290 }, { "epoch": 11.214626154304439, "grad_norm": 1.9429445266723633, "learning_rate": 2.4064385162404048e-05, "loss": 0.5897, "num_input_tokens_seen": 43708256, "step": 75295 }, { "epoch": 11.215370866845397, "grad_norm": 2.713344097137451, "learning_rate": 2.406113802522868e-05, "loss": 0.8286, "num_input_tokens_seen": 43711520, "step": 75300 }, { "epoch": 11.216115579386356, "grad_norm": 0.9765719771385193, "learning_rate": 2.4057890903914437e-05, "loss": 0.7424, "num_input_tokens_seen": 43714368, "step": 75305 }, { "epoch": 11.216860291927317, "grad_norm": 1.229819655418396, "learning_rate": 2.405464379851617e-05, "loss": 0.6372, "num_input_tokens_seen": 43717216, "step": 75310 }, { "epoch": 11.217605004468275, "grad_norm": 2.7576539516448975, "learning_rate": 2.4051396709088726e-05, "loss": 0.6083, "num_input_tokens_seen": 43719936, "step": 75315 }, { "epoch": 11.218349717009234, "grad_norm": 1.4066556692123413, "learning_rate": 2.4048149635686977e-05, "loss": 0.7422, "num_input_tokens_seen": 43722784, "step": 75320 }, { "epoch": 11.219094429550193, "grad_norm": 0.9725735783576965, "learning_rate": 2.4044902578365765e-05, "loss": 0.4276, "num_input_tokens_seen": 43725792, "step": 75325 }, { "epoch": 11.219839142091153, "grad_norm": 1.0990629196166992, "learning_rate": 2.404165553717996e-05, "loss": 0.6536, "num_input_tokens_seen": 43728832, "step": 75330 }, { "epoch": 11.220583854632112, "grad_norm": 1.4232838153839111, "learning_rate": 2.4038408512184406e-05, "loss": 0.7148, "num_input_tokens_seen": 43731776, "step": 75335 }, { "epoch": 11.221328567173071, "grad_norm": 1.1614807844161987, "learning_rate": 2.4035161503433974e-05, "loss": 0.3919, "num_input_tokens_seen": 43734656, "step": 75340 }, { "epoch": 11.22207327971403, "grad_norm": 1.0509765148162842, "learning_rate": 2.4031914510983492e-05, "loss": 0.5602, "num_input_tokens_seen": 43737664, "step": 75345 }, { "epoch": 11.22281799225499, "grad_norm": 1.5965571403503418, "learning_rate": 2.402866753488784e-05, "loss": 0.6692, "num_input_tokens_seen": 43740544, "step": 75350 }, { "epoch": 11.223562704795949, "grad_norm": 1.579066276550293, "learning_rate": 2.4025420575201867e-05, "loss": 0.5897, "num_input_tokens_seen": 43743232, "step": 75355 }, { "epoch": 11.224307417336908, "grad_norm": 1.0656617879867554, "learning_rate": 2.402217363198041e-05, "loss": 0.4969, "num_input_tokens_seen": 43746272, "step": 75360 }, { "epoch": 11.225052129877866, "grad_norm": 1.3499867916107178, "learning_rate": 2.4018926705278347e-05, "loss": 0.6475, "num_input_tokens_seen": 43749184, "step": 75365 }, { "epoch": 11.225796842418827, "grad_norm": 1.3335949182510376, "learning_rate": 2.4015679795150513e-05, "loss": 0.4952, "num_input_tokens_seen": 43752192, "step": 75370 }, { "epoch": 11.226541554959786, "grad_norm": 1.4206480979919434, "learning_rate": 2.4012432901651778e-05, "loss": 0.5245, "num_input_tokens_seen": 43754944, "step": 75375 }, { "epoch": 11.227286267500745, "grad_norm": 1.365699052810669, "learning_rate": 2.400918602483697e-05, "loss": 0.6493, "num_input_tokens_seen": 43758016, "step": 75380 }, { "epoch": 11.228030980041703, "grad_norm": 1.9245288372039795, "learning_rate": 2.400593916476097e-05, "loss": 0.7249, "num_input_tokens_seen": 43760992, "step": 75385 }, { "epoch": 11.228775692582664, "grad_norm": 1.1661826372146606, "learning_rate": 2.4002692321478617e-05, "loss": 0.5337, "num_input_tokens_seen": 43764192, "step": 75390 }, { "epoch": 11.229520405123623, "grad_norm": 1.3223634958267212, "learning_rate": 2.399944549504476e-05, "loss": 0.4405, "num_input_tokens_seen": 43766880, "step": 75395 }, { "epoch": 11.230265117664581, "grad_norm": 1.613127589225769, "learning_rate": 2.399619868551425e-05, "loss": 0.7803, "num_input_tokens_seen": 43769696, "step": 75400 }, { "epoch": 11.23100983020554, "grad_norm": 1.687961220741272, "learning_rate": 2.3992951892941948e-05, "loss": 0.5415, "num_input_tokens_seen": 43772640, "step": 75405 }, { "epoch": 11.2317545427465, "grad_norm": 1.5731101036071777, "learning_rate": 2.3989705117382705e-05, "loss": 0.4967, "num_input_tokens_seen": 43775712, "step": 75410 }, { "epoch": 11.23249925528746, "grad_norm": 0.5758827924728394, "learning_rate": 2.3986458358891353e-05, "loss": 0.5471, "num_input_tokens_seen": 43778592, "step": 75415 }, { "epoch": 11.233243967828418, "grad_norm": 1.497617244720459, "learning_rate": 2.3983211617522768e-05, "loss": 0.7792, "num_input_tokens_seen": 43781472, "step": 75420 }, { "epoch": 11.233988680369377, "grad_norm": 0.9603890776634216, "learning_rate": 2.3979964893331787e-05, "loss": 0.6316, "num_input_tokens_seen": 43784512, "step": 75425 }, { "epoch": 11.234733392910337, "grad_norm": 1.6285133361816406, "learning_rate": 2.397671818637327e-05, "loss": 0.6699, "num_input_tokens_seen": 43787264, "step": 75430 }, { "epoch": 11.235478105451296, "grad_norm": 1.704465389251709, "learning_rate": 2.3973471496702052e-05, "loss": 0.567, "num_input_tokens_seen": 43789952, "step": 75435 }, { "epoch": 11.236222817992255, "grad_norm": 1.3540430068969727, "learning_rate": 2.397022482437298e-05, "loss": 0.5447, "num_input_tokens_seen": 43792864, "step": 75440 }, { "epoch": 11.236967530533214, "grad_norm": 1.8376306295394897, "learning_rate": 2.3966978169440927e-05, "loss": 0.5224, "num_input_tokens_seen": 43795616, "step": 75445 }, { "epoch": 11.237712243074174, "grad_norm": 3.1394248008728027, "learning_rate": 2.396373153196072e-05, "loss": 0.7334, "num_input_tokens_seen": 43798528, "step": 75450 }, { "epoch": 11.238456955615133, "grad_norm": 1.3483891487121582, "learning_rate": 2.3960484911987223e-05, "loss": 0.4773, "num_input_tokens_seen": 43801440, "step": 75455 }, { "epoch": 11.239201668156092, "grad_norm": 2.185694694519043, "learning_rate": 2.395723830957527e-05, "loss": 0.6441, "num_input_tokens_seen": 43804320, "step": 75460 }, { "epoch": 11.23994638069705, "grad_norm": 1.04463791847229, "learning_rate": 2.3953991724779723e-05, "loss": 0.5718, "num_input_tokens_seen": 43806944, "step": 75465 }, { "epoch": 11.24069109323801, "grad_norm": 1.2661768198013306, "learning_rate": 2.395074515765542e-05, "loss": 0.6826, "num_input_tokens_seen": 43809632, "step": 75470 }, { "epoch": 11.24143580577897, "grad_norm": 2.5112452507019043, "learning_rate": 2.3947498608257204e-05, "loss": 0.6627, "num_input_tokens_seen": 43812096, "step": 75475 }, { "epoch": 11.242180518319929, "grad_norm": 0.9834961295127869, "learning_rate": 2.394425207663994e-05, "loss": 0.4403, "num_input_tokens_seen": 43814688, "step": 75480 }, { "epoch": 11.242925230860887, "grad_norm": 1.5126709938049316, "learning_rate": 2.3941005562858452e-05, "loss": 0.5338, "num_input_tokens_seen": 43817632, "step": 75485 }, { "epoch": 11.243669943401846, "grad_norm": 1.914490818977356, "learning_rate": 2.393775906696761e-05, "loss": 0.6632, "num_input_tokens_seen": 43820288, "step": 75490 }, { "epoch": 11.244414655942807, "grad_norm": 1.628656268119812, "learning_rate": 2.393451258902224e-05, "loss": 0.7549, "num_input_tokens_seen": 43823008, "step": 75495 }, { "epoch": 11.245159368483765, "grad_norm": 1.8301982879638672, "learning_rate": 2.3931266129077206e-05, "loss": 0.6399, "num_input_tokens_seen": 43826336, "step": 75500 }, { "epoch": 11.245904081024724, "grad_norm": 2.1051342487335205, "learning_rate": 2.3928019687187338e-05, "loss": 0.6048, "num_input_tokens_seen": 43829472, "step": 75505 }, { "epoch": 11.246648793565683, "grad_norm": 2.121037721633911, "learning_rate": 2.392477326340749e-05, "loss": 0.8667, "num_input_tokens_seen": 43832448, "step": 75510 }, { "epoch": 11.247393506106643, "grad_norm": 0.8822638988494873, "learning_rate": 2.3921526857792508e-05, "loss": 0.7882, "num_input_tokens_seen": 43835488, "step": 75515 }, { "epoch": 11.248138218647602, "grad_norm": 1.7465602159500122, "learning_rate": 2.3918280470397226e-05, "loss": 0.6885, "num_input_tokens_seen": 43838368, "step": 75520 }, { "epoch": 11.248882931188561, "grad_norm": 1.3355858325958252, "learning_rate": 2.3915034101276504e-05, "loss": 0.7019, "num_input_tokens_seen": 43841152, "step": 75525 }, { "epoch": 11.24962764372952, "grad_norm": 3.488621711730957, "learning_rate": 2.3911787750485172e-05, "loss": 0.6736, "num_input_tokens_seen": 43843936, "step": 75530 }, { "epoch": 11.25037235627048, "grad_norm": 0.9979077577590942, "learning_rate": 2.3908541418078087e-05, "loss": 0.5265, "num_input_tokens_seen": 43846720, "step": 75535 }, { "epoch": 11.251117068811439, "grad_norm": 1.0633710622787476, "learning_rate": 2.3905295104110076e-05, "loss": 0.6421, "num_input_tokens_seen": 43849728, "step": 75540 }, { "epoch": 11.251861781352398, "grad_norm": 1.8543239831924438, "learning_rate": 2.3902048808636e-05, "loss": 0.8038, "num_input_tokens_seen": 43852672, "step": 75545 }, { "epoch": 11.252606493893357, "grad_norm": 1.187351107597351, "learning_rate": 2.3898802531710693e-05, "loss": 0.6913, "num_input_tokens_seen": 43855360, "step": 75550 }, { "epoch": 11.253351206434317, "grad_norm": 1.2004269361495972, "learning_rate": 2.389555627338899e-05, "loss": 0.561, "num_input_tokens_seen": 43858144, "step": 75555 }, { "epoch": 11.254095918975276, "grad_norm": 2.2020604610443115, "learning_rate": 2.3892310033725747e-05, "loss": 0.5893, "num_input_tokens_seen": 43860992, "step": 75560 }, { "epoch": 11.254840631516235, "grad_norm": 1.909827470779419, "learning_rate": 2.38890638127758e-05, "loss": 0.6385, "num_input_tokens_seen": 43863936, "step": 75565 }, { "epoch": 11.255585344057193, "grad_norm": 0.954620897769928, "learning_rate": 2.3885817610593994e-05, "loss": 0.5337, "num_input_tokens_seen": 43866912, "step": 75570 }, { "epoch": 11.256330056598154, "grad_norm": 1.6269948482513428, "learning_rate": 2.3882571427235156e-05, "loss": 0.5918, "num_input_tokens_seen": 43869696, "step": 75575 }, { "epoch": 11.257074769139113, "grad_norm": 1.7085109949111938, "learning_rate": 2.3879325262754152e-05, "loss": 0.6386, "num_input_tokens_seen": 43873056, "step": 75580 }, { "epoch": 11.257819481680071, "grad_norm": 2.5422425270080566, "learning_rate": 2.3876079117205795e-05, "loss": 0.656, "num_input_tokens_seen": 43875872, "step": 75585 }, { "epoch": 11.25856419422103, "grad_norm": 1.9928531646728516, "learning_rate": 2.387283299064495e-05, "loss": 0.6129, "num_input_tokens_seen": 43878784, "step": 75590 }, { "epoch": 11.25930890676199, "grad_norm": 1.34788179397583, "learning_rate": 2.3869586883126445e-05, "loss": 0.6747, "num_input_tokens_seen": 43881760, "step": 75595 }, { "epoch": 11.26005361930295, "grad_norm": 0.9388378858566284, "learning_rate": 2.3866340794705117e-05, "loss": 0.5567, "num_input_tokens_seen": 43884544, "step": 75600 }, { "epoch": 11.260798331843908, "grad_norm": 1.356972336769104, "learning_rate": 2.3863094725435813e-05, "loss": 0.6705, "num_input_tokens_seen": 43887392, "step": 75605 }, { "epoch": 11.261543044384867, "grad_norm": 2.2386832237243652, "learning_rate": 2.3859848675373358e-05, "loss": 0.7911, "num_input_tokens_seen": 43890144, "step": 75610 }, { "epoch": 11.262287756925826, "grad_norm": 1.1409087181091309, "learning_rate": 2.385660264457261e-05, "loss": 0.5694, "num_input_tokens_seen": 43892832, "step": 75615 }, { "epoch": 11.263032469466786, "grad_norm": 1.306714415550232, "learning_rate": 2.385335663308839e-05, "loss": 0.7377, "num_input_tokens_seen": 43895680, "step": 75620 }, { "epoch": 11.263777182007745, "grad_norm": 1.442064642906189, "learning_rate": 2.3850110640975555e-05, "loss": 0.4697, "num_input_tokens_seen": 43898720, "step": 75625 }, { "epoch": 11.264521894548704, "grad_norm": 1.2362194061279297, "learning_rate": 2.3846864668288933e-05, "loss": 0.4435, "num_input_tokens_seen": 43901504, "step": 75630 }, { "epoch": 11.265266607089663, "grad_norm": 1.8583530187606812, "learning_rate": 2.3843618715083353e-05, "loss": 0.5999, "num_input_tokens_seen": 43904704, "step": 75635 }, { "epoch": 11.266011319630623, "grad_norm": 1.0632829666137695, "learning_rate": 2.3840372781413667e-05, "loss": 0.4909, "num_input_tokens_seen": 43907392, "step": 75640 }, { "epoch": 11.266756032171582, "grad_norm": 1.0460402965545654, "learning_rate": 2.3837126867334687e-05, "loss": 0.4585, "num_input_tokens_seen": 43910176, "step": 75645 }, { "epoch": 11.26750074471254, "grad_norm": 1.5192188024520874, "learning_rate": 2.3833880972901285e-05, "loss": 0.4168, "num_input_tokens_seen": 43912960, "step": 75650 }, { "epoch": 11.2682454572535, "grad_norm": 1.0407122373580933, "learning_rate": 2.3830635098168267e-05, "loss": 0.5027, "num_input_tokens_seen": 43915552, "step": 75655 }, { "epoch": 11.26899016979446, "grad_norm": 2.2201132774353027, "learning_rate": 2.3827389243190486e-05, "loss": 0.6204, "num_input_tokens_seen": 43918464, "step": 75660 }, { "epoch": 11.269734882335419, "grad_norm": 0.639807939529419, "learning_rate": 2.3824143408022773e-05, "loss": 0.5544, "num_input_tokens_seen": 43921792, "step": 75665 }, { "epoch": 11.270479594876377, "grad_norm": 1.8611875772476196, "learning_rate": 2.3820897592719964e-05, "loss": 0.5765, "num_input_tokens_seen": 43924448, "step": 75670 }, { "epoch": 11.271224307417336, "grad_norm": 1.2661898136138916, "learning_rate": 2.3817651797336894e-05, "loss": 0.8195, "num_input_tokens_seen": 43927456, "step": 75675 }, { "epoch": 11.271969019958297, "grad_norm": 1.3878940343856812, "learning_rate": 2.3814406021928382e-05, "loss": 0.5744, "num_input_tokens_seen": 43930592, "step": 75680 }, { "epoch": 11.272713732499255, "grad_norm": 1.0054060220718384, "learning_rate": 2.381116026654929e-05, "loss": 0.7206, "num_input_tokens_seen": 43933728, "step": 75685 }, { "epoch": 11.273458445040214, "grad_norm": 0.9496224522590637, "learning_rate": 2.3807914531254417e-05, "loss": 0.6628, "num_input_tokens_seen": 43936736, "step": 75690 }, { "epoch": 11.274203157581173, "grad_norm": 2.0664610862731934, "learning_rate": 2.3804668816098635e-05, "loss": 0.6467, "num_input_tokens_seen": 43939616, "step": 75695 }, { "epoch": 11.274947870122134, "grad_norm": 2.427009344100952, "learning_rate": 2.3801423121136752e-05, "loss": 0.8332, "num_input_tokens_seen": 43942720, "step": 75700 }, { "epoch": 11.275692582663092, "grad_norm": 1.2338135242462158, "learning_rate": 2.379817744642361e-05, "loss": 0.5597, "num_input_tokens_seen": 43945536, "step": 75705 }, { "epoch": 11.276437295204051, "grad_norm": 0.8192841410636902, "learning_rate": 2.379493179201403e-05, "loss": 0.6094, "num_input_tokens_seen": 43948672, "step": 75710 }, { "epoch": 11.27718200774501, "grad_norm": 0.9765292406082153, "learning_rate": 2.3791686157962866e-05, "loss": 0.5578, "num_input_tokens_seen": 43951488, "step": 75715 }, { "epoch": 11.27792672028597, "grad_norm": 1.104275107383728, "learning_rate": 2.378844054432493e-05, "loss": 0.6381, "num_input_tokens_seen": 43954432, "step": 75720 }, { "epoch": 11.278671432826929, "grad_norm": 2.677638530731201, "learning_rate": 2.378519495115506e-05, "loss": 0.5395, "num_input_tokens_seen": 43957280, "step": 75725 }, { "epoch": 11.279416145367888, "grad_norm": 1.6962119340896606, "learning_rate": 2.3781949378508085e-05, "loss": 0.5714, "num_input_tokens_seen": 43959872, "step": 75730 }, { "epoch": 11.280160857908847, "grad_norm": 1.0753337144851685, "learning_rate": 2.3778703826438833e-05, "loss": 0.4802, "num_input_tokens_seen": 43962560, "step": 75735 }, { "epoch": 11.280905570449807, "grad_norm": 1.5081207752227783, "learning_rate": 2.377545829500215e-05, "loss": 0.7798, "num_input_tokens_seen": 43965600, "step": 75740 }, { "epoch": 11.281650282990766, "grad_norm": 1.355876088142395, "learning_rate": 2.377221278425284e-05, "loss": 0.6358, "num_input_tokens_seen": 43968512, "step": 75745 }, { "epoch": 11.282394995531725, "grad_norm": 0.7039982676506042, "learning_rate": 2.376896729424576e-05, "loss": 0.6191, "num_input_tokens_seen": 43971808, "step": 75750 }, { "epoch": 11.283139708072683, "grad_norm": 1.90381920337677, "learning_rate": 2.3765721825035724e-05, "loss": 0.461, "num_input_tokens_seen": 43974816, "step": 75755 }, { "epoch": 11.283884420613644, "grad_norm": 1.6847690343856812, "learning_rate": 2.376247637667756e-05, "loss": 0.5843, "num_input_tokens_seen": 43978016, "step": 75760 }, { "epoch": 11.284629133154603, "grad_norm": 1.1487555503845215, "learning_rate": 2.3759230949226103e-05, "loss": 0.5141, "num_input_tokens_seen": 43981184, "step": 75765 }, { "epoch": 11.285373845695561, "grad_norm": 0.9592903256416321, "learning_rate": 2.375598554273617e-05, "loss": 0.6702, "num_input_tokens_seen": 43984480, "step": 75770 }, { "epoch": 11.28611855823652, "grad_norm": 1.3441581726074219, "learning_rate": 2.3752740157262607e-05, "loss": 0.6302, "num_input_tokens_seen": 43987424, "step": 75775 }, { "epoch": 11.28686327077748, "grad_norm": 2.2353553771972656, "learning_rate": 2.374949479286022e-05, "loss": 0.5819, "num_input_tokens_seen": 43990144, "step": 75780 }, { "epoch": 11.28760798331844, "grad_norm": 0.9175338745117188, "learning_rate": 2.374624944958386e-05, "loss": 0.5141, "num_input_tokens_seen": 43993248, "step": 75785 }, { "epoch": 11.288352695859398, "grad_norm": 0.8386777639389038, "learning_rate": 2.3743004127488332e-05, "loss": 0.7242, "num_input_tokens_seen": 43997184, "step": 75790 }, { "epoch": 11.289097408400357, "grad_norm": 1.3780869245529175, "learning_rate": 2.373975882662848e-05, "loss": 0.7231, "num_input_tokens_seen": 44000192, "step": 75795 }, { "epoch": 11.289842120941316, "grad_norm": 2.041569948196411, "learning_rate": 2.3736513547059124e-05, "loss": 0.6078, "num_input_tokens_seen": 44003232, "step": 75800 }, { "epoch": 11.290586833482276, "grad_norm": 1.751650333404541, "learning_rate": 2.373326828883507e-05, "loss": 0.6114, "num_input_tokens_seen": 44006208, "step": 75805 }, { "epoch": 11.291331546023235, "grad_norm": 1.5069411993026733, "learning_rate": 2.3730023052011178e-05, "loss": 0.7303, "num_input_tokens_seen": 44008992, "step": 75810 }, { "epoch": 11.292076258564194, "grad_norm": 2.158503293991089, "learning_rate": 2.3726777836642243e-05, "loss": 0.5857, "num_input_tokens_seen": 44011904, "step": 75815 }, { "epoch": 11.292820971105153, "grad_norm": 0.7921425700187683, "learning_rate": 2.3723532642783114e-05, "loss": 0.5468, "num_input_tokens_seen": 44014656, "step": 75820 }, { "epoch": 11.293565683646113, "grad_norm": 1.5959967374801636, "learning_rate": 2.3720287470488596e-05, "loss": 0.6901, "num_input_tokens_seen": 44017472, "step": 75825 }, { "epoch": 11.294310396187072, "grad_norm": 1.3119311332702637, "learning_rate": 2.3717042319813527e-05, "loss": 0.4771, "num_input_tokens_seen": 44020640, "step": 75830 }, { "epoch": 11.29505510872803, "grad_norm": 1.514656662940979, "learning_rate": 2.3713797190812726e-05, "loss": 0.6249, "num_input_tokens_seen": 44023648, "step": 75835 }, { "epoch": 11.29579982126899, "grad_norm": 1.3126872777938843, "learning_rate": 2.3710552083540998e-05, "loss": 0.6438, "num_input_tokens_seen": 44026592, "step": 75840 }, { "epoch": 11.29654453380995, "grad_norm": 1.9044965505599976, "learning_rate": 2.3707306998053198e-05, "loss": 0.4851, "num_input_tokens_seen": 44029440, "step": 75845 }, { "epoch": 11.297289246350909, "grad_norm": 1.6485059261322021, "learning_rate": 2.370406193440412e-05, "loss": 0.7895, "num_input_tokens_seen": 44032256, "step": 75850 }, { "epoch": 11.298033958891867, "grad_norm": 1.2380601167678833, "learning_rate": 2.3700816892648608e-05, "loss": 0.6973, "num_input_tokens_seen": 44035456, "step": 75855 }, { "epoch": 11.298778671432826, "grad_norm": 1.7780866622924805, "learning_rate": 2.369757187284147e-05, "loss": 0.699, "num_input_tokens_seen": 44038368, "step": 75860 }, { "epoch": 11.299523383973787, "grad_norm": 1.3083007335662842, "learning_rate": 2.3694326875037536e-05, "loss": 0.6478, "num_input_tokens_seen": 44041600, "step": 75865 }, { "epoch": 11.300268096514746, "grad_norm": 1.0774825811386108, "learning_rate": 2.3691081899291613e-05, "loss": 0.8139, "num_input_tokens_seen": 44044736, "step": 75870 }, { "epoch": 11.301012809055704, "grad_norm": 2.545213460922241, "learning_rate": 2.3687836945658542e-05, "loss": 0.7421, "num_input_tokens_seen": 44047744, "step": 75875 }, { "epoch": 11.301757521596663, "grad_norm": 1.730124592781067, "learning_rate": 2.3684592014193133e-05, "loss": 0.7361, "num_input_tokens_seen": 44050560, "step": 75880 }, { "epoch": 11.302502234137624, "grad_norm": 1.210525631904602, "learning_rate": 2.3681347104950193e-05, "loss": 0.4921, "num_input_tokens_seen": 44053344, "step": 75885 }, { "epoch": 11.303246946678582, "grad_norm": 2.229785203933716, "learning_rate": 2.3678102217984564e-05, "loss": 0.684, "num_input_tokens_seen": 44056000, "step": 75890 }, { "epoch": 11.303991659219541, "grad_norm": 0.9848033785820007, "learning_rate": 2.3674857353351048e-05, "loss": 0.3804, "num_input_tokens_seen": 44058976, "step": 75895 }, { "epoch": 11.3047363717605, "grad_norm": 1.0574846267700195, "learning_rate": 2.3671612511104476e-05, "loss": 0.4819, "num_input_tokens_seen": 44062048, "step": 75900 }, { "epoch": 11.30548108430146, "grad_norm": 1.757855772972107, "learning_rate": 2.366836769129965e-05, "loss": 0.7181, "num_input_tokens_seen": 44065120, "step": 75905 }, { "epoch": 11.30622579684242, "grad_norm": 1.030403971672058, "learning_rate": 2.366512289399141e-05, "loss": 0.6058, "num_input_tokens_seen": 44067904, "step": 75910 }, { "epoch": 11.306970509383378, "grad_norm": 2.624796152114868, "learning_rate": 2.3661878119234562e-05, "loss": 0.8322, "num_input_tokens_seen": 44071008, "step": 75915 }, { "epoch": 11.307715221924337, "grad_norm": 4.376768589019775, "learning_rate": 2.3658633367083914e-05, "loss": 0.7762, "num_input_tokens_seen": 44073888, "step": 75920 }, { "epoch": 11.308459934465297, "grad_norm": 2.44018292427063, "learning_rate": 2.3655388637594298e-05, "loss": 0.6755, "num_input_tokens_seen": 44076544, "step": 75925 }, { "epoch": 11.309204647006256, "grad_norm": 1.0031708478927612, "learning_rate": 2.3652143930820523e-05, "loss": 0.5492, "num_input_tokens_seen": 44079456, "step": 75930 }, { "epoch": 11.309949359547215, "grad_norm": 1.5304510593414307, "learning_rate": 2.364889924681741e-05, "loss": 0.6733, "num_input_tokens_seen": 44082272, "step": 75935 }, { "epoch": 11.310694072088173, "grad_norm": 1.677361249923706, "learning_rate": 2.364565458563976e-05, "loss": 0.6936, "num_input_tokens_seen": 44085088, "step": 75940 }, { "epoch": 11.311438784629134, "grad_norm": 1.3694639205932617, "learning_rate": 2.364240994734241e-05, "loss": 0.4977, "num_input_tokens_seen": 44088064, "step": 75945 }, { "epoch": 11.312183497170093, "grad_norm": 2.2410731315612793, "learning_rate": 2.3639165331980157e-05, "loss": 0.6889, "num_input_tokens_seen": 44090816, "step": 75950 }, { "epoch": 11.312928209711052, "grad_norm": 0.8866641521453857, "learning_rate": 2.3635920739607828e-05, "loss": 0.8158, "num_input_tokens_seen": 44093696, "step": 75955 }, { "epoch": 11.31367292225201, "grad_norm": 0.9181566834449768, "learning_rate": 2.3632676170280235e-05, "loss": 0.5226, "num_input_tokens_seen": 44096512, "step": 75960 }, { "epoch": 11.31441763479297, "grad_norm": 1.2020225524902344, "learning_rate": 2.362943162405218e-05, "loss": 0.583, "num_input_tokens_seen": 44099712, "step": 75965 }, { "epoch": 11.31516234733393, "grad_norm": 2.3389461040496826, "learning_rate": 2.3626187100978496e-05, "loss": 0.4236, "num_input_tokens_seen": 44102560, "step": 75970 }, { "epoch": 11.315907059874888, "grad_norm": 2.625020980834961, "learning_rate": 2.362294260111397e-05, "loss": 0.5864, "num_input_tokens_seen": 44105472, "step": 75975 }, { "epoch": 11.316651772415847, "grad_norm": 1.8245139122009277, "learning_rate": 2.3619698124513438e-05, "loss": 0.5364, "num_input_tokens_seen": 44108384, "step": 75980 }, { "epoch": 11.317396484956806, "grad_norm": 1.0860891342163086, "learning_rate": 2.3616453671231694e-05, "loss": 0.5643, "num_input_tokens_seen": 44111424, "step": 75985 }, { "epoch": 11.318141197497766, "grad_norm": 0.7839048504829407, "learning_rate": 2.3613209241323574e-05, "loss": 0.569, "num_input_tokens_seen": 44114496, "step": 75990 }, { "epoch": 11.318885910038725, "grad_norm": 2.5882461071014404, "learning_rate": 2.360996483484387e-05, "loss": 0.6956, "num_input_tokens_seen": 44117696, "step": 75995 }, { "epoch": 11.319630622579684, "grad_norm": 2.0272903442382812, "learning_rate": 2.360672045184739e-05, "loss": 0.749, "num_input_tokens_seen": 44120608, "step": 76000 }, { "epoch": 11.320375335120643, "grad_norm": 1.9363164901733398, "learning_rate": 2.3603476092388963e-05, "loss": 0.5843, "num_input_tokens_seen": 44123392, "step": 76005 }, { "epoch": 11.321120047661603, "grad_norm": 1.7211031913757324, "learning_rate": 2.3600231756523373e-05, "loss": 0.6191, "num_input_tokens_seen": 44126624, "step": 76010 }, { "epoch": 11.321864760202562, "grad_norm": 1.8637299537658691, "learning_rate": 2.3596987444305456e-05, "loss": 0.7586, "num_input_tokens_seen": 44129600, "step": 76015 }, { "epoch": 11.32260947274352, "grad_norm": 2.1729228496551514, "learning_rate": 2.359374315579e-05, "loss": 0.5446, "num_input_tokens_seen": 44132384, "step": 76020 }, { "epoch": 11.32335418528448, "grad_norm": 1.3709206581115723, "learning_rate": 2.3590498891031838e-05, "loss": 0.5835, "num_input_tokens_seen": 44135264, "step": 76025 }, { "epoch": 11.32409889782544, "grad_norm": 0.8291054368019104, "learning_rate": 2.3587254650085757e-05, "loss": 0.4909, "num_input_tokens_seen": 44138176, "step": 76030 }, { "epoch": 11.324843610366399, "grad_norm": 2.672905445098877, "learning_rate": 2.3584010433006577e-05, "loss": 0.6537, "num_input_tokens_seen": 44141280, "step": 76035 }, { "epoch": 11.325588322907358, "grad_norm": 1.0610367059707642, "learning_rate": 2.3580766239849102e-05, "loss": 0.5379, "num_input_tokens_seen": 44143904, "step": 76040 }, { "epoch": 11.326333035448316, "grad_norm": 2.040515422821045, "learning_rate": 2.3577522070668128e-05, "loss": 0.5858, "num_input_tokens_seen": 44147136, "step": 76045 }, { "epoch": 11.327077747989277, "grad_norm": 1.661746859550476, "learning_rate": 2.3574277925518488e-05, "loss": 0.7905, "num_input_tokens_seen": 44149888, "step": 76050 }, { "epoch": 11.327822460530236, "grad_norm": 1.740299105644226, "learning_rate": 2.357103380445496e-05, "loss": 0.4936, "num_input_tokens_seen": 44152544, "step": 76055 }, { "epoch": 11.328567173071194, "grad_norm": 2.4966061115264893, "learning_rate": 2.356778970753237e-05, "loss": 0.6111, "num_input_tokens_seen": 44155744, "step": 76060 }, { "epoch": 11.329311885612153, "grad_norm": 0.8410837650299072, "learning_rate": 2.3564545634805516e-05, "loss": 0.5786, "num_input_tokens_seen": 44158656, "step": 76065 }, { "epoch": 11.330056598153114, "grad_norm": 1.3354276418685913, "learning_rate": 2.356130158632921e-05, "loss": 0.6013, "num_input_tokens_seen": 44161376, "step": 76070 }, { "epoch": 11.330801310694072, "grad_norm": 1.0185282230377197, "learning_rate": 2.3558057562158247e-05, "loss": 0.6365, "num_input_tokens_seen": 44164096, "step": 76075 }, { "epoch": 11.331546023235031, "grad_norm": 1.15519380569458, "learning_rate": 2.355481356234743e-05, "loss": 0.6398, "num_input_tokens_seen": 44166816, "step": 76080 }, { "epoch": 11.33229073577599, "grad_norm": 1.0604435205459595, "learning_rate": 2.355156958695158e-05, "loss": 0.6059, "num_input_tokens_seen": 44169632, "step": 76085 }, { "epoch": 11.33303544831695, "grad_norm": 1.1584973335266113, "learning_rate": 2.354832563602548e-05, "loss": 0.6075, "num_input_tokens_seen": 44172352, "step": 76090 }, { "epoch": 11.33378016085791, "grad_norm": 1.532212734222412, "learning_rate": 2.3545081709623953e-05, "loss": 0.5015, "num_input_tokens_seen": 44175520, "step": 76095 }, { "epoch": 11.334524873398868, "grad_norm": 2.7927331924438477, "learning_rate": 2.354183780780178e-05, "loss": 0.7342, "num_input_tokens_seen": 44178304, "step": 76100 }, { "epoch": 11.335269585939827, "grad_norm": 1.830721139907837, "learning_rate": 2.3538593930613784e-05, "loss": 0.5434, "num_input_tokens_seen": 44181024, "step": 76105 }, { "epoch": 11.336014298480787, "grad_norm": 1.2735244035720825, "learning_rate": 2.353535007811475e-05, "loss": 0.8775, "num_input_tokens_seen": 44183968, "step": 76110 }, { "epoch": 11.336759011021746, "grad_norm": 3.143019437789917, "learning_rate": 2.3532106250359498e-05, "loss": 0.5288, "num_input_tokens_seen": 44186752, "step": 76115 }, { "epoch": 11.337503723562705, "grad_norm": 1.622771143913269, "learning_rate": 2.3528862447402817e-05, "loss": 0.7194, "num_input_tokens_seen": 44189728, "step": 76120 }, { "epoch": 11.338248436103664, "grad_norm": 1.6443877220153809, "learning_rate": 2.3525618669299505e-05, "loss": 0.6133, "num_input_tokens_seen": 44192576, "step": 76125 }, { "epoch": 11.338993148644622, "grad_norm": 2.1253392696380615, "learning_rate": 2.3522374916104377e-05, "loss": 0.6032, "num_input_tokens_seen": 44195136, "step": 76130 }, { "epoch": 11.339737861185583, "grad_norm": 0.7508260607719421, "learning_rate": 2.3519131187872207e-05, "loss": 0.5904, "num_input_tokens_seen": 44198048, "step": 76135 }, { "epoch": 11.340482573726542, "grad_norm": 1.6488760709762573, "learning_rate": 2.3515887484657823e-05, "loss": 0.5168, "num_input_tokens_seen": 44201024, "step": 76140 }, { "epoch": 11.3412272862675, "grad_norm": 1.315281867980957, "learning_rate": 2.3512643806516e-05, "loss": 0.6998, "num_input_tokens_seen": 44204224, "step": 76145 }, { "epoch": 11.341971998808459, "grad_norm": 1.6453417539596558, "learning_rate": 2.3509400153501556e-05, "loss": 0.7181, "num_input_tokens_seen": 44207072, "step": 76150 }, { "epoch": 11.34271671134942, "grad_norm": 1.7607334852218628, "learning_rate": 2.3506156525669286e-05, "loss": 0.6077, "num_input_tokens_seen": 44209984, "step": 76155 }, { "epoch": 11.343461423890378, "grad_norm": 1.7371541261672974, "learning_rate": 2.3502912923073976e-05, "loss": 0.8441, "num_input_tokens_seen": 44212800, "step": 76160 }, { "epoch": 11.344206136431337, "grad_norm": 0.9730259776115417, "learning_rate": 2.349966934577044e-05, "loss": 0.6739, "num_input_tokens_seen": 44215776, "step": 76165 }, { "epoch": 11.344950848972296, "grad_norm": 1.0525598526000977, "learning_rate": 2.349642579381345e-05, "loss": 0.553, "num_input_tokens_seen": 44218624, "step": 76170 }, { "epoch": 11.345695561513256, "grad_norm": 1.902706265449524, "learning_rate": 2.349318226725783e-05, "loss": 0.6664, "num_input_tokens_seen": 44221568, "step": 76175 }, { "epoch": 11.346440274054215, "grad_norm": 1.1222864389419556, "learning_rate": 2.3489938766158354e-05, "loss": 0.6102, "num_input_tokens_seen": 44224704, "step": 76180 }, { "epoch": 11.347184986595174, "grad_norm": 1.6191097497940063, "learning_rate": 2.3486695290569838e-05, "loss": 0.5928, "num_input_tokens_seen": 44227584, "step": 76185 }, { "epoch": 11.347929699136133, "grad_norm": 0.9866864085197449, "learning_rate": 2.348345184054706e-05, "loss": 0.5565, "num_input_tokens_seen": 44230944, "step": 76190 }, { "epoch": 11.348674411677093, "grad_norm": 1.6007479429244995, "learning_rate": 2.3480208416144832e-05, "loss": 0.4982, "num_input_tokens_seen": 44233792, "step": 76195 }, { "epoch": 11.349419124218052, "grad_norm": 1.0244044065475464, "learning_rate": 2.3476965017417935e-05, "loss": 0.5303, "num_input_tokens_seen": 44236800, "step": 76200 }, { "epoch": 11.35016383675901, "grad_norm": 1.578082799911499, "learning_rate": 2.3473721644421155e-05, "loss": 0.789, "num_input_tokens_seen": 44239776, "step": 76205 }, { "epoch": 11.35090854929997, "grad_norm": 0.8857119679450989, "learning_rate": 2.3470478297209307e-05, "loss": 0.4958, "num_input_tokens_seen": 44242912, "step": 76210 }, { "epoch": 11.35165326184093, "grad_norm": 1.206400752067566, "learning_rate": 2.3467234975837162e-05, "loss": 0.5679, "num_input_tokens_seen": 44245856, "step": 76215 }, { "epoch": 11.352397974381889, "grad_norm": 1.5089894533157349, "learning_rate": 2.3463991680359536e-05, "loss": 0.5798, "num_input_tokens_seen": 44248928, "step": 76220 }, { "epoch": 11.353142686922848, "grad_norm": 2.5412747859954834, "learning_rate": 2.346074841083121e-05, "loss": 0.7065, "num_input_tokens_seen": 44251904, "step": 76225 }, { "epoch": 11.353887399463806, "grad_norm": 2.329346179962158, "learning_rate": 2.345750516730697e-05, "loss": 0.6402, "num_input_tokens_seen": 44254592, "step": 76230 }, { "epoch": 11.354632112004767, "grad_norm": 1.253358006477356, "learning_rate": 2.3454261949841622e-05, "loss": 0.7319, "num_input_tokens_seen": 44257760, "step": 76235 }, { "epoch": 11.355376824545726, "grad_norm": 1.1645482778549194, "learning_rate": 2.3451018758489932e-05, "loss": 0.568, "num_input_tokens_seen": 44260544, "step": 76240 }, { "epoch": 11.356121537086684, "grad_norm": 1.0881961584091187, "learning_rate": 2.3447775593306716e-05, "loss": 0.6801, "num_input_tokens_seen": 44263168, "step": 76245 }, { "epoch": 11.356866249627643, "grad_norm": 1.1961888074874878, "learning_rate": 2.3444532454346745e-05, "loss": 0.7171, "num_input_tokens_seen": 44265824, "step": 76250 }, { "epoch": 11.357610962168604, "grad_norm": 3.1922922134399414, "learning_rate": 2.3441289341664822e-05, "loss": 0.4361, "num_input_tokens_seen": 44268928, "step": 76255 }, { "epoch": 11.358355674709562, "grad_norm": 1.486328363418579, "learning_rate": 2.3438046255315735e-05, "loss": 0.7044, "num_input_tokens_seen": 44271712, "step": 76260 }, { "epoch": 11.359100387250521, "grad_norm": 1.5963140726089478, "learning_rate": 2.3434803195354268e-05, "loss": 0.6496, "num_input_tokens_seen": 44274592, "step": 76265 }, { "epoch": 11.35984509979148, "grad_norm": 1.5953280925750732, "learning_rate": 2.3431560161835204e-05, "loss": 0.6614, "num_input_tokens_seen": 44277696, "step": 76270 }, { "epoch": 11.36058981233244, "grad_norm": 1.4061578512191772, "learning_rate": 2.3428317154813344e-05, "loss": 0.6954, "num_input_tokens_seen": 44280832, "step": 76275 }, { "epoch": 11.3613345248734, "grad_norm": 2.1363704204559326, "learning_rate": 2.342507417434347e-05, "loss": 0.5681, "num_input_tokens_seen": 44283680, "step": 76280 }, { "epoch": 11.362079237414358, "grad_norm": 1.1367361545562744, "learning_rate": 2.3421831220480357e-05, "loss": 0.5511, "num_input_tokens_seen": 44286784, "step": 76285 }, { "epoch": 11.362823949955317, "grad_norm": 1.3169848918914795, "learning_rate": 2.341858829327881e-05, "loss": 0.5228, "num_input_tokens_seen": 44289440, "step": 76290 }, { "epoch": 11.363568662496277, "grad_norm": 1.277194619178772, "learning_rate": 2.34153453927936e-05, "loss": 0.5751, "num_input_tokens_seen": 44292352, "step": 76295 }, { "epoch": 11.364313375037236, "grad_norm": 1.140455722808838, "learning_rate": 2.341210251907953e-05, "loss": 0.5436, "num_input_tokens_seen": 44295360, "step": 76300 }, { "epoch": 11.365058087578195, "grad_norm": 2.5055673122406006, "learning_rate": 2.340885967219136e-05, "loss": 0.6943, "num_input_tokens_seen": 44298272, "step": 76305 }, { "epoch": 11.365802800119154, "grad_norm": 1.98235285282135, "learning_rate": 2.3405616852183902e-05, "loss": 0.9345, "num_input_tokens_seen": 44301472, "step": 76310 }, { "epoch": 11.366547512660112, "grad_norm": 1.655708909034729, "learning_rate": 2.3402374059111912e-05, "loss": 0.746, "num_input_tokens_seen": 44304800, "step": 76315 }, { "epoch": 11.367292225201073, "grad_norm": 1.571216344833374, "learning_rate": 2.3399131293030204e-05, "loss": 0.7141, "num_input_tokens_seen": 44307520, "step": 76320 }, { "epoch": 11.368036937742032, "grad_norm": 1.9965858459472656, "learning_rate": 2.339588855399354e-05, "loss": 0.7743, "num_input_tokens_seen": 44310528, "step": 76325 }, { "epoch": 11.36878165028299, "grad_norm": 1.2827404737472534, "learning_rate": 2.3392645842056707e-05, "loss": 0.5707, "num_input_tokens_seen": 44313344, "step": 76330 }, { "epoch": 11.36952636282395, "grad_norm": 0.9539183378219604, "learning_rate": 2.338940315727449e-05, "loss": 0.6326, "num_input_tokens_seen": 44316096, "step": 76335 }, { "epoch": 11.37027107536491, "grad_norm": 1.3424034118652344, "learning_rate": 2.3386160499701663e-05, "loss": 0.5977, "num_input_tokens_seen": 44318816, "step": 76340 }, { "epoch": 11.371015787905868, "grad_norm": 0.8863571286201477, "learning_rate": 2.3382917869393027e-05, "loss": 0.6194, "num_input_tokens_seen": 44321440, "step": 76345 }, { "epoch": 11.371760500446827, "grad_norm": 0.9671374559402466, "learning_rate": 2.3379675266403335e-05, "loss": 0.6704, "num_input_tokens_seen": 44324160, "step": 76350 }, { "epoch": 11.372505212987786, "grad_norm": 4.408517360687256, "learning_rate": 2.3376432690787396e-05, "loss": 0.8179, "num_input_tokens_seen": 44326944, "step": 76355 }, { "epoch": 11.373249925528746, "grad_norm": 1.8251798152923584, "learning_rate": 2.3373190142599973e-05, "loss": 0.739, "num_input_tokens_seen": 44329824, "step": 76360 }, { "epoch": 11.373994638069705, "grad_norm": 1.216212511062622, "learning_rate": 2.3369947621895845e-05, "loss": 0.8149, "num_input_tokens_seen": 44332544, "step": 76365 }, { "epoch": 11.374739350610664, "grad_norm": 1.9227977991104126, "learning_rate": 2.3366705128729805e-05, "loss": 0.612, "num_input_tokens_seen": 44335424, "step": 76370 }, { "epoch": 11.375484063151623, "grad_norm": 3.7609126567840576, "learning_rate": 2.3363462663156606e-05, "loss": 0.5519, "num_input_tokens_seen": 44338272, "step": 76375 }, { "epoch": 11.376228775692583, "grad_norm": 1.2978309392929077, "learning_rate": 2.3360220225231057e-05, "loss": 0.7498, "num_input_tokens_seen": 44341152, "step": 76380 }, { "epoch": 11.376973488233542, "grad_norm": 1.6051056385040283, "learning_rate": 2.335697781500791e-05, "loss": 0.4596, "num_input_tokens_seen": 44344192, "step": 76385 }, { "epoch": 11.3777182007745, "grad_norm": 1.1217936277389526, "learning_rate": 2.3353735432541957e-05, "loss": 0.6692, "num_input_tokens_seen": 44347264, "step": 76390 }, { "epoch": 11.37846291331546, "grad_norm": 2.026869535446167, "learning_rate": 2.335049307788797e-05, "loss": 0.6938, "num_input_tokens_seen": 44350304, "step": 76395 }, { "epoch": 11.37920762585642, "grad_norm": 1.071266531944275, "learning_rate": 2.334725075110073e-05, "loss": 0.502, "num_input_tokens_seen": 44353344, "step": 76400 }, { "epoch": 11.379952338397379, "grad_norm": 1.3641993999481201, "learning_rate": 2.3344008452235008e-05, "loss": 0.5815, "num_input_tokens_seen": 44356064, "step": 76405 }, { "epoch": 11.380697050938338, "grad_norm": 2.2653043270111084, "learning_rate": 2.3340766181345572e-05, "loss": 0.8215, "num_input_tokens_seen": 44358720, "step": 76410 }, { "epoch": 11.381441763479296, "grad_norm": 1.5786129236221313, "learning_rate": 2.3337523938487214e-05, "loss": 0.643, "num_input_tokens_seen": 44361696, "step": 76415 }, { "epoch": 11.382186476020257, "grad_norm": 1.6942375898361206, "learning_rate": 2.3334281723714694e-05, "loss": 0.6132, "num_input_tokens_seen": 44364480, "step": 76420 }, { "epoch": 11.382931188561216, "grad_norm": 1.3075505495071411, "learning_rate": 2.3331039537082796e-05, "loss": 0.5942, "num_input_tokens_seen": 44367264, "step": 76425 }, { "epoch": 11.383675901102174, "grad_norm": 1.009971022605896, "learning_rate": 2.332779737864628e-05, "loss": 0.6146, "num_input_tokens_seen": 44370304, "step": 76430 }, { "epoch": 11.384420613643133, "grad_norm": 0.9740890860557556, "learning_rate": 2.3324555248459938e-05, "loss": 0.6884, "num_input_tokens_seen": 44373088, "step": 76435 }, { "epoch": 11.385165326184094, "grad_norm": 1.1873931884765625, "learning_rate": 2.3321313146578532e-05, "loss": 0.5407, "num_input_tokens_seen": 44375936, "step": 76440 }, { "epoch": 11.385910038725052, "grad_norm": 0.9740415215492249, "learning_rate": 2.3318071073056826e-05, "loss": 0.6341, "num_input_tokens_seen": 44378848, "step": 76445 }, { "epoch": 11.386654751266011, "grad_norm": 1.6571515798568726, "learning_rate": 2.3314829027949606e-05, "loss": 0.7067, "num_input_tokens_seen": 44381568, "step": 76450 }, { "epoch": 11.38739946380697, "grad_norm": 1.3360224962234497, "learning_rate": 2.3311587011311634e-05, "loss": 0.6447, "num_input_tokens_seen": 44384448, "step": 76455 }, { "epoch": 11.38814417634793, "grad_norm": 1.570932388305664, "learning_rate": 2.330834502319769e-05, "loss": 0.6322, "num_input_tokens_seen": 44387360, "step": 76460 }, { "epoch": 11.38888888888889, "grad_norm": 2.987495183944702, "learning_rate": 2.3305103063662522e-05, "loss": 0.6306, "num_input_tokens_seen": 44390112, "step": 76465 }, { "epoch": 11.389633601429848, "grad_norm": 1.3238657712936401, "learning_rate": 2.330186113276093e-05, "loss": 0.5382, "num_input_tokens_seen": 44393408, "step": 76470 }, { "epoch": 11.390378313970807, "grad_norm": 1.6100594997406006, "learning_rate": 2.3298619230547656e-05, "loss": 0.6373, "num_input_tokens_seen": 44396768, "step": 76475 }, { "epoch": 11.391123026511767, "grad_norm": 3.0291452407836914, "learning_rate": 2.329537735707749e-05, "loss": 0.6473, "num_input_tokens_seen": 44399584, "step": 76480 }, { "epoch": 11.391867739052726, "grad_norm": 1.0488412380218506, "learning_rate": 2.3292135512405198e-05, "loss": 0.4826, "num_input_tokens_seen": 44402528, "step": 76485 }, { "epoch": 11.392612451593685, "grad_norm": 2.391836643218994, "learning_rate": 2.3288893696585528e-05, "loss": 0.5789, "num_input_tokens_seen": 44405568, "step": 76490 }, { "epoch": 11.393357164134644, "grad_norm": 1.0829451084136963, "learning_rate": 2.328565190967327e-05, "loss": 0.5855, "num_input_tokens_seen": 44408448, "step": 76495 }, { "epoch": 11.394101876675602, "grad_norm": 1.300572156906128, "learning_rate": 2.3282410151723167e-05, "loss": 0.554, "num_input_tokens_seen": 44411552, "step": 76500 }, { "epoch": 11.394846589216563, "grad_norm": 0.8917226195335388, "learning_rate": 2.327916842279001e-05, "loss": 0.4685, "num_input_tokens_seen": 44414496, "step": 76505 }, { "epoch": 11.395591301757522, "grad_norm": 2.833047389984131, "learning_rate": 2.3275926722928542e-05, "loss": 0.644, "num_input_tokens_seen": 44417280, "step": 76510 }, { "epoch": 11.39633601429848, "grad_norm": 0.8470327258110046, "learning_rate": 2.327268505219355e-05, "loss": 0.5692, "num_input_tokens_seen": 44420160, "step": 76515 }, { "epoch": 11.39708072683944, "grad_norm": 0.9204630851745605, "learning_rate": 2.326944341063979e-05, "loss": 0.4813, "num_input_tokens_seen": 44423136, "step": 76520 }, { "epoch": 11.3978254393804, "grad_norm": 0.9309818148612976, "learning_rate": 2.326620179832202e-05, "loss": 0.4956, "num_input_tokens_seen": 44426112, "step": 76525 }, { "epoch": 11.398570151921358, "grad_norm": 1.6798803806304932, "learning_rate": 2.3262960215295014e-05, "loss": 0.4172, "num_input_tokens_seen": 44429056, "step": 76530 }, { "epoch": 11.399314864462317, "grad_norm": 1.7889404296875, "learning_rate": 2.3259718661613518e-05, "loss": 0.647, "num_input_tokens_seen": 44432032, "step": 76535 }, { "epoch": 11.400059577003276, "grad_norm": 0.9949190616607666, "learning_rate": 2.3256477137332315e-05, "loss": 0.7619, "num_input_tokens_seen": 44435008, "step": 76540 }, { "epoch": 11.400804289544237, "grad_norm": 0.9322001934051514, "learning_rate": 2.325323564250615e-05, "loss": 0.5469, "num_input_tokens_seen": 44437632, "step": 76545 }, { "epoch": 11.401549002085195, "grad_norm": 1.0874541997909546, "learning_rate": 2.324999417718981e-05, "loss": 0.5434, "num_input_tokens_seen": 44440352, "step": 76550 }, { "epoch": 11.402293714626154, "grad_norm": 1.0737636089324951, "learning_rate": 2.3246752741438026e-05, "loss": 0.658, "num_input_tokens_seen": 44443232, "step": 76555 }, { "epoch": 11.403038427167113, "grad_norm": 1.5174851417541504, "learning_rate": 2.324351133530558e-05, "loss": 0.5827, "num_input_tokens_seen": 44445856, "step": 76560 }, { "epoch": 11.403783139708073, "grad_norm": 1.7408649921417236, "learning_rate": 2.3240269958847226e-05, "loss": 0.6728, "num_input_tokens_seen": 44448672, "step": 76565 }, { "epoch": 11.404527852249032, "grad_norm": 0.7602663636207581, "learning_rate": 2.3237028612117712e-05, "loss": 0.5676, "num_input_tokens_seen": 44451360, "step": 76570 }, { "epoch": 11.40527256478999, "grad_norm": 1.148305892944336, "learning_rate": 2.3233787295171818e-05, "loss": 0.4835, "num_input_tokens_seen": 44454112, "step": 76575 }, { "epoch": 11.40601727733095, "grad_norm": 1.5372689962387085, "learning_rate": 2.323054600806428e-05, "loss": 0.501, "num_input_tokens_seen": 44456832, "step": 76580 }, { "epoch": 11.40676198987191, "grad_norm": 1.562712550163269, "learning_rate": 2.322730475084988e-05, "loss": 0.7505, "num_input_tokens_seen": 44459680, "step": 76585 }, { "epoch": 11.407506702412869, "grad_norm": 3.0803561210632324, "learning_rate": 2.3224063523583363e-05, "loss": 0.7225, "num_input_tokens_seen": 44462784, "step": 76590 }, { "epoch": 11.408251414953828, "grad_norm": 0.9647995233535767, "learning_rate": 2.322082232631949e-05, "loss": 0.4283, "num_input_tokens_seen": 44465664, "step": 76595 }, { "epoch": 11.408996127494786, "grad_norm": 1.4672505855560303, "learning_rate": 2.3217581159113016e-05, "loss": 0.6959, "num_input_tokens_seen": 44468416, "step": 76600 }, { "epoch": 11.409740840035747, "grad_norm": 1.3712116479873657, "learning_rate": 2.3214340022018688e-05, "loss": 0.7018, "num_input_tokens_seen": 44471008, "step": 76605 }, { "epoch": 11.410485552576706, "grad_norm": 1.3604415655136108, "learning_rate": 2.321109891509128e-05, "loss": 0.662, "num_input_tokens_seen": 44474016, "step": 76610 }, { "epoch": 11.411230265117664, "grad_norm": 0.7887731790542603, "learning_rate": 2.3207857838385524e-05, "loss": 0.5303, "num_input_tokens_seen": 44476832, "step": 76615 }, { "epoch": 11.411974977658623, "grad_norm": 1.2213016748428345, "learning_rate": 2.32046167919562e-05, "loss": 0.6916, "num_input_tokens_seen": 44479840, "step": 76620 }, { "epoch": 11.412719690199584, "grad_norm": 1.1072092056274414, "learning_rate": 2.320137577585805e-05, "loss": 0.5841, "num_input_tokens_seen": 44482848, "step": 76625 }, { "epoch": 11.413464402740543, "grad_norm": 1.7206332683563232, "learning_rate": 2.319813479014583e-05, "loss": 0.4155, "num_input_tokens_seen": 44485696, "step": 76630 }, { "epoch": 11.414209115281501, "grad_norm": 1.3199492692947388, "learning_rate": 2.319489383487428e-05, "loss": 0.5779, "num_input_tokens_seen": 44488608, "step": 76635 }, { "epoch": 11.41495382782246, "grad_norm": 1.0544044971466064, "learning_rate": 2.3191652910098174e-05, "loss": 0.6703, "num_input_tokens_seen": 44491424, "step": 76640 }, { "epoch": 11.41569854036342, "grad_norm": 1.44871985912323, "learning_rate": 2.3188412015872258e-05, "loss": 0.5564, "num_input_tokens_seen": 44494144, "step": 76645 }, { "epoch": 11.41644325290438, "grad_norm": 1.7391830682754517, "learning_rate": 2.3185171152251265e-05, "loss": 0.4858, "num_input_tokens_seen": 44496992, "step": 76650 }, { "epoch": 11.417187965445338, "grad_norm": 1.7287592887878418, "learning_rate": 2.3181930319289975e-05, "loss": 0.5733, "num_input_tokens_seen": 44499872, "step": 76655 }, { "epoch": 11.417932677986297, "grad_norm": 1.4521960020065308, "learning_rate": 2.3178689517043116e-05, "loss": 0.794, "num_input_tokens_seen": 44502816, "step": 76660 }, { "epoch": 11.418677390527257, "grad_norm": 0.9483959078788757, "learning_rate": 2.3175448745565454e-05, "loss": 0.717, "num_input_tokens_seen": 44505536, "step": 76665 }, { "epoch": 11.419422103068216, "grad_norm": 1.7819567918777466, "learning_rate": 2.317220800491172e-05, "loss": 0.5773, "num_input_tokens_seen": 44508992, "step": 76670 }, { "epoch": 11.420166815609175, "grad_norm": 1.0617519617080688, "learning_rate": 2.3168967295136685e-05, "loss": 0.5685, "num_input_tokens_seen": 44511776, "step": 76675 }, { "epoch": 11.420911528150134, "grad_norm": 1.8836743831634521, "learning_rate": 2.3165726616295083e-05, "loss": 0.6144, "num_input_tokens_seen": 44514528, "step": 76680 }, { "epoch": 11.421656240691092, "grad_norm": 1.861181616783142, "learning_rate": 2.316248596844166e-05, "loss": 0.5371, "num_input_tokens_seen": 44517600, "step": 76685 }, { "epoch": 11.422400953232053, "grad_norm": 1.2834595441818237, "learning_rate": 2.3159245351631176e-05, "loss": 0.6753, "num_input_tokens_seen": 44520576, "step": 76690 }, { "epoch": 11.423145665773012, "grad_norm": 0.7605897188186646, "learning_rate": 2.315600476591837e-05, "loss": 0.6029, "num_input_tokens_seen": 44523360, "step": 76695 }, { "epoch": 11.42389037831397, "grad_norm": 1.7624232769012451, "learning_rate": 2.3152764211357988e-05, "loss": 0.7755, "num_input_tokens_seen": 44526144, "step": 76700 }, { "epoch": 11.42463509085493, "grad_norm": 1.1871633529663086, "learning_rate": 2.314952368800477e-05, "loss": 0.5558, "num_input_tokens_seen": 44528960, "step": 76705 }, { "epoch": 11.42537980339589, "grad_norm": 1.927158236503601, "learning_rate": 2.3146283195913482e-05, "loss": 0.5734, "num_input_tokens_seen": 44532128, "step": 76710 }, { "epoch": 11.426124515936849, "grad_norm": 1.3406833410263062, "learning_rate": 2.3143042735138848e-05, "loss": 0.4976, "num_input_tokens_seen": 44535136, "step": 76715 }, { "epoch": 11.426869228477807, "grad_norm": 2.2133030891418457, "learning_rate": 2.3139802305735618e-05, "loss": 0.665, "num_input_tokens_seen": 44538176, "step": 76720 }, { "epoch": 11.427613941018766, "grad_norm": 2.128943920135498, "learning_rate": 2.3136561907758543e-05, "loss": 0.3848, "num_input_tokens_seen": 44541024, "step": 76725 }, { "epoch": 11.428358653559727, "grad_norm": 0.8700162768363953, "learning_rate": 2.3133321541262356e-05, "loss": 0.4603, "num_input_tokens_seen": 44543840, "step": 76730 }, { "epoch": 11.429103366100685, "grad_norm": 1.0207215547561646, "learning_rate": 2.3130081206301812e-05, "loss": 0.6221, "num_input_tokens_seen": 44547008, "step": 76735 }, { "epoch": 11.429848078641644, "grad_norm": 1.2076956033706665, "learning_rate": 2.3126840902931633e-05, "loss": 0.6083, "num_input_tokens_seen": 44549632, "step": 76740 }, { "epoch": 11.430592791182603, "grad_norm": 1.8627148866653442, "learning_rate": 2.312360063120658e-05, "loss": 0.8425, "num_input_tokens_seen": 44552608, "step": 76745 }, { "epoch": 11.431337503723563, "grad_norm": 1.4244623184204102, "learning_rate": 2.3120360391181388e-05, "loss": 0.5412, "num_input_tokens_seen": 44555392, "step": 76750 }, { "epoch": 11.432082216264522, "grad_norm": 1.411399245262146, "learning_rate": 2.31171201829108e-05, "loss": 0.5658, "num_input_tokens_seen": 44558400, "step": 76755 }, { "epoch": 11.432826928805481, "grad_norm": 0.8267415761947632, "learning_rate": 2.3113880006449547e-05, "loss": 0.7809, "num_input_tokens_seen": 44561216, "step": 76760 }, { "epoch": 11.43357164134644, "grad_norm": 0.7922597527503967, "learning_rate": 2.3110639861852373e-05, "loss": 0.481, "num_input_tokens_seen": 44563936, "step": 76765 }, { "epoch": 11.4343163538874, "grad_norm": 2.4932596683502197, "learning_rate": 2.3107399749174027e-05, "loss": 0.6729, "num_input_tokens_seen": 44566816, "step": 76770 }, { "epoch": 11.435061066428359, "grad_norm": 0.9479501247406006, "learning_rate": 2.3104159668469226e-05, "loss": 0.4932, "num_input_tokens_seen": 44569536, "step": 76775 }, { "epoch": 11.435805778969318, "grad_norm": 2.351254940032959, "learning_rate": 2.3100919619792733e-05, "loss": 0.6315, "num_input_tokens_seen": 44572448, "step": 76780 }, { "epoch": 11.436550491510276, "grad_norm": 3.2489962577819824, "learning_rate": 2.3097679603199267e-05, "loss": 0.6379, "num_input_tokens_seen": 44575264, "step": 76785 }, { "epoch": 11.437295204051237, "grad_norm": 1.115281343460083, "learning_rate": 2.3094439618743572e-05, "loss": 0.5955, "num_input_tokens_seen": 44578112, "step": 76790 }, { "epoch": 11.438039916592196, "grad_norm": 1.9744614362716675, "learning_rate": 2.3091199666480377e-05, "loss": 0.6427, "num_input_tokens_seen": 44580896, "step": 76795 }, { "epoch": 11.438784629133155, "grad_norm": 1.4559458494186401, "learning_rate": 2.3087959746464432e-05, "loss": 0.4635, "num_input_tokens_seen": 44583936, "step": 76800 }, { "epoch": 11.439529341674113, "grad_norm": 1.098252534866333, "learning_rate": 2.3084719858750464e-05, "loss": 0.4926, "num_input_tokens_seen": 44586944, "step": 76805 }, { "epoch": 11.440274054215074, "grad_norm": 1.3933396339416504, "learning_rate": 2.3081480003393198e-05, "loss": 0.7172, "num_input_tokens_seen": 44589920, "step": 76810 }, { "epoch": 11.441018766756033, "grad_norm": 1.8076063394546509, "learning_rate": 2.3078240180447384e-05, "loss": 0.658, "num_input_tokens_seen": 44592992, "step": 76815 }, { "epoch": 11.441763479296991, "grad_norm": 1.5052249431610107, "learning_rate": 2.307500038996775e-05, "loss": 0.4516, "num_input_tokens_seen": 44596000, "step": 76820 }, { "epoch": 11.44250819183795, "grad_norm": 3.066650152206421, "learning_rate": 2.3071760632009028e-05, "loss": 0.7081, "num_input_tokens_seen": 44598720, "step": 76825 }, { "epoch": 11.443252904378909, "grad_norm": 1.0987167358398438, "learning_rate": 2.3068520906625943e-05, "loss": 0.5911, "num_input_tokens_seen": 44601440, "step": 76830 }, { "epoch": 11.44399761691987, "grad_norm": 1.0085850954055786, "learning_rate": 2.306528121387324e-05, "loss": 0.6278, "num_input_tokens_seen": 44604608, "step": 76835 }, { "epoch": 11.444742329460828, "grad_norm": 2.7270703315734863, "learning_rate": 2.306204155380565e-05, "loss": 0.7143, "num_input_tokens_seen": 44607744, "step": 76840 }, { "epoch": 11.445487042001787, "grad_norm": 1.4840757846832275, "learning_rate": 2.3058801926477885e-05, "loss": 0.5381, "num_input_tokens_seen": 44610528, "step": 76845 }, { "epoch": 11.446231754542746, "grad_norm": 1.41255521774292, "learning_rate": 2.3055562331944703e-05, "loss": 0.651, "num_input_tokens_seen": 44613184, "step": 76850 }, { "epoch": 11.446976467083706, "grad_norm": 2.12540602684021, "learning_rate": 2.3052322770260808e-05, "loss": 0.6014, "num_input_tokens_seen": 44616288, "step": 76855 }, { "epoch": 11.447721179624665, "grad_norm": 1.7953689098358154, "learning_rate": 2.3049083241480948e-05, "loss": 0.6719, "num_input_tokens_seen": 44619104, "step": 76860 }, { "epoch": 11.448465892165624, "grad_norm": 2.2422358989715576, "learning_rate": 2.3045843745659834e-05, "loss": 0.5521, "num_input_tokens_seen": 44621792, "step": 76865 }, { "epoch": 11.449210604706582, "grad_norm": 1.1664118766784668, "learning_rate": 2.3042604282852215e-05, "loss": 0.6385, "num_input_tokens_seen": 44624832, "step": 76870 }, { "epoch": 11.449955317247543, "grad_norm": 1.095004916191101, "learning_rate": 2.3039364853112794e-05, "loss": 0.5735, "num_input_tokens_seen": 44627552, "step": 76875 }, { "epoch": 11.450700029788502, "grad_norm": 1.3692737817764282, "learning_rate": 2.3036125456496324e-05, "loss": 0.6172, "num_input_tokens_seen": 44630592, "step": 76880 }, { "epoch": 11.45144474232946, "grad_norm": 1.861688494682312, "learning_rate": 2.303288609305752e-05, "loss": 0.521, "num_input_tokens_seen": 44633472, "step": 76885 }, { "epoch": 11.45218945487042, "grad_norm": 2.124342679977417, "learning_rate": 2.3029646762851096e-05, "loss": 0.5668, "num_input_tokens_seen": 44636224, "step": 76890 }, { "epoch": 11.45293416741138, "grad_norm": 1.201228380203247, "learning_rate": 2.3026407465931797e-05, "loss": 0.5863, "num_input_tokens_seen": 44639104, "step": 76895 }, { "epoch": 11.453678879952339, "grad_norm": 1.9328126907348633, "learning_rate": 2.3023168202354324e-05, "loss": 0.4198, "num_input_tokens_seen": 44642112, "step": 76900 }, { "epoch": 11.454423592493297, "grad_norm": 1.8145169019699097, "learning_rate": 2.301992897217343e-05, "loss": 0.6544, "num_input_tokens_seen": 44644896, "step": 76905 }, { "epoch": 11.455168305034256, "grad_norm": 1.6249547004699707, "learning_rate": 2.3016689775443806e-05, "loss": 0.6827, "num_input_tokens_seen": 44647776, "step": 76910 }, { "epoch": 11.455913017575217, "grad_norm": 3.163717269897461, "learning_rate": 2.3013450612220207e-05, "loss": 0.6087, "num_input_tokens_seen": 44650912, "step": 76915 }, { "epoch": 11.456657730116175, "grad_norm": 2.319687843322754, "learning_rate": 2.3010211482557335e-05, "loss": 0.6471, "num_input_tokens_seen": 44653728, "step": 76920 }, { "epoch": 11.457402442657134, "grad_norm": 1.252105712890625, "learning_rate": 2.3006972386509925e-05, "loss": 0.5931, "num_input_tokens_seen": 44656832, "step": 76925 }, { "epoch": 11.458147155198093, "grad_norm": 2.7985596656799316, "learning_rate": 2.3003733324132693e-05, "loss": 0.6863, "num_input_tokens_seen": 44659936, "step": 76930 }, { "epoch": 11.458891867739053, "grad_norm": 1.0269216299057007, "learning_rate": 2.300049429548034e-05, "loss": 0.7377, "num_input_tokens_seen": 44662656, "step": 76935 }, { "epoch": 11.459636580280012, "grad_norm": 0.7856981754302979, "learning_rate": 2.299725530060762e-05, "loss": 0.7047, "num_input_tokens_seen": 44665248, "step": 76940 }, { "epoch": 11.460381292820971, "grad_norm": 2.2748589515686035, "learning_rate": 2.2994016339569224e-05, "loss": 0.7816, "num_input_tokens_seen": 44668128, "step": 76945 }, { "epoch": 11.46112600536193, "grad_norm": 2.957597017288208, "learning_rate": 2.2990777412419892e-05, "loss": 0.4608, "num_input_tokens_seen": 44671072, "step": 76950 }, { "epoch": 11.46187071790289, "grad_norm": 1.6549291610717773, "learning_rate": 2.298753851921433e-05, "loss": 0.7618, "num_input_tokens_seen": 44674240, "step": 76955 }, { "epoch": 11.462615430443849, "grad_norm": 1.4786120653152466, "learning_rate": 2.2984299660007263e-05, "loss": 0.5639, "num_input_tokens_seen": 44677280, "step": 76960 }, { "epoch": 11.463360142984808, "grad_norm": 2.157832622528076, "learning_rate": 2.2981060834853406e-05, "loss": 0.7121, "num_input_tokens_seen": 44680288, "step": 76965 }, { "epoch": 11.464104855525767, "grad_norm": 1.5752328634262085, "learning_rate": 2.2977822043807466e-05, "loss": 0.5765, "num_input_tokens_seen": 44683040, "step": 76970 }, { "epoch": 11.464849568066727, "grad_norm": 2.6645522117614746, "learning_rate": 2.2974583286924176e-05, "loss": 0.5735, "num_input_tokens_seen": 44686080, "step": 76975 }, { "epoch": 11.465594280607686, "grad_norm": 1.1167867183685303, "learning_rate": 2.297134456425823e-05, "loss": 0.5198, "num_input_tokens_seen": 44689088, "step": 76980 }, { "epoch": 11.466338993148645, "grad_norm": 0.617072343826294, "learning_rate": 2.2968105875864368e-05, "loss": 0.4001, "num_input_tokens_seen": 44691840, "step": 76985 }, { "epoch": 11.467083705689603, "grad_norm": 0.7917807698249817, "learning_rate": 2.2964867221797286e-05, "loss": 0.5767, "num_input_tokens_seen": 44694944, "step": 76990 }, { "epoch": 11.467828418230564, "grad_norm": 1.0598421096801758, "learning_rate": 2.296162860211171e-05, "loss": 0.7583, "num_input_tokens_seen": 44697664, "step": 76995 }, { "epoch": 11.468573130771523, "grad_norm": 2.308677911758423, "learning_rate": 2.2958390016862335e-05, "loss": 0.6213, "num_input_tokens_seen": 44700672, "step": 77000 }, { "epoch": 11.469317843312481, "grad_norm": 1.2713836431503296, "learning_rate": 2.29551514661039e-05, "loss": 0.5091, "num_input_tokens_seen": 44703488, "step": 77005 }, { "epoch": 11.47006255585344, "grad_norm": 2.2156949043273926, "learning_rate": 2.2951912949891098e-05, "loss": 0.5941, "num_input_tokens_seen": 44706624, "step": 77010 }, { "epoch": 11.470807268394399, "grad_norm": 1.0193002223968506, "learning_rate": 2.294867446827864e-05, "loss": 0.5995, "num_input_tokens_seen": 44709440, "step": 77015 }, { "epoch": 11.47155198093536, "grad_norm": 1.5119482278823853, "learning_rate": 2.294543602132125e-05, "loss": 0.5175, "num_input_tokens_seen": 44712192, "step": 77020 }, { "epoch": 11.472296693476318, "grad_norm": 1.3179181814193726, "learning_rate": 2.2942197609073624e-05, "loss": 0.4906, "num_input_tokens_seen": 44715424, "step": 77025 }, { "epoch": 11.473041406017277, "grad_norm": 1.7634432315826416, "learning_rate": 2.2938959231590483e-05, "loss": 0.6096, "num_input_tokens_seen": 44718400, "step": 77030 }, { "epoch": 11.473786118558236, "grad_norm": 3.498040199279785, "learning_rate": 2.2935720888926522e-05, "loss": 0.8494, "num_input_tokens_seen": 44721504, "step": 77035 }, { "epoch": 11.474530831099196, "grad_norm": 1.5029188394546509, "learning_rate": 2.2932482581136466e-05, "loss": 0.6459, "num_input_tokens_seen": 44724352, "step": 77040 }, { "epoch": 11.475275543640155, "grad_norm": 1.8674027919769287, "learning_rate": 2.292924430827502e-05, "loss": 0.5792, "num_input_tokens_seen": 44727296, "step": 77045 }, { "epoch": 11.476020256181114, "grad_norm": 1.9082026481628418, "learning_rate": 2.292600607039687e-05, "loss": 0.7318, "num_input_tokens_seen": 44730176, "step": 77050 }, { "epoch": 11.476764968722073, "grad_norm": 1.5491094589233398, "learning_rate": 2.2922767867556755e-05, "loss": 0.7324, "num_input_tokens_seen": 44733184, "step": 77055 }, { "epoch": 11.477509681263033, "grad_norm": 0.7909924387931824, "learning_rate": 2.291952969980936e-05, "loss": 0.641, "num_input_tokens_seen": 44735872, "step": 77060 }, { "epoch": 11.478254393803992, "grad_norm": 1.3701151609420776, "learning_rate": 2.29162915672094e-05, "loss": 0.7535, "num_input_tokens_seen": 44738784, "step": 77065 }, { "epoch": 11.47899910634495, "grad_norm": 0.996371865272522, "learning_rate": 2.2913053469811568e-05, "loss": 0.644, "num_input_tokens_seen": 44741888, "step": 77070 }, { "epoch": 11.47974381888591, "grad_norm": 1.2177708148956299, "learning_rate": 2.2909815407670584e-05, "loss": 0.6476, "num_input_tokens_seen": 44745024, "step": 77075 }, { "epoch": 11.48048853142687, "grad_norm": 1.1045851707458496, "learning_rate": 2.2906577380841143e-05, "loss": 0.5893, "num_input_tokens_seen": 44748288, "step": 77080 }, { "epoch": 11.481233243967829, "grad_norm": 1.4575377702713013, "learning_rate": 2.290333938937795e-05, "loss": 0.4475, "num_input_tokens_seen": 44751168, "step": 77085 }, { "epoch": 11.481977956508787, "grad_norm": 1.1944464445114136, "learning_rate": 2.2900101433335704e-05, "loss": 0.4408, "num_input_tokens_seen": 44754240, "step": 77090 }, { "epoch": 11.482722669049746, "grad_norm": 1.3932490348815918, "learning_rate": 2.289686351276911e-05, "loss": 0.6434, "num_input_tokens_seen": 44756736, "step": 77095 }, { "epoch": 11.483467381590707, "grad_norm": 1.0650393962860107, "learning_rate": 2.2893625627732877e-05, "loss": 0.5949, "num_input_tokens_seen": 44759744, "step": 77100 }, { "epoch": 11.484212094131665, "grad_norm": 0.672256350517273, "learning_rate": 2.2890387778281686e-05, "loss": 0.5723, "num_input_tokens_seen": 44762688, "step": 77105 }, { "epoch": 11.484956806672624, "grad_norm": 1.374049425125122, "learning_rate": 2.2887149964470258e-05, "loss": 0.6123, "num_input_tokens_seen": 44765440, "step": 77110 }, { "epoch": 11.485701519213583, "grad_norm": 1.0485233068466187, "learning_rate": 2.2883912186353282e-05, "loss": 0.4593, "num_input_tokens_seen": 44768256, "step": 77115 }, { "epoch": 11.486446231754543, "grad_norm": 1.4449652433395386, "learning_rate": 2.288067444398546e-05, "loss": 0.6282, "num_input_tokens_seen": 44771136, "step": 77120 }, { "epoch": 11.487190944295502, "grad_norm": 1.2980282306671143, "learning_rate": 2.2877436737421494e-05, "loss": 0.4961, "num_input_tokens_seen": 44774240, "step": 77125 }, { "epoch": 11.487935656836461, "grad_norm": 1.6792434453964233, "learning_rate": 2.287419906671606e-05, "loss": 0.4851, "num_input_tokens_seen": 44776832, "step": 77130 }, { "epoch": 11.48868036937742, "grad_norm": 0.9088075757026672, "learning_rate": 2.287096143192389e-05, "loss": 0.43, "num_input_tokens_seen": 44779424, "step": 77135 }, { "epoch": 11.48942508191838, "grad_norm": 1.282557725906372, "learning_rate": 2.286772383309965e-05, "loss": 0.7089, "num_input_tokens_seen": 44782272, "step": 77140 }, { "epoch": 11.490169794459339, "grad_norm": 1.2064096927642822, "learning_rate": 2.286448627029806e-05, "loss": 0.5567, "num_input_tokens_seen": 44785280, "step": 77145 }, { "epoch": 11.490914507000298, "grad_norm": 2.881573438644409, "learning_rate": 2.2861248743573794e-05, "loss": 0.6592, "num_input_tokens_seen": 44788160, "step": 77150 }, { "epoch": 11.491659219541257, "grad_norm": 2.043971061706543, "learning_rate": 2.2858011252981566e-05, "loss": 0.6299, "num_input_tokens_seen": 44790848, "step": 77155 }, { "epoch": 11.492403932082217, "grad_norm": 1.078136682510376, "learning_rate": 2.285477379857605e-05, "loss": 0.5762, "num_input_tokens_seen": 44793952, "step": 77160 }, { "epoch": 11.493148644623176, "grad_norm": 1.90054190158844, "learning_rate": 2.2851536380411958e-05, "loss": 0.5506, "num_input_tokens_seen": 44796864, "step": 77165 }, { "epoch": 11.493893357164135, "grad_norm": 2.134110689163208, "learning_rate": 2.284829899854398e-05, "loss": 0.6657, "num_input_tokens_seen": 44799872, "step": 77170 }, { "epoch": 11.494638069705093, "grad_norm": 2.5523319244384766, "learning_rate": 2.284506165302679e-05, "loss": 0.7256, "num_input_tokens_seen": 44802688, "step": 77175 }, { "epoch": 11.495382782246054, "grad_norm": 3.5980429649353027, "learning_rate": 2.2841824343915103e-05, "loss": 0.4727, "num_input_tokens_seen": 44805440, "step": 77180 }, { "epoch": 11.496127494787013, "grad_norm": 1.3006759881973267, "learning_rate": 2.28385870712636e-05, "loss": 0.5685, "num_input_tokens_seen": 44808352, "step": 77185 }, { "epoch": 11.496872207327971, "grad_norm": 1.8396589756011963, "learning_rate": 2.283534983512697e-05, "loss": 0.5308, "num_input_tokens_seen": 44811136, "step": 77190 }, { "epoch": 11.49761691986893, "grad_norm": 1.1747115850448608, "learning_rate": 2.2832112635559897e-05, "loss": 0.715, "num_input_tokens_seen": 44813984, "step": 77195 }, { "epoch": 11.498361632409889, "grad_norm": 2.0295915603637695, "learning_rate": 2.282887547261709e-05, "loss": 0.5462, "num_input_tokens_seen": 44816544, "step": 77200 }, { "epoch": 11.49910634495085, "grad_norm": 1.4304367303848267, "learning_rate": 2.2825638346353223e-05, "loss": 0.7396, "num_input_tokens_seen": 44819488, "step": 77205 }, { "epoch": 11.499851057491808, "grad_norm": 1.3873592615127563, "learning_rate": 2.2822401256822974e-05, "loss": 0.7193, "num_input_tokens_seen": 44822432, "step": 77210 }, { "epoch": 11.500595770032767, "grad_norm": 1.6869926452636719, "learning_rate": 2.2819164204081057e-05, "loss": 0.5028, "num_input_tokens_seen": 44825408, "step": 77215 }, { "epoch": 11.501340482573726, "grad_norm": 1.216892957687378, "learning_rate": 2.281592718818214e-05, "loss": 0.465, "num_input_tokens_seen": 44828256, "step": 77220 }, { "epoch": 11.502085195114686, "grad_norm": 1.2999709844589233, "learning_rate": 2.2812690209180914e-05, "loss": 0.647, "num_input_tokens_seen": 44831360, "step": 77225 }, { "epoch": 11.502829907655645, "grad_norm": 3.1891918182373047, "learning_rate": 2.2809453267132054e-05, "loss": 0.6851, "num_input_tokens_seen": 44834176, "step": 77230 }, { "epoch": 11.503574620196604, "grad_norm": 1.4796165227890015, "learning_rate": 2.2806216362090267e-05, "loss": 0.8561, "num_input_tokens_seen": 44836864, "step": 77235 }, { "epoch": 11.504319332737563, "grad_norm": 0.8947727084159851, "learning_rate": 2.2802979494110213e-05, "loss": 0.4625, "num_input_tokens_seen": 44839680, "step": 77240 }, { "epoch": 11.505064045278523, "grad_norm": 1.210906982421875, "learning_rate": 2.27997426632466e-05, "loss": 0.6565, "num_input_tokens_seen": 44842912, "step": 77245 }, { "epoch": 11.505808757819482, "grad_norm": 1.6504151821136475, "learning_rate": 2.2796505869554098e-05, "loss": 0.7468, "num_input_tokens_seen": 44846016, "step": 77250 }, { "epoch": 11.50655347036044, "grad_norm": 1.0460039377212524, "learning_rate": 2.2793269113087385e-05, "loss": 0.5694, "num_input_tokens_seen": 44849120, "step": 77255 }, { "epoch": 11.5072981829014, "grad_norm": 1.0409997701644897, "learning_rate": 2.279003239390115e-05, "loss": 0.5586, "num_input_tokens_seen": 44852000, "step": 77260 }, { "epoch": 11.50804289544236, "grad_norm": 1.5218596458435059, "learning_rate": 2.2786795712050065e-05, "loss": 0.5626, "num_input_tokens_seen": 44854912, "step": 77265 }, { "epoch": 11.508787607983319, "grad_norm": 0.8565995097160339, "learning_rate": 2.2783559067588822e-05, "loss": 0.4183, "num_input_tokens_seen": 44857760, "step": 77270 }, { "epoch": 11.509532320524277, "grad_norm": 1.4401309490203857, "learning_rate": 2.278032246057209e-05, "loss": 0.6265, "num_input_tokens_seen": 44860512, "step": 77275 }, { "epoch": 11.510277033065236, "grad_norm": 3.0724074840545654, "learning_rate": 2.2777085891054566e-05, "loss": 0.5558, "num_input_tokens_seen": 44863680, "step": 77280 }, { "epoch": 11.511021745606197, "grad_norm": 1.2147594690322876, "learning_rate": 2.277384935909091e-05, "loss": 0.5142, "num_input_tokens_seen": 44866528, "step": 77285 }, { "epoch": 11.511766458147155, "grad_norm": 2.407681465148926, "learning_rate": 2.277061286473581e-05, "loss": 0.7745, "num_input_tokens_seen": 44869568, "step": 77290 }, { "epoch": 11.512511170688114, "grad_norm": 1.3761134147644043, "learning_rate": 2.2767376408043935e-05, "loss": 0.6673, "num_input_tokens_seen": 44872608, "step": 77295 }, { "epoch": 11.513255883229073, "grad_norm": 1.4783557653427124, "learning_rate": 2.2764139989069962e-05, "loss": 0.5737, "num_input_tokens_seen": 44875680, "step": 77300 }, { "epoch": 11.514000595770034, "grad_norm": 1.905415654182434, "learning_rate": 2.276090360786858e-05, "loss": 0.6144, "num_input_tokens_seen": 44878400, "step": 77305 }, { "epoch": 11.514745308310992, "grad_norm": 0.8809472918510437, "learning_rate": 2.2757667264494448e-05, "loss": 0.6109, "num_input_tokens_seen": 44881184, "step": 77310 }, { "epoch": 11.515490020851951, "grad_norm": 1.1937321424484253, "learning_rate": 2.275443095900226e-05, "loss": 0.7166, "num_input_tokens_seen": 44884384, "step": 77315 }, { "epoch": 11.51623473339291, "grad_norm": 1.7698109149932861, "learning_rate": 2.2751194691446666e-05, "loss": 0.493, "num_input_tokens_seen": 44887264, "step": 77320 }, { "epoch": 11.51697944593387, "grad_norm": 2.734750747680664, "learning_rate": 2.2747958461882365e-05, "loss": 0.6063, "num_input_tokens_seen": 44889952, "step": 77325 }, { "epoch": 11.517724158474829, "grad_norm": 1.0682493448257446, "learning_rate": 2.2744722270364012e-05, "loss": 0.6938, "num_input_tokens_seen": 44893152, "step": 77330 }, { "epoch": 11.518468871015788, "grad_norm": 1.970271110534668, "learning_rate": 2.274148611694628e-05, "loss": 0.6104, "num_input_tokens_seen": 44896128, "step": 77335 }, { "epoch": 11.519213583556747, "grad_norm": 2.4332075119018555, "learning_rate": 2.2738250001683846e-05, "loss": 0.6365, "num_input_tokens_seen": 44898976, "step": 77340 }, { "epoch": 11.519958296097705, "grad_norm": 1.4168306589126587, "learning_rate": 2.2735013924631378e-05, "loss": 0.5909, "num_input_tokens_seen": 44901536, "step": 77345 }, { "epoch": 11.520703008638666, "grad_norm": 0.784457802772522, "learning_rate": 2.273177788584355e-05, "loss": 0.5291, "num_input_tokens_seen": 44904256, "step": 77350 }, { "epoch": 11.521447721179625, "grad_norm": 2.037010431289673, "learning_rate": 2.272854188537503e-05, "loss": 0.7606, "num_input_tokens_seen": 44907104, "step": 77355 }, { "epoch": 11.522192433720583, "grad_norm": 1.2771496772766113, "learning_rate": 2.272530592328049e-05, "loss": 0.475, "num_input_tokens_seen": 44909856, "step": 77360 }, { "epoch": 11.522937146261544, "grad_norm": 1.4732801914215088, "learning_rate": 2.272206999961459e-05, "loss": 0.6891, "num_input_tokens_seen": 44912928, "step": 77365 }, { "epoch": 11.523681858802503, "grad_norm": 0.7448680400848389, "learning_rate": 2.2718834114432e-05, "loss": 0.4248, "num_input_tokens_seen": 44915648, "step": 77370 }, { "epoch": 11.524426571343461, "grad_norm": 2.0177690982818604, "learning_rate": 2.2715598267787394e-05, "loss": 0.564, "num_input_tokens_seen": 44918496, "step": 77375 }, { "epoch": 11.52517128388442, "grad_norm": 1.5392228364944458, "learning_rate": 2.2712362459735425e-05, "loss": 0.5817, "num_input_tokens_seen": 44921312, "step": 77380 }, { "epoch": 11.525915996425379, "grad_norm": 0.9870915412902832, "learning_rate": 2.2709126690330778e-05, "loss": 0.5806, "num_input_tokens_seen": 44924160, "step": 77385 }, { "epoch": 11.52666070896634, "grad_norm": 0.935734212398529, "learning_rate": 2.27058909596281e-05, "loss": 0.5314, "num_input_tokens_seen": 44926944, "step": 77390 }, { "epoch": 11.527405421507298, "grad_norm": 1.85843026638031, "learning_rate": 2.2702655267682068e-05, "loss": 0.684, "num_input_tokens_seen": 44929856, "step": 77395 }, { "epoch": 11.528150134048257, "grad_norm": 1.13259756565094, "learning_rate": 2.2699419614547333e-05, "loss": 0.5871, "num_input_tokens_seen": 44932832, "step": 77400 }, { "epoch": 11.528894846589216, "grad_norm": 0.8300651907920837, "learning_rate": 2.2696184000278573e-05, "loss": 0.7791, "num_input_tokens_seen": 44936064, "step": 77405 }, { "epoch": 11.529639559130176, "grad_norm": 1.5258114337921143, "learning_rate": 2.2692948424930445e-05, "loss": 0.5402, "num_input_tokens_seen": 44938944, "step": 77410 }, { "epoch": 11.530384271671135, "grad_norm": 2.1058006286621094, "learning_rate": 2.2689712888557603e-05, "loss": 0.5663, "num_input_tokens_seen": 44941504, "step": 77415 }, { "epoch": 11.531128984212094, "grad_norm": 0.8088816404342651, "learning_rate": 2.268647739121471e-05, "loss": 0.5794, "num_input_tokens_seen": 44944352, "step": 77420 }, { "epoch": 11.531873696753053, "grad_norm": 1.2557930946350098, "learning_rate": 2.2683241932956432e-05, "loss": 0.5543, "num_input_tokens_seen": 44947072, "step": 77425 }, { "epoch": 11.532618409294013, "grad_norm": 1.3686541318893433, "learning_rate": 2.2680006513837436e-05, "loss": 0.5355, "num_input_tokens_seen": 44950208, "step": 77430 }, { "epoch": 11.533363121834972, "grad_norm": 0.9289689064025879, "learning_rate": 2.2676771133912355e-05, "loss": 0.4077, "num_input_tokens_seen": 44953024, "step": 77435 }, { "epoch": 11.53410783437593, "grad_norm": 1.2752273082733154, "learning_rate": 2.2673535793235877e-05, "loss": 0.693, "num_input_tokens_seen": 44955872, "step": 77440 }, { "epoch": 11.53485254691689, "grad_norm": 1.3422863483428955, "learning_rate": 2.2670300491862646e-05, "loss": 0.5582, "num_input_tokens_seen": 44958720, "step": 77445 }, { "epoch": 11.53559725945785, "grad_norm": 3.168160915374756, "learning_rate": 2.2667065229847323e-05, "loss": 0.5504, "num_input_tokens_seen": 44961408, "step": 77450 }, { "epoch": 11.536341971998809, "grad_norm": 1.547667384147644, "learning_rate": 2.266383000724456e-05, "loss": 0.7047, "num_input_tokens_seen": 44964288, "step": 77455 }, { "epoch": 11.537086684539767, "grad_norm": 2.095167875289917, "learning_rate": 2.2660594824109008e-05, "loss": 0.5419, "num_input_tokens_seen": 44967040, "step": 77460 }, { "epoch": 11.537831397080726, "grad_norm": 1.3817799091339111, "learning_rate": 2.2657359680495335e-05, "loss": 0.5378, "num_input_tokens_seen": 44969952, "step": 77465 }, { "epoch": 11.538576109621687, "grad_norm": 1.441217064857483, "learning_rate": 2.2654124576458182e-05, "loss": 0.5967, "num_input_tokens_seen": 44972896, "step": 77470 }, { "epoch": 11.539320822162646, "grad_norm": 1.2743695974349976, "learning_rate": 2.265088951205222e-05, "loss": 0.4843, "num_input_tokens_seen": 44975840, "step": 77475 }, { "epoch": 11.540065534703604, "grad_norm": 1.236093521118164, "learning_rate": 2.2647654487332086e-05, "loss": 0.5192, "num_input_tokens_seen": 44978688, "step": 77480 }, { "epoch": 11.540810247244563, "grad_norm": 1.8488457202911377, "learning_rate": 2.2644419502352444e-05, "loss": 0.7125, "num_input_tokens_seen": 44981600, "step": 77485 }, { "epoch": 11.541554959785524, "grad_norm": 1.4531818628311157, "learning_rate": 2.264118455716794e-05, "loss": 0.6085, "num_input_tokens_seen": 44984544, "step": 77490 }, { "epoch": 11.542299672326482, "grad_norm": 0.9092240333557129, "learning_rate": 2.2637949651833218e-05, "loss": 0.5384, "num_input_tokens_seen": 44987232, "step": 77495 }, { "epoch": 11.543044384867441, "grad_norm": 1.2699198722839355, "learning_rate": 2.2634714786402942e-05, "loss": 0.54, "num_input_tokens_seen": 44990272, "step": 77500 }, { "epoch": 11.5437890974084, "grad_norm": 1.1591007709503174, "learning_rate": 2.2631479960931747e-05, "loss": 0.59, "num_input_tokens_seen": 44993472, "step": 77505 }, { "epoch": 11.54453380994936, "grad_norm": 1.767467975616455, "learning_rate": 2.26282451754743e-05, "loss": 0.5436, "num_input_tokens_seen": 44996256, "step": 77510 }, { "epoch": 11.54527852249032, "grad_norm": 1.106595516204834, "learning_rate": 2.262501043008524e-05, "loss": 0.5224, "num_input_tokens_seen": 44999232, "step": 77515 }, { "epoch": 11.546023235031278, "grad_norm": 1.2199997901916504, "learning_rate": 2.2621775724819218e-05, "loss": 0.6402, "num_input_tokens_seen": 45001952, "step": 77520 }, { "epoch": 11.546767947572237, "grad_norm": 1.6297656297683716, "learning_rate": 2.2618541059730862e-05, "loss": 0.664, "num_input_tokens_seen": 45004864, "step": 77525 }, { "epoch": 11.547512660113195, "grad_norm": 1.4866665601730347, "learning_rate": 2.2615306434874853e-05, "loss": 0.5414, "num_input_tokens_seen": 45007584, "step": 77530 }, { "epoch": 11.548257372654156, "grad_norm": 1.2542572021484375, "learning_rate": 2.2612071850305812e-05, "loss": 0.6605, "num_input_tokens_seen": 45010464, "step": 77535 }, { "epoch": 11.549002085195115, "grad_norm": 1.4583051204681396, "learning_rate": 2.2608837306078385e-05, "loss": 0.5711, "num_input_tokens_seen": 45013504, "step": 77540 }, { "epoch": 11.549746797736073, "grad_norm": 1.248940348625183, "learning_rate": 2.2605602802247227e-05, "loss": 0.6993, "num_input_tokens_seen": 45016544, "step": 77545 }, { "epoch": 11.550491510277032, "grad_norm": 1.3684003353118896, "learning_rate": 2.2602368338866974e-05, "loss": 0.6326, "num_input_tokens_seen": 45019456, "step": 77550 }, { "epoch": 11.551236222817993, "grad_norm": 1.0399888753890991, "learning_rate": 2.2599133915992273e-05, "loss": 0.5393, "num_input_tokens_seen": 45022464, "step": 77555 }, { "epoch": 11.551980935358952, "grad_norm": 2.9868996143341064, "learning_rate": 2.2595899533677756e-05, "loss": 0.6098, "num_input_tokens_seen": 45025664, "step": 77560 }, { "epoch": 11.55272564789991, "grad_norm": 3.017005443572998, "learning_rate": 2.2592665191978085e-05, "loss": 0.8074, "num_input_tokens_seen": 45028512, "step": 77565 }, { "epoch": 11.553470360440869, "grad_norm": 0.9578922390937805, "learning_rate": 2.2589430890947885e-05, "loss": 0.4815, "num_input_tokens_seen": 45031584, "step": 77570 }, { "epoch": 11.55421507298183, "grad_norm": 1.2705093622207642, "learning_rate": 2.2586196630641792e-05, "loss": 0.6926, "num_input_tokens_seen": 45034560, "step": 77575 }, { "epoch": 11.554959785522788, "grad_norm": 1.2383270263671875, "learning_rate": 2.2582962411114464e-05, "loss": 0.4871, "num_input_tokens_seen": 45037504, "step": 77580 }, { "epoch": 11.555704498063747, "grad_norm": 1.8343420028686523, "learning_rate": 2.2579728232420525e-05, "loss": 0.5876, "num_input_tokens_seen": 45040352, "step": 77585 }, { "epoch": 11.556449210604706, "grad_norm": 1.099657416343689, "learning_rate": 2.2576494094614624e-05, "loss": 0.525, "num_input_tokens_seen": 45043328, "step": 77590 }, { "epoch": 11.557193923145666, "grad_norm": 2.4335877895355225, "learning_rate": 2.257325999775138e-05, "loss": 0.9543, "num_input_tokens_seen": 45047488, "step": 77595 }, { "epoch": 11.557938635686625, "grad_norm": 1.0951061248779297, "learning_rate": 2.257002594188545e-05, "loss": 0.6678, "num_input_tokens_seen": 45050336, "step": 77600 }, { "epoch": 11.558683348227584, "grad_norm": 1.2154030799865723, "learning_rate": 2.2566791927071453e-05, "loss": 0.5949, "num_input_tokens_seen": 45053472, "step": 77605 }, { "epoch": 11.559428060768543, "grad_norm": 0.5418617725372314, "learning_rate": 2.2563557953364043e-05, "loss": 0.6235, "num_input_tokens_seen": 45056192, "step": 77610 }, { "epoch": 11.560172773309503, "grad_norm": 1.2141156196594238, "learning_rate": 2.256032402081785e-05, "loss": 0.6457, "num_input_tokens_seen": 45058880, "step": 77615 }, { "epoch": 11.560917485850462, "grad_norm": 1.3548493385314941, "learning_rate": 2.2557090129487493e-05, "loss": 0.6021, "num_input_tokens_seen": 45061664, "step": 77620 }, { "epoch": 11.56166219839142, "grad_norm": 1.4494611024856567, "learning_rate": 2.2553856279427625e-05, "loss": 0.5968, "num_input_tokens_seen": 45064416, "step": 77625 }, { "epoch": 11.56240691093238, "grad_norm": 0.7733591198921204, "learning_rate": 2.2550622470692852e-05, "loss": 0.5238, "num_input_tokens_seen": 45067232, "step": 77630 }, { "epoch": 11.56315162347334, "grad_norm": 1.0550568103790283, "learning_rate": 2.2547388703337837e-05, "loss": 0.68, "num_input_tokens_seen": 45070112, "step": 77635 }, { "epoch": 11.563896336014299, "grad_norm": 1.4775705337524414, "learning_rate": 2.2544154977417187e-05, "loss": 0.6635, "num_input_tokens_seen": 45073440, "step": 77640 }, { "epoch": 11.564641048555258, "grad_norm": 1.1387003660202026, "learning_rate": 2.2540921292985553e-05, "loss": 0.5218, "num_input_tokens_seen": 45076416, "step": 77645 }, { "epoch": 11.565385761096216, "grad_norm": 1.017824411392212, "learning_rate": 2.2537687650097554e-05, "loss": 0.5683, "num_input_tokens_seen": 45079424, "step": 77650 }, { "epoch": 11.566130473637177, "grad_norm": 1.2414530515670776, "learning_rate": 2.2534454048807814e-05, "loss": 0.6631, "num_input_tokens_seen": 45082368, "step": 77655 }, { "epoch": 11.566875186178136, "grad_norm": 1.2339690923690796, "learning_rate": 2.2531220489170977e-05, "loss": 0.6667, "num_input_tokens_seen": 45085280, "step": 77660 }, { "epoch": 11.567619898719094, "grad_norm": 1.8311176300048828, "learning_rate": 2.2527986971241642e-05, "loss": 0.5029, "num_input_tokens_seen": 45088416, "step": 77665 }, { "epoch": 11.568364611260053, "grad_norm": 2.1614162921905518, "learning_rate": 2.252475349507447e-05, "loss": 0.7445, "num_input_tokens_seen": 45091520, "step": 77670 }, { "epoch": 11.569109323801012, "grad_norm": 0.7313522696495056, "learning_rate": 2.2521520060724062e-05, "loss": 0.6781, "num_input_tokens_seen": 45094368, "step": 77675 }, { "epoch": 11.569854036341972, "grad_norm": 1.578077793121338, "learning_rate": 2.251828666824506e-05, "loss": 0.7454, "num_input_tokens_seen": 45097184, "step": 77680 }, { "epoch": 11.570598748882931, "grad_norm": 2.2246274948120117, "learning_rate": 2.2515053317692082e-05, "loss": 0.5725, "num_input_tokens_seen": 45100000, "step": 77685 }, { "epoch": 11.57134346142389, "grad_norm": 1.6857293844223022, "learning_rate": 2.2511820009119755e-05, "loss": 0.7462, "num_input_tokens_seen": 45102848, "step": 77690 }, { "epoch": 11.57208817396485, "grad_norm": 1.2338529825210571, "learning_rate": 2.25085867425827e-05, "loss": 0.5504, "num_input_tokens_seen": 45105472, "step": 77695 }, { "epoch": 11.57283288650581, "grad_norm": 1.6031217575073242, "learning_rate": 2.2505353518135534e-05, "loss": 0.5487, "num_input_tokens_seen": 45108384, "step": 77700 }, { "epoch": 11.573577599046768, "grad_norm": 1.442355990409851, "learning_rate": 2.250212033583289e-05, "loss": 0.428, "num_input_tokens_seen": 45111680, "step": 77705 }, { "epoch": 11.574322311587727, "grad_norm": 1.098618745803833, "learning_rate": 2.2498887195729375e-05, "loss": 0.4849, "num_input_tokens_seen": 45114304, "step": 77710 }, { "epoch": 11.575067024128685, "grad_norm": 1.2012845277786255, "learning_rate": 2.2495654097879627e-05, "loss": 0.3746, "num_input_tokens_seen": 45117088, "step": 77715 }, { "epoch": 11.575811736669646, "grad_norm": 1.3407074213027954, "learning_rate": 2.2492421042338257e-05, "loss": 0.6121, "num_input_tokens_seen": 45120000, "step": 77720 }, { "epoch": 11.576556449210605, "grad_norm": 2.1964738368988037, "learning_rate": 2.2489188029159887e-05, "loss": 0.5817, "num_input_tokens_seen": 45122976, "step": 77725 }, { "epoch": 11.577301161751564, "grad_norm": 1.0833414793014526, "learning_rate": 2.2485955058399135e-05, "loss": 0.5856, "num_input_tokens_seen": 45125952, "step": 77730 }, { "epoch": 11.578045874292522, "grad_norm": 1.5507720708847046, "learning_rate": 2.2482722130110608e-05, "loss": 0.7572, "num_input_tokens_seen": 45128736, "step": 77735 }, { "epoch": 11.578790586833483, "grad_norm": 1.579810380935669, "learning_rate": 2.2479489244348938e-05, "loss": 0.645, "num_input_tokens_seen": 45131840, "step": 77740 }, { "epoch": 11.579535299374442, "grad_norm": 1.3981324434280396, "learning_rate": 2.2476256401168736e-05, "loss": 0.7462, "num_input_tokens_seen": 45135296, "step": 77745 }, { "epoch": 11.5802800119154, "grad_norm": 2.9558310508728027, "learning_rate": 2.247302360062461e-05, "loss": 0.6848, "num_input_tokens_seen": 45138240, "step": 77750 }, { "epoch": 11.581024724456359, "grad_norm": 1.2567332983016968, "learning_rate": 2.246979084277119e-05, "loss": 0.3864, "num_input_tokens_seen": 45141088, "step": 77755 }, { "epoch": 11.58176943699732, "grad_norm": 1.360577940940857, "learning_rate": 2.2466558127663086e-05, "loss": 0.5564, "num_input_tokens_seen": 45144288, "step": 77760 }, { "epoch": 11.582514149538278, "grad_norm": 2.012937068939209, "learning_rate": 2.246332545535489e-05, "loss": 0.5802, "num_input_tokens_seen": 45147264, "step": 77765 }, { "epoch": 11.583258862079237, "grad_norm": 1.121293306350708, "learning_rate": 2.246009282590125e-05, "loss": 0.6135, "num_input_tokens_seen": 45150176, "step": 77770 }, { "epoch": 11.584003574620196, "grad_norm": 1.2963541746139526, "learning_rate": 2.2456860239356755e-05, "loss": 0.5344, "num_input_tokens_seen": 45152992, "step": 77775 }, { "epoch": 11.584748287161156, "grad_norm": 1.3919123411178589, "learning_rate": 2.245362769577602e-05, "loss": 0.7443, "num_input_tokens_seen": 45155808, "step": 77780 }, { "epoch": 11.585492999702115, "grad_norm": 1.9013360738754272, "learning_rate": 2.245039519521366e-05, "loss": 0.9331, "num_input_tokens_seen": 45158528, "step": 77785 }, { "epoch": 11.586237712243074, "grad_norm": 1.3927081823349, "learning_rate": 2.2447162737724274e-05, "loss": 0.6258, "num_input_tokens_seen": 45161280, "step": 77790 }, { "epoch": 11.586982424784033, "grad_norm": 1.1593103408813477, "learning_rate": 2.2443930323362487e-05, "loss": 0.7414, "num_input_tokens_seen": 45164160, "step": 77795 }, { "epoch": 11.587727137324993, "grad_norm": 1.4907915592193604, "learning_rate": 2.244069795218289e-05, "loss": 0.7288, "num_input_tokens_seen": 45167200, "step": 77800 }, { "epoch": 11.588471849865952, "grad_norm": 1.2044306993484497, "learning_rate": 2.243746562424011e-05, "loss": 0.6669, "num_input_tokens_seen": 45169856, "step": 77805 }, { "epoch": 11.58921656240691, "grad_norm": 1.4545762538909912, "learning_rate": 2.2434233339588746e-05, "loss": 0.5113, "num_input_tokens_seen": 45172544, "step": 77810 }, { "epoch": 11.58996127494787, "grad_norm": 1.1516060829162598, "learning_rate": 2.2431001098283393e-05, "loss": 0.6993, "num_input_tokens_seen": 45175616, "step": 77815 }, { "epoch": 11.59070598748883, "grad_norm": 2.4867053031921387, "learning_rate": 2.2427768900378674e-05, "loss": 0.5816, "num_input_tokens_seen": 45178560, "step": 77820 }, { "epoch": 11.591450700029789, "grad_norm": 1.6813825368881226, "learning_rate": 2.2424536745929174e-05, "loss": 0.4989, "num_input_tokens_seen": 45181376, "step": 77825 }, { "epoch": 11.592195412570748, "grad_norm": 1.8976635932922363, "learning_rate": 2.2421304634989517e-05, "loss": 0.575, "num_input_tokens_seen": 45184160, "step": 77830 }, { "epoch": 11.592940125111706, "grad_norm": 1.6954715251922607, "learning_rate": 2.2418072567614286e-05, "loss": 0.5384, "num_input_tokens_seen": 45187104, "step": 77835 }, { "epoch": 11.593684837652667, "grad_norm": 1.254178762435913, "learning_rate": 2.241484054385811e-05, "loss": 0.7201, "num_input_tokens_seen": 45189984, "step": 77840 }, { "epoch": 11.594429550193626, "grad_norm": 2.26596736907959, "learning_rate": 2.2411608563775564e-05, "loss": 0.6272, "num_input_tokens_seen": 45192832, "step": 77845 }, { "epoch": 11.595174262734584, "grad_norm": 0.9834962487220764, "learning_rate": 2.240837662742127e-05, "loss": 0.5368, "num_input_tokens_seen": 45195680, "step": 77850 }, { "epoch": 11.595918975275543, "grad_norm": 1.7420024871826172, "learning_rate": 2.240514473484982e-05, "loss": 0.7558, "num_input_tokens_seen": 45198432, "step": 77855 }, { "epoch": 11.596663687816502, "grad_norm": 1.568476915359497, "learning_rate": 2.24019128861158e-05, "loss": 0.7838, "num_input_tokens_seen": 45201152, "step": 77860 }, { "epoch": 11.597408400357462, "grad_norm": 1.6333156824111938, "learning_rate": 2.2398681081273832e-05, "loss": 0.5202, "num_input_tokens_seen": 45204288, "step": 77865 }, { "epoch": 11.598153112898421, "grad_norm": 3.2307732105255127, "learning_rate": 2.239544932037849e-05, "loss": 0.5963, "num_input_tokens_seen": 45207360, "step": 77870 }, { "epoch": 11.59889782543938, "grad_norm": 1.4513705968856812, "learning_rate": 2.2392217603484397e-05, "loss": 0.6669, "num_input_tokens_seen": 45210528, "step": 77875 }, { "epoch": 11.59964253798034, "grad_norm": 2.1688785552978516, "learning_rate": 2.2388985930646135e-05, "loss": 0.5251, "num_input_tokens_seen": 45213024, "step": 77880 }, { "epoch": 11.6003872505213, "grad_norm": 1.631469964981079, "learning_rate": 2.2385754301918303e-05, "loss": 0.6774, "num_input_tokens_seen": 45215776, "step": 77885 }, { "epoch": 11.601131963062258, "grad_norm": 1.0925424098968506, "learning_rate": 2.2382522717355498e-05, "loss": 0.5545, "num_input_tokens_seen": 45218880, "step": 77890 }, { "epoch": 11.601876675603217, "grad_norm": 1.8927149772644043, "learning_rate": 2.2379291177012295e-05, "loss": 0.7589, "num_input_tokens_seen": 45222112, "step": 77895 }, { "epoch": 11.602621388144176, "grad_norm": 1.7475197315216064, "learning_rate": 2.2376059680943324e-05, "loss": 0.6885, "num_input_tokens_seen": 45225024, "step": 77900 }, { "epoch": 11.603366100685136, "grad_norm": 3.738121747970581, "learning_rate": 2.237282822920314e-05, "loss": 0.5838, "num_input_tokens_seen": 45227776, "step": 77905 }, { "epoch": 11.604110813226095, "grad_norm": 1.5779224634170532, "learning_rate": 2.2369596821846367e-05, "loss": 0.5296, "num_input_tokens_seen": 45230656, "step": 77910 }, { "epoch": 11.604855525767054, "grad_norm": 1.594212293624878, "learning_rate": 2.2366365458927574e-05, "loss": 0.6348, "num_input_tokens_seen": 45233440, "step": 77915 }, { "epoch": 11.605600238308012, "grad_norm": 1.1626768112182617, "learning_rate": 2.236313414050137e-05, "loss": 0.5485, "num_input_tokens_seen": 45236416, "step": 77920 }, { "epoch": 11.606344950848973, "grad_norm": 1.3987020254135132, "learning_rate": 2.2359902866622317e-05, "loss": 0.6926, "num_input_tokens_seen": 45239424, "step": 77925 }, { "epoch": 11.607089663389932, "grad_norm": 1.0987249612808228, "learning_rate": 2.2356671637345038e-05, "loss": 0.6382, "num_input_tokens_seen": 45242336, "step": 77930 }, { "epoch": 11.60783437593089, "grad_norm": 1.0262248516082764, "learning_rate": 2.2353440452724102e-05, "loss": 0.5909, "num_input_tokens_seen": 45245824, "step": 77935 }, { "epoch": 11.60857908847185, "grad_norm": 1.454290747642517, "learning_rate": 2.235020931281409e-05, "loss": 0.5286, "num_input_tokens_seen": 45248864, "step": 77940 }, { "epoch": 11.60932380101281, "grad_norm": 1.458928108215332, "learning_rate": 2.2346978217669613e-05, "loss": 0.5037, "num_input_tokens_seen": 45251584, "step": 77945 }, { "epoch": 11.610068513553768, "grad_norm": 1.4408022165298462, "learning_rate": 2.2343747167345233e-05, "loss": 0.6836, "num_input_tokens_seen": 45254624, "step": 77950 }, { "epoch": 11.610813226094727, "grad_norm": 2.878761053085327, "learning_rate": 2.2340516161895553e-05, "loss": 0.5888, "num_input_tokens_seen": 45257600, "step": 77955 }, { "epoch": 11.611557938635686, "grad_norm": 1.8427534103393555, "learning_rate": 2.2337285201375137e-05, "loss": 0.7208, "num_input_tokens_seen": 45260320, "step": 77960 }, { "epoch": 11.612302651176647, "grad_norm": 1.8741168975830078, "learning_rate": 2.233405428583859e-05, "loss": 0.4896, "num_input_tokens_seen": 45263136, "step": 77965 }, { "epoch": 11.613047363717605, "grad_norm": 1.012225866317749, "learning_rate": 2.233082341534049e-05, "loss": 0.6361, "num_input_tokens_seen": 45266112, "step": 77970 }, { "epoch": 11.613792076258564, "grad_norm": 2.4944286346435547, "learning_rate": 2.2327592589935403e-05, "loss": 0.6694, "num_input_tokens_seen": 45269248, "step": 77975 }, { "epoch": 11.614536788799523, "grad_norm": 1.3482006788253784, "learning_rate": 2.2324361809677933e-05, "loss": 0.7054, "num_input_tokens_seen": 45272224, "step": 77980 }, { "epoch": 11.615281501340483, "grad_norm": 2.8354334831237793, "learning_rate": 2.2321131074622647e-05, "loss": 0.7, "num_input_tokens_seen": 45275040, "step": 77985 }, { "epoch": 11.616026213881442, "grad_norm": 1.275518774986267, "learning_rate": 2.2317900384824132e-05, "loss": 0.6567, "num_input_tokens_seen": 45277920, "step": 77990 }, { "epoch": 11.6167709264224, "grad_norm": 1.5116196870803833, "learning_rate": 2.2314669740336957e-05, "loss": 0.5241, "num_input_tokens_seen": 45280960, "step": 77995 }, { "epoch": 11.61751563896336, "grad_norm": 1.8027466535568237, "learning_rate": 2.2311439141215715e-05, "loss": 0.617, "num_input_tokens_seen": 45283936, "step": 78000 }, { "epoch": 11.61826035150432, "grad_norm": 1.4949520826339722, "learning_rate": 2.2308208587514967e-05, "loss": 0.7053, "num_input_tokens_seen": 45286752, "step": 78005 }, { "epoch": 11.619005064045279, "grad_norm": 2.8143930435180664, "learning_rate": 2.230497807928931e-05, "loss": 0.6953, "num_input_tokens_seen": 45289856, "step": 78010 }, { "epoch": 11.619749776586238, "grad_norm": 1.5310578346252441, "learning_rate": 2.2301747616593306e-05, "loss": 0.6095, "num_input_tokens_seen": 45292704, "step": 78015 }, { "epoch": 11.620494489127196, "grad_norm": 1.7448097467422485, "learning_rate": 2.2298517199481534e-05, "loss": 0.5136, "num_input_tokens_seen": 45295616, "step": 78020 }, { "epoch": 11.621239201668157, "grad_norm": 2.971632719039917, "learning_rate": 2.2295286828008572e-05, "loss": 0.6091, "num_input_tokens_seen": 45298560, "step": 78025 }, { "epoch": 11.621983914209116, "grad_norm": 1.3059134483337402, "learning_rate": 2.2292056502228975e-05, "loss": 0.6411, "num_input_tokens_seen": 45301472, "step": 78030 }, { "epoch": 11.622728626750074, "grad_norm": 1.7011957168579102, "learning_rate": 2.2288826222197346e-05, "loss": 0.5799, "num_input_tokens_seen": 45304224, "step": 78035 }, { "epoch": 11.623473339291033, "grad_norm": 1.109559178352356, "learning_rate": 2.228559598796823e-05, "loss": 0.5326, "num_input_tokens_seen": 45307296, "step": 78040 }, { "epoch": 11.624218051831992, "grad_norm": 0.5105577111244202, "learning_rate": 2.2282365799596222e-05, "loss": 0.6232, "num_input_tokens_seen": 45310176, "step": 78045 }, { "epoch": 11.624962764372953, "grad_norm": 3.134207248687744, "learning_rate": 2.2279135657135876e-05, "loss": 0.7313, "num_input_tokens_seen": 45313280, "step": 78050 }, { "epoch": 11.625707476913911, "grad_norm": 1.5628011226654053, "learning_rate": 2.2275905560641775e-05, "loss": 0.6008, "num_input_tokens_seen": 45316224, "step": 78055 }, { "epoch": 11.62645218945487, "grad_norm": 1.067043423652649, "learning_rate": 2.2272675510168482e-05, "loss": 0.6856, "num_input_tokens_seen": 45319040, "step": 78060 }, { "epoch": 11.627196901995829, "grad_norm": 1.9480681419372559, "learning_rate": 2.226944550577055e-05, "loss": 0.3922, "num_input_tokens_seen": 45321696, "step": 78065 }, { "epoch": 11.62794161453679, "grad_norm": 2.035693407058716, "learning_rate": 2.2266215547502573e-05, "loss": 0.5863, "num_input_tokens_seen": 45324608, "step": 78070 }, { "epoch": 11.628686327077748, "grad_norm": 0.7050447463989258, "learning_rate": 2.22629856354191e-05, "loss": 0.6212, "num_input_tokens_seen": 45327904, "step": 78075 }, { "epoch": 11.629431039618707, "grad_norm": 1.2226080894470215, "learning_rate": 2.22597557695747e-05, "loss": 0.5855, "num_input_tokens_seen": 45330720, "step": 78080 }, { "epoch": 11.630175752159666, "grad_norm": 1.8072001934051514, "learning_rate": 2.225652595002395e-05, "loss": 0.4509, "num_input_tokens_seen": 45333568, "step": 78085 }, { "epoch": 11.630920464700626, "grad_norm": 1.704514503479004, "learning_rate": 2.2253296176821402e-05, "loss": 0.7887, "num_input_tokens_seen": 45336256, "step": 78090 }, { "epoch": 11.631665177241585, "grad_norm": 1.41038978099823, "learning_rate": 2.2250066450021628e-05, "loss": 0.6656, "num_input_tokens_seen": 45338976, "step": 78095 }, { "epoch": 11.632409889782544, "grad_norm": 2.484832525253296, "learning_rate": 2.2246836769679175e-05, "loss": 0.692, "num_input_tokens_seen": 45341728, "step": 78100 }, { "epoch": 11.633154602323502, "grad_norm": 2.2484967708587646, "learning_rate": 2.2243607135848625e-05, "loss": 0.5267, "num_input_tokens_seen": 45344704, "step": 78105 }, { "epoch": 11.633899314864463, "grad_norm": 1.287279725074768, "learning_rate": 2.2240377548584532e-05, "loss": 0.6084, "num_input_tokens_seen": 45347648, "step": 78110 }, { "epoch": 11.634644027405422, "grad_norm": 3.8766908645629883, "learning_rate": 2.2237148007941455e-05, "loss": 0.6231, "num_input_tokens_seen": 45350688, "step": 78115 }, { "epoch": 11.63538873994638, "grad_norm": 1.1547000408172607, "learning_rate": 2.2233918513973944e-05, "loss": 0.5265, "num_input_tokens_seen": 45353632, "step": 78120 }, { "epoch": 11.63613345248734, "grad_norm": 2.218356132507324, "learning_rate": 2.223068906673658e-05, "loss": 0.6915, "num_input_tokens_seen": 45356320, "step": 78125 }, { "epoch": 11.6368781650283, "grad_norm": 1.0769368410110474, "learning_rate": 2.22274596662839e-05, "loss": 0.636, "num_input_tokens_seen": 45358880, "step": 78130 }, { "epoch": 11.637622877569259, "grad_norm": 1.0733318328857422, "learning_rate": 2.222423031267048e-05, "loss": 0.7441, "num_input_tokens_seen": 45361760, "step": 78135 }, { "epoch": 11.638367590110217, "grad_norm": 1.9546176195144653, "learning_rate": 2.222100100595087e-05, "loss": 0.7318, "num_input_tokens_seen": 45364544, "step": 78140 }, { "epoch": 11.639112302651176, "grad_norm": 0.9715448617935181, "learning_rate": 2.221777174617962e-05, "loss": 0.5247, "num_input_tokens_seen": 45367616, "step": 78145 }, { "epoch": 11.639857015192137, "grad_norm": 1.6996561288833618, "learning_rate": 2.221454253341129e-05, "loss": 0.6269, "num_input_tokens_seen": 45370400, "step": 78150 }, { "epoch": 11.640601727733095, "grad_norm": 1.3236273527145386, "learning_rate": 2.2211313367700422e-05, "loss": 0.5162, "num_input_tokens_seen": 45373152, "step": 78155 }, { "epoch": 11.641346440274054, "grad_norm": 0.9418445229530334, "learning_rate": 2.2208084249101593e-05, "loss": 0.4465, "num_input_tokens_seen": 45376064, "step": 78160 }, { "epoch": 11.642091152815013, "grad_norm": 2.4174344539642334, "learning_rate": 2.220485517766933e-05, "loss": 0.4261, "num_input_tokens_seen": 45379072, "step": 78165 }, { "epoch": 11.642835865355973, "grad_norm": 1.3875422477722168, "learning_rate": 2.220162615345821e-05, "loss": 0.5208, "num_input_tokens_seen": 45382016, "step": 78170 }, { "epoch": 11.643580577896932, "grad_norm": 0.8191459774971008, "learning_rate": 2.2198397176522773e-05, "loss": 0.4261, "num_input_tokens_seen": 45384960, "step": 78175 }, { "epoch": 11.64432529043789, "grad_norm": 2.2066190242767334, "learning_rate": 2.2195168246917564e-05, "loss": 0.5585, "num_input_tokens_seen": 45387744, "step": 78180 }, { "epoch": 11.64507000297885, "grad_norm": 1.21259343624115, "learning_rate": 2.219193936469714e-05, "loss": 0.6456, "num_input_tokens_seen": 45390496, "step": 78185 }, { "epoch": 11.64581471551981, "grad_norm": 3.0633134841918945, "learning_rate": 2.2188710529916033e-05, "loss": 0.5954, "num_input_tokens_seen": 45393568, "step": 78190 }, { "epoch": 11.646559428060769, "grad_norm": 1.579140543937683, "learning_rate": 2.218548174262882e-05, "loss": 0.5928, "num_input_tokens_seen": 45396480, "step": 78195 }, { "epoch": 11.647304140601728, "grad_norm": 1.2816354036331177, "learning_rate": 2.218225300289002e-05, "loss": 0.5617, "num_input_tokens_seen": 45399360, "step": 78200 }, { "epoch": 11.648048853142686, "grad_norm": 1.5663325786590576, "learning_rate": 2.21790243107542e-05, "loss": 0.6625, "num_input_tokens_seen": 45402464, "step": 78205 }, { "epoch": 11.648793565683647, "grad_norm": 0.8528845310211182, "learning_rate": 2.2175795666275894e-05, "loss": 0.511, "num_input_tokens_seen": 45405472, "step": 78210 }, { "epoch": 11.649538278224606, "grad_norm": 1.4238231182098389, "learning_rate": 2.2172567069509656e-05, "loss": 0.5908, "num_input_tokens_seen": 45408416, "step": 78215 }, { "epoch": 11.650282990765565, "grad_norm": 1.1759757995605469, "learning_rate": 2.2169338520510025e-05, "loss": 0.4243, "num_input_tokens_seen": 45411328, "step": 78220 }, { "epoch": 11.651027703306523, "grad_norm": 2.0486340522766113, "learning_rate": 2.2166110019331526e-05, "loss": 0.5205, "num_input_tokens_seen": 45414240, "step": 78225 }, { "epoch": 11.651772415847482, "grad_norm": 0.9382128715515137, "learning_rate": 2.2162881566028736e-05, "loss": 0.509, "num_input_tokens_seen": 45417568, "step": 78230 }, { "epoch": 11.652517128388443, "grad_norm": 1.2159366607666016, "learning_rate": 2.2159653160656162e-05, "loss": 0.6815, "num_input_tokens_seen": 45420384, "step": 78235 }, { "epoch": 11.653261840929401, "grad_norm": 0.6423872113227844, "learning_rate": 2.2156424803268374e-05, "loss": 0.4048, "num_input_tokens_seen": 45423328, "step": 78240 }, { "epoch": 11.65400655347036, "grad_norm": 1.862596035003662, "learning_rate": 2.2153196493919896e-05, "loss": 0.5167, "num_input_tokens_seen": 45426336, "step": 78245 }, { "epoch": 11.654751266011319, "grad_norm": 1.233704924583435, "learning_rate": 2.214996823266527e-05, "loss": 0.5733, "num_input_tokens_seen": 45429216, "step": 78250 }, { "epoch": 11.65549597855228, "grad_norm": 0.7965656518936157, "learning_rate": 2.2146740019559036e-05, "loss": 0.6058, "num_input_tokens_seen": 45432096, "step": 78255 }, { "epoch": 11.656240691093238, "grad_norm": 2.536121368408203, "learning_rate": 2.214351185465572e-05, "loss": 0.6107, "num_input_tokens_seen": 45435072, "step": 78260 }, { "epoch": 11.656985403634197, "grad_norm": 2.0391297340393066, "learning_rate": 2.214028373800988e-05, "loss": 0.8339, "num_input_tokens_seen": 45437632, "step": 78265 }, { "epoch": 11.657730116175156, "grad_norm": 1.734889268875122, "learning_rate": 2.2137055669676027e-05, "loss": 0.5563, "num_input_tokens_seen": 45440736, "step": 78270 }, { "epoch": 11.658474828716116, "grad_norm": 1.666008472442627, "learning_rate": 2.213382764970872e-05, "loss": 0.4619, "num_input_tokens_seen": 45443744, "step": 78275 }, { "epoch": 11.659219541257075, "grad_norm": 0.5917693972587585, "learning_rate": 2.2130599678162474e-05, "loss": 0.7394, "num_input_tokens_seen": 45446560, "step": 78280 }, { "epoch": 11.659964253798034, "grad_norm": 1.2600082159042358, "learning_rate": 2.212737175509184e-05, "loss": 0.4906, "num_input_tokens_seen": 45449344, "step": 78285 }, { "epoch": 11.660708966338992, "grad_norm": 1.4044246673583984, "learning_rate": 2.2124143880551327e-05, "loss": 0.5422, "num_input_tokens_seen": 45452544, "step": 78290 }, { "epoch": 11.661453678879953, "grad_norm": 1.0602655410766602, "learning_rate": 2.2120916054595492e-05, "loss": 0.51, "num_input_tokens_seen": 45455328, "step": 78295 }, { "epoch": 11.662198391420912, "grad_norm": 1.9631478786468506, "learning_rate": 2.211768827727885e-05, "loss": 0.8117, "num_input_tokens_seen": 45458752, "step": 78300 }, { "epoch": 11.66294310396187, "grad_norm": 1.3350130319595337, "learning_rate": 2.211446054865593e-05, "loss": 0.7752, "num_input_tokens_seen": 45461440, "step": 78305 }, { "epoch": 11.66368781650283, "grad_norm": 1.8349097967147827, "learning_rate": 2.2111232868781277e-05, "loss": 0.5955, "num_input_tokens_seen": 45464448, "step": 78310 }, { "epoch": 11.66443252904379, "grad_norm": 1.712125301361084, "learning_rate": 2.21080052377094e-05, "loss": 0.5581, "num_input_tokens_seen": 45467072, "step": 78315 }, { "epoch": 11.665177241584749, "grad_norm": 1.4002019166946411, "learning_rate": 2.210477765549484e-05, "loss": 0.795, "num_input_tokens_seen": 45469984, "step": 78320 }, { "epoch": 11.665921954125707, "grad_norm": 1.0122569799423218, "learning_rate": 2.210155012219211e-05, "loss": 0.4536, "num_input_tokens_seen": 45472800, "step": 78325 }, { "epoch": 11.666666666666666, "grad_norm": 2.3726999759674072, "learning_rate": 2.2098322637855757e-05, "loss": 0.6839, "num_input_tokens_seen": 45475584, "step": 78330 }, { "epoch": 11.667411379207627, "grad_norm": 2.749744415283203, "learning_rate": 2.2095095202540293e-05, "loss": 0.8192, "num_input_tokens_seen": 45478656, "step": 78335 }, { "epoch": 11.668156091748585, "grad_norm": 1.5249323844909668, "learning_rate": 2.209186781630023e-05, "loss": 0.6315, "num_input_tokens_seen": 45481664, "step": 78340 }, { "epoch": 11.668900804289544, "grad_norm": 1.862992286682129, "learning_rate": 2.2088640479190116e-05, "loss": 0.6215, "num_input_tokens_seen": 45484864, "step": 78345 }, { "epoch": 11.669645516830503, "grad_norm": 0.7937740087509155, "learning_rate": 2.208541319126446e-05, "loss": 0.6123, "num_input_tokens_seen": 45487776, "step": 78350 }, { "epoch": 11.670390229371463, "grad_norm": 1.4789748191833496, "learning_rate": 2.2082185952577788e-05, "loss": 0.6825, "num_input_tokens_seen": 45490624, "step": 78355 }, { "epoch": 11.671134941912422, "grad_norm": 1.2253239154815674, "learning_rate": 2.207895876318461e-05, "loss": 0.6156, "num_input_tokens_seen": 45493312, "step": 78360 }, { "epoch": 11.671879654453381, "grad_norm": 0.9502069354057312, "learning_rate": 2.2075731623139463e-05, "loss": 0.6071, "num_input_tokens_seen": 45496640, "step": 78365 }, { "epoch": 11.67262436699434, "grad_norm": 1.5053308010101318, "learning_rate": 2.207250453249685e-05, "loss": 0.5126, "num_input_tokens_seen": 45499648, "step": 78370 }, { "epoch": 11.673369079535298, "grad_norm": 1.366227626800537, "learning_rate": 2.2069277491311306e-05, "loss": 0.5351, "num_input_tokens_seen": 45502400, "step": 78375 }, { "epoch": 11.674113792076259, "grad_norm": 1.9646369218826294, "learning_rate": 2.2066050499637344e-05, "loss": 0.7124, "num_input_tokens_seen": 45505216, "step": 78380 }, { "epoch": 11.674858504617218, "grad_norm": 2.66231632232666, "learning_rate": 2.2062823557529467e-05, "loss": 0.728, "num_input_tokens_seen": 45508160, "step": 78385 }, { "epoch": 11.675603217158177, "grad_norm": 2.7223541736602783, "learning_rate": 2.2059596665042213e-05, "loss": 0.5746, "num_input_tokens_seen": 45510912, "step": 78390 }, { "epoch": 11.676347929699137, "grad_norm": 2.2047905921936035, "learning_rate": 2.2056369822230067e-05, "loss": 0.5799, "num_input_tokens_seen": 45513696, "step": 78395 }, { "epoch": 11.677092642240096, "grad_norm": 1.1182047128677368, "learning_rate": 2.2053143029147574e-05, "loss": 0.3912, "num_input_tokens_seen": 45516480, "step": 78400 }, { "epoch": 11.677837354781055, "grad_norm": 1.4868371486663818, "learning_rate": 2.2049916285849233e-05, "loss": 0.6813, "num_input_tokens_seen": 45519168, "step": 78405 }, { "epoch": 11.678582067322013, "grad_norm": 1.3935737609863281, "learning_rate": 2.204668959238955e-05, "loss": 0.5669, "num_input_tokens_seen": 45522080, "step": 78410 }, { "epoch": 11.679326779862972, "grad_norm": 1.1507776975631714, "learning_rate": 2.2043462948823057e-05, "loss": 0.7175, "num_input_tokens_seen": 45524768, "step": 78415 }, { "epoch": 11.680071492403933, "grad_norm": 0.915739119052887, "learning_rate": 2.2040236355204244e-05, "loss": 0.5806, "num_input_tokens_seen": 45527456, "step": 78420 }, { "epoch": 11.680816204944891, "grad_norm": 1.0360485315322876, "learning_rate": 2.2037009811587638e-05, "loss": 0.4312, "num_input_tokens_seen": 45530368, "step": 78425 }, { "epoch": 11.68156091748585, "grad_norm": 1.0233737230300903, "learning_rate": 2.2033783318027725e-05, "loss": 0.6447, "num_input_tokens_seen": 45533536, "step": 78430 }, { "epoch": 11.682305630026809, "grad_norm": 1.1957744359970093, "learning_rate": 2.203055687457904e-05, "loss": 0.6751, "num_input_tokens_seen": 45536224, "step": 78435 }, { "epoch": 11.68305034256777, "grad_norm": 3.1914710998535156, "learning_rate": 2.2027330481296074e-05, "loss": 0.747, "num_input_tokens_seen": 45539168, "step": 78440 }, { "epoch": 11.683795055108728, "grad_norm": 1.4752368927001953, "learning_rate": 2.2024104138233343e-05, "loss": 0.6675, "num_input_tokens_seen": 45542048, "step": 78445 }, { "epoch": 11.684539767649687, "grad_norm": 2.712170362472534, "learning_rate": 2.2020877845445338e-05, "loss": 0.6684, "num_input_tokens_seen": 45545056, "step": 78450 }, { "epoch": 11.685284480190646, "grad_norm": 1.3892998695373535, "learning_rate": 2.2017651602986584e-05, "loss": 0.635, "num_input_tokens_seen": 45547712, "step": 78455 }, { "epoch": 11.686029192731606, "grad_norm": 0.5244184732437134, "learning_rate": 2.2014425410911575e-05, "loss": 0.5374, "num_input_tokens_seen": 45550432, "step": 78460 }, { "epoch": 11.686773905272565, "grad_norm": 2.5127341747283936, "learning_rate": 2.2011199269274804e-05, "loss": 0.6295, "num_input_tokens_seen": 45553568, "step": 78465 }, { "epoch": 11.687518617813524, "grad_norm": 2.2633185386657715, "learning_rate": 2.2007973178130795e-05, "loss": 0.6564, "num_input_tokens_seen": 45556288, "step": 78470 }, { "epoch": 11.688263330354483, "grad_norm": 1.0910574197769165, "learning_rate": 2.2004747137534032e-05, "loss": 0.5287, "num_input_tokens_seen": 45559328, "step": 78475 }, { "epoch": 11.689008042895443, "grad_norm": 1.3253191709518433, "learning_rate": 2.2001521147539028e-05, "loss": 0.6364, "num_input_tokens_seen": 45561984, "step": 78480 }, { "epoch": 11.689752755436402, "grad_norm": 1.5638682842254639, "learning_rate": 2.1998295208200263e-05, "loss": 0.456, "num_input_tokens_seen": 45565088, "step": 78485 }, { "epoch": 11.69049746797736, "grad_norm": 1.8966810703277588, "learning_rate": 2.1995069319572264e-05, "loss": 0.4577, "num_input_tokens_seen": 45568096, "step": 78490 }, { "epoch": 11.69124218051832, "grad_norm": 1.507514238357544, "learning_rate": 2.1991843481709513e-05, "loss": 0.6926, "num_input_tokens_seen": 45570880, "step": 78495 }, { "epoch": 11.69198689305928, "grad_norm": 1.0995192527770996, "learning_rate": 2.19886176946665e-05, "loss": 0.5673, "num_input_tokens_seen": 45573824, "step": 78500 }, { "epoch": 11.692731605600239, "grad_norm": 1.1616524457931519, "learning_rate": 2.1985391958497743e-05, "loss": 0.6356, "num_input_tokens_seen": 45576480, "step": 78505 }, { "epoch": 11.693476318141197, "grad_norm": 1.063410758972168, "learning_rate": 2.1982166273257716e-05, "loss": 0.5284, "num_input_tokens_seen": 45579680, "step": 78510 }, { "epoch": 11.694221030682156, "grad_norm": 0.8887651562690735, "learning_rate": 2.1978940639000927e-05, "loss": 0.6693, "num_input_tokens_seen": 45582592, "step": 78515 }, { "epoch": 11.694965743223117, "grad_norm": 1.2546817064285278, "learning_rate": 2.1975715055781858e-05, "loss": 0.5535, "num_input_tokens_seen": 45585472, "step": 78520 }, { "epoch": 11.695710455764075, "grad_norm": 1.4690723419189453, "learning_rate": 2.1972489523655016e-05, "loss": 0.668, "num_input_tokens_seen": 45588512, "step": 78525 }, { "epoch": 11.696455168305034, "grad_norm": 1.071074366569519, "learning_rate": 2.1969264042674877e-05, "loss": 0.7177, "num_input_tokens_seen": 45591488, "step": 78530 }, { "epoch": 11.697199880845993, "grad_norm": 1.5276343822479248, "learning_rate": 2.1966038612895958e-05, "loss": 0.6815, "num_input_tokens_seen": 45594368, "step": 78535 }, { "epoch": 11.697944593386953, "grad_norm": 1.3897898197174072, "learning_rate": 2.1962813234372727e-05, "loss": 0.7715, "num_input_tokens_seen": 45596992, "step": 78540 }, { "epoch": 11.698689305927912, "grad_norm": 1.5797781944274902, "learning_rate": 2.1959587907159673e-05, "loss": 0.6256, "num_input_tokens_seen": 45599616, "step": 78545 }, { "epoch": 11.699434018468871, "grad_norm": 1.7619223594665527, "learning_rate": 2.19563626313113e-05, "loss": 0.6255, "num_input_tokens_seen": 45602464, "step": 78550 }, { "epoch": 11.70017873100983, "grad_norm": 1.258973479270935, "learning_rate": 2.1953137406882078e-05, "loss": 0.4524, "num_input_tokens_seen": 45605696, "step": 78555 }, { "epoch": 11.700923443550789, "grad_norm": 1.060693383216858, "learning_rate": 2.194991223392651e-05, "loss": 0.5077, "num_input_tokens_seen": 45608480, "step": 78560 }, { "epoch": 11.701668156091749, "grad_norm": 1.4304044246673584, "learning_rate": 2.1946687112499066e-05, "loss": 0.9316, "num_input_tokens_seen": 45611232, "step": 78565 }, { "epoch": 11.702412868632708, "grad_norm": 1.7488514184951782, "learning_rate": 2.194346204265425e-05, "loss": 0.7382, "num_input_tokens_seen": 45614208, "step": 78570 }, { "epoch": 11.703157581173667, "grad_norm": 1.6343507766723633, "learning_rate": 2.1940237024446535e-05, "loss": 0.6289, "num_input_tokens_seen": 45617280, "step": 78575 }, { "epoch": 11.703902293714627, "grad_norm": 0.8864413499832153, "learning_rate": 2.19370120579304e-05, "loss": 0.4857, "num_input_tokens_seen": 45620384, "step": 78580 }, { "epoch": 11.704647006255586, "grad_norm": 0.9735593795776367, "learning_rate": 2.1933787143160343e-05, "loss": 0.6091, "num_input_tokens_seen": 45623648, "step": 78585 }, { "epoch": 11.705391718796545, "grad_norm": 1.351637601852417, "learning_rate": 2.193056228019082e-05, "loss": 0.5286, "num_input_tokens_seen": 45626720, "step": 78590 }, { "epoch": 11.706136431337503, "grad_norm": 1.3923449516296387, "learning_rate": 2.1927337469076343e-05, "loss": 0.5988, "num_input_tokens_seen": 45629728, "step": 78595 }, { "epoch": 11.706881143878462, "grad_norm": 0.875547468662262, "learning_rate": 2.1924112709871362e-05, "loss": 0.636, "num_input_tokens_seen": 45632256, "step": 78600 }, { "epoch": 11.707625856419423, "grad_norm": 1.8463726043701172, "learning_rate": 2.1920888002630382e-05, "loss": 0.5991, "num_input_tokens_seen": 45635136, "step": 78605 }, { "epoch": 11.708370568960381, "grad_norm": 1.9718855619430542, "learning_rate": 2.1917663347407867e-05, "loss": 0.6486, "num_input_tokens_seen": 45637888, "step": 78610 }, { "epoch": 11.70911528150134, "grad_norm": 1.283539056777954, "learning_rate": 2.1914438744258298e-05, "loss": 0.5467, "num_input_tokens_seen": 45640928, "step": 78615 }, { "epoch": 11.709859994042299, "grad_norm": 1.266677737236023, "learning_rate": 2.1911214193236153e-05, "loss": 0.6045, "num_input_tokens_seen": 45643904, "step": 78620 }, { "epoch": 11.71060470658326, "grad_norm": 2.1848084926605225, "learning_rate": 2.1907989694395893e-05, "loss": 0.6095, "num_input_tokens_seen": 45646976, "step": 78625 }, { "epoch": 11.711349419124218, "grad_norm": 1.6900218725204468, "learning_rate": 2.1904765247792016e-05, "loss": 0.4696, "num_input_tokens_seen": 45649760, "step": 78630 }, { "epoch": 11.712094131665177, "grad_norm": 2.1714041233062744, "learning_rate": 2.1901540853478976e-05, "loss": 0.8178, "num_input_tokens_seen": 45652704, "step": 78635 }, { "epoch": 11.712838844206136, "grad_norm": 0.9823921322822571, "learning_rate": 2.1898316511511264e-05, "loss": 0.6589, "num_input_tokens_seen": 45655520, "step": 78640 }, { "epoch": 11.713583556747096, "grad_norm": 1.0327845811843872, "learning_rate": 2.1895092221943335e-05, "loss": 0.6095, "num_input_tokens_seen": 45658432, "step": 78645 }, { "epoch": 11.714328269288055, "grad_norm": 1.4652155637741089, "learning_rate": 2.1891867984829672e-05, "loss": 0.5374, "num_input_tokens_seen": 45661216, "step": 78650 }, { "epoch": 11.715072981829014, "grad_norm": 1.43727445602417, "learning_rate": 2.1888643800224728e-05, "loss": 0.5584, "num_input_tokens_seen": 45663904, "step": 78655 }, { "epoch": 11.715817694369973, "grad_norm": 0.9205118417739868, "learning_rate": 2.1885419668183e-05, "loss": 0.5466, "num_input_tokens_seen": 45667008, "step": 78660 }, { "epoch": 11.716562406910933, "grad_norm": 1.573641300201416, "learning_rate": 2.188219558875894e-05, "loss": 0.4882, "num_input_tokens_seen": 45670080, "step": 78665 }, { "epoch": 11.717307119451892, "grad_norm": 1.4162282943725586, "learning_rate": 2.1878971562007007e-05, "loss": 0.6326, "num_input_tokens_seen": 45672992, "step": 78670 }, { "epoch": 11.71805183199285, "grad_norm": 1.195021629333496, "learning_rate": 2.1875747587981686e-05, "loss": 0.4401, "num_input_tokens_seen": 45675936, "step": 78675 }, { "epoch": 11.71879654453381, "grad_norm": 2.086798906326294, "learning_rate": 2.1872523666737428e-05, "loss": 0.6019, "num_input_tokens_seen": 45678880, "step": 78680 }, { "epoch": 11.71954125707477, "grad_norm": 1.2420730590820312, "learning_rate": 2.186929979832871e-05, "loss": 0.5749, "num_input_tokens_seen": 45681696, "step": 78685 }, { "epoch": 11.720285969615729, "grad_norm": 1.6710195541381836, "learning_rate": 2.186607598280998e-05, "loss": 0.3821, "num_input_tokens_seen": 45684256, "step": 78690 }, { "epoch": 11.721030682156687, "grad_norm": 1.947466254234314, "learning_rate": 2.186285222023572e-05, "loss": 0.5339, "num_input_tokens_seen": 45687232, "step": 78695 }, { "epoch": 11.721775394697646, "grad_norm": 1.1373083591461182, "learning_rate": 2.185962851066039e-05, "loss": 0.6401, "num_input_tokens_seen": 45690496, "step": 78700 }, { "epoch": 11.722520107238607, "grad_norm": 1.4153269529342651, "learning_rate": 2.1856404854138426e-05, "loss": 0.7105, "num_input_tokens_seen": 45693280, "step": 78705 }, { "epoch": 11.723264819779565, "grad_norm": 1.7628341913223267, "learning_rate": 2.1853181250724318e-05, "loss": 0.5324, "num_input_tokens_seen": 45696064, "step": 78710 }, { "epoch": 11.724009532320524, "grad_norm": 1.645149827003479, "learning_rate": 2.1849957700472515e-05, "loss": 0.571, "num_input_tokens_seen": 45698816, "step": 78715 }, { "epoch": 11.724754244861483, "grad_norm": 1.273402214050293, "learning_rate": 2.1846734203437478e-05, "loss": 0.5903, "num_input_tokens_seen": 45701696, "step": 78720 }, { "epoch": 11.725498957402444, "grad_norm": 1.0465909242630005, "learning_rate": 2.1843510759673648e-05, "loss": 0.4469, "num_input_tokens_seen": 45704512, "step": 78725 }, { "epoch": 11.726243669943402, "grad_norm": 1.544969081878662, "learning_rate": 2.184028736923551e-05, "loss": 0.801, "num_input_tokens_seen": 45707328, "step": 78730 }, { "epoch": 11.726988382484361, "grad_norm": 1.276414394378662, "learning_rate": 2.1837064032177497e-05, "loss": 0.5762, "num_input_tokens_seen": 45710080, "step": 78735 }, { "epoch": 11.72773309502532, "grad_norm": 1.6158778667449951, "learning_rate": 2.1833840748554075e-05, "loss": 0.6909, "num_input_tokens_seen": 45712896, "step": 78740 }, { "epoch": 11.728477807566279, "grad_norm": 1.9159947633743286, "learning_rate": 2.18306175184197e-05, "loss": 0.699, "num_input_tokens_seen": 45715616, "step": 78745 }, { "epoch": 11.729222520107239, "grad_norm": 0.8329160213470459, "learning_rate": 2.1827394341828817e-05, "loss": 0.5697, "num_input_tokens_seen": 45718848, "step": 78750 }, { "epoch": 11.729967232648198, "grad_norm": 1.0289952754974365, "learning_rate": 2.1824171218835886e-05, "loss": 0.4847, "num_input_tokens_seen": 45721888, "step": 78755 }, { "epoch": 11.730711945189157, "grad_norm": 0.8580930829048157, "learning_rate": 2.1820948149495343e-05, "loss": 0.496, "num_input_tokens_seen": 45724832, "step": 78760 }, { "epoch": 11.731456657730115, "grad_norm": 2.661134719848633, "learning_rate": 2.181772513386166e-05, "loss": 0.5472, "num_input_tokens_seen": 45728000, "step": 78765 }, { "epoch": 11.732201370271076, "grad_norm": 1.370902419090271, "learning_rate": 2.1814502171989276e-05, "loss": 0.6185, "num_input_tokens_seen": 45730784, "step": 78770 }, { "epoch": 11.732946082812035, "grad_norm": 1.3750191926956177, "learning_rate": 2.1811279263932642e-05, "loss": 0.4402, "num_input_tokens_seen": 45733568, "step": 78775 }, { "epoch": 11.733690795352993, "grad_norm": 0.9013421535491943, "learning_rate": 2.1808056409746196e-05, "loss": 0.5212, "num_input_tokens_seen": 45736416, "step": 78780 }, { "epoch": 11.734435507893952, "grad_norm": 2.237238645553589, "learning_rate": 2.18048336094844e-05, "loss": 0.8821, "num_input_tokens_seen": 45739104, "step": 78785 }, { "epoch": 11.735180220434913, "grad_norm": 1.3464900255203247, "learning_rate": 2.180161086320169e-05, "loss": 0.6766, "num_input_tokens_seen": 45741984, "step": 78790 }, { "epoch": 11.735924932975871, "grad_norm": 1.3025965690612793, "learning_rate": 2.1798388170952508e-05, "loss": 0.4068, "num_input_tokens_seen": 45744608, "step": 78795 }, { "epoch": 11.73666964551683, "grad_norm": 0.885981559753418, "learning_rate": 2.1795165532791315e-05, "loss": 0.7099, "num_input_tokens_seen": 45747328, "step": 78800 }, { "epoch": 11.737414358057789, "grad_norm": 2.233222723007202, "learning_rate": 2.1791942948772533e-05, "loss": 0.7925, "num_input_tokens_seen": 45750400, "step": 78805 }, { "epoch": 11.73815907059875, "grad_norm": 2.5483219623565674, "learning_rate": 2.1788720418950626e-05, "loss": 0.647, "num_input_tokens_seen": 45753152, "step": 78810 }, { "epoch": 11.738903783139708, "grad_norm": 1.9205669164657593, "learning_rate": 2.178549794338001e-05, "loss": 0.571, "num_input_tokens_seen": 45756320, "step": 78815 }, { "epoch": 11.739648495680667, "grad_norm": 1.3739871978759766, "learning_rate": 2.178227552211515e-05, "loss": 0.43, "num_input_tokens_seen": 45759488, "step": 78820 }, { "epoch": 11.740393208221626, "grad_norm": 2.8463706970214844, "learning_rate": 2.1779053155210474e-05, "loss": 0.6564, "num_input_tokens_seen": 45762208, "step": 78825 }, { "epoch": 11.741137920762586, "grad_norm": 2.2957096099853516, "learning_rate": 2.177583084272041e-05, "loss": 0.5453, "num_input_tokens_seen": 45764896, "step": 78830 }, { "epoch": 11.741882633303545, "grad_norm": 1.496172308921814, "learning_rate": 2.177260858469942e-05, "loss": 0.547, "num_input_tokens_seen": 45767712, "step": 78835 }, { "epoch": 11.742627345844504, "grad_norm": 1.4615920782089233, "learning_rate": 2.176938638120192e-05, "loss": 0.5922, "num_input_tokens_seen": 45770880, "step": 78840 }, { "epoch": 11.743372058385463, "grad_norm": 1.1393522024154663, "learning_rate": 2.176616423228236e-05, "loss": 0.5359, "num_input_tokens_seen": 45773856, "step": 78845 }, { "epoch": 11.744116770926423, "grad_norm": 1.0075700283050537, "learning_rate": 2.1762942137995158e-05, "loss": 0.7444, "num_input_tokens_seen": 45776576, "step": 78850 }, { "epoch": 11.744861483467382, "grad_norm": 0.8639349341392517, "learning_rate": 2.175972009839477e-05, "loss": 0.5179, "num_input_tokens_seen": 45779392, "step": 78855 }, { "epoch": 11.74560619600834, "grad_norm": 1.4615241289138794, "learning_rate": 2.1756498113535617e-05, "loss": 0.5735, "num_input_tokens_seen": 45782368, "step": 78860 }, { "epoch": 11.7463509085493, "grad_norm": 2.2064154148101807, "learning_rate": 2.1753276183472122e-05, "loss": 0.4215, "num_input_tokens_seen": 45785056, "step": 78865 }, { "epoch": 11.74709562109026, "grad_norm": 3.1357038021087646, "learning_rate": 2.1750054308258737e-05, "loss": 0.5342, "num_input_tokens_seen": 45787904, "step": 78870 }, { "epoch": 11.747840333631219, "grad_norm": 1.248058795928955, "learning_rate": 2.1746832487949874e-05, "loss": 0.6033, "num_input_tokens_seen": 45790624, "step": 78875 }, { "epoch": 11.748585046172177, "grad_norm": 1.6263453960418701, "learning_rate": 2.174361072259998e-05, "loss": 0.623, "num_input_tokens_seen": 45793824, "step": 78880 }, { "epoch": 11.749329758713136, "grad_norm": 1.5350650548934937, "learning_rate": 2.1740389012263454e-05, "loss": 0.5351, "num_input_tokens_seen": 45796736, "step": 78885 }, { "epoch": 11.750074471254095, "grad_norm": 0.9640829563140869, "learning_rate": 2.173716735699476e-05, "loss": 0.7244, "num_input_tokens_seen": 45799712, "step": 78890 }, { "epoch": 11.750819183795056, "grad_norm": 1.3454304933547974, "learning_rate": 2.173394575684829e-05, "loss": 0.4338, "num_input_tokens_seen": 45802528, "step": 78895 }, { "epoch": 11.751563896336014, "grad_norm": 2.048779249191284, "learning_rate": 2.1730724211878506e-05, "loss": 0.7462, "num_input_tokens_seen": 45805632, "step": 78900 }, { "epoch": 11.752308608876973, "grad_norm": 1.8848756551742554, "learning_rate": 2.172750272213981e-05, "loss": 0.5654, "num_input_tokens_seen": 45808256, "step": 78905 }, { "epoch": 11.753053321417934, "grad_norm": 1.0258872509002686, "learning_rate": 2.1724281287686622e-05, "loss": 0.5353, "num_input_tokens_seen": 45811168, "step": 78910 }, { "epoch": 11.753798033958892, "grad_norm": 1.1056673526763916, "learning_rate": 2.1721059908573383e-05, "loss": 0.4113, "num_input_tokens_seen": 45814048, "step": 78915 }, { "epoch": 11.754542746499851, "grad_norm": 1.3003957271575928, "learning_rate": 2.171783858485449e-05, "loss": 0.4791, "num_input_tokens_seen": 45816960, "step": 78920 }, { "epoch": 11.75528745904081, "grad_norm": 1.8443739414215088, "learning_rate": 2.171461731658439e-05, "loss": 0.5624, "num_input_tokens_seen": 45819680, "step": 78925 }, { "epoch": 11.756032171581769, "grad_norm": 1.905148983001709, "learning_rate": 2.1711396103817477e-05, "loss": 0.6489, "num_input_tokens_seen": 45823008, "step": 78930 }, { "epoch": 11.75677688412273, "grad_norm": 1.1589999198913574, "learning_rate": 2.17081749466082e-05, "loss": 0.5456, "num_input_tokens_seen": 45825952, "step": 78935 }, { "epoch": 11.757521596663688, "grad_norm": 1.6781710386276245, "learning_rate": 2.170495384501096e-05, "loss": 0.5389, "num_input_tokens_seen": 45828672, "step": 78940 }, { "epoch": 11.758266309204647, "grad_norm": 0.9204805493354797, "learning_rate": 2.1701732799080173e-05, "loss": 0.5148, "num_input_tokens_seen": 45831808, "step": 78945 }, { "epoch": 11.759011021745605, "grad_norm": 1.8689779043197632, "learning_rate": 2.169851180887026e-05, "loss": 0.5629, "num_input_tokens_seen": 45834656, "step": 78950 }, { "epoch": 11.759755734286566, "grad_norm": 2.4302287101745605, "learning_rate": 2.1695290874435623e-05, "loss": 0.7075, "num_input_tokens_seen": 45837376, "step": 78955 }, { "epoch": 11.760500446827525, "grad_norm": 1.00412917137146, "learning_rate": 2.16920699958307e-05, "loss": 0.7979, "num_input_tokens_seen": 45840416, "step": 78960 }, { "epoch": 11.761245159368483, "grad_norm": 2.5562634468078613, "learning_rate": 2.168884917310988e-05, "loss": 0.6517, "num_input_tokens_seen": 45843424, "step": 78965 }, { "epoch": 11.761989871909442, "grad_norm": 1.530233383178711, "learning_rate": 2.16856284063276e-05, "loss": 0.6748, "num_input_tokens_seen": 45846336, "step": 78970 }, { "epoch": 11.762734584450403, "grad_norm": 1.2645570039749146, "learning_rate": 2.1682407695538255e-05, "loss": 0.6619, "num_input_tokens_seen": 45849024, "step": 78975 }, { "epoch": 11.763479296991362, "grad_norm": 2.0382988452911377, "learning_rate": 2.1679187040796266e-05, "loss": 0.5249, "num_input_tokens_seen": 45851616, "step": 78980 }, { "epoch": 11.76422400953232, "grad_norm": 0.6853756904602051, "learning_rate": 2.1675966442156038e-05, "loss": 0.6966, "num_input_tokens_seen": 45854592, "step": 78985 }, { "epoch": 11.764968722073279, "grad_norm": 2.706409454345703, "learning_rate": 2.1672745899671965e-05, "loss": 0.6935, "num_input_tokens_seen": 45857696, "step": 78990 }, { "epoch": 11.76571343461424, "grad_norm": 1.541149616241455, "learning_rate": 2.1669525413398477e-05, "loss": 0.5408, "num_input_tokens_seen": 45860640, "step": 78995 }, { "epoch": 11.766458147155198, "grad_norm": 1.1125109195709229, "learning_rate": 2.166630498338997e-05, "loss": 0.5448, "num_input_tokens_seen": 45863872, "step": 79000 }, { "epoch": 11.767202859696157, "grad_norm": 1.0362861156463623, "learning_rate": 2.1663084609700853e-05, "loss": 0.522, "num_input_tokens_seen": 45866624, "step": 79005 }, { "epoch": 11.767947572237116, "grad_norm": 2.9204764366149902, "learning_rate": 2.1659864292385528e-05, "loss": 0.638, "num_input_tokens_seen": 45869344, "step": 79010 }, { "epoch": 11.768692284778076, "grad_norm": 1.283484697341919, "learning_rate": 2.1656644031498407e-05, "loss": 0.7404, "num_input_tokens_seen": 45872256, "step": 79015 }, { "epoch": 11.769436997319035, "grad_norm": 2.21307110786438, "learning_rate": 2.1653423827093888e-05, "loss": 0.7906, "num_input_tokens_seen": 45875008, "step": 79020 }, { "epoch": 11.770181709859994, "grad_norm": 1.526346206665039, "learning_rate": 2.1650203679226362e-05, "loss": 0.8355, "num_input_tokens_seen": 45877760, "step": 79025 }, { "epoch": 11.770926422400953, "grad_norm": 1.8078055381774902, "learning_rate": 2.164698358795025e-05, "loss": 0.7164, "num_input_tokens_seen": 45880672, "step": 79030 }, { "epoch": 11.771671134941913, "grad_norm": 1.0227670669555664, "learning_rate": 2.164376355331993e-05, "loss": 0.5574, "num_input_tokens_seen": 45883648, "step": 79035 }, { "epoch": 11.772415847482872, "grad_norm": 1.4606540203094482, "learning_rate": 2.1640543575389828e-05, "loss": 0.6323, "num_input_tokens_seen": 45886272, "step": 79040 }, { "epoch": 11.77316056002383, "grad_norm": 1.144761323928833, "learning_rate": 2.163732365421432e-05, "loss": 0.7187, "num_input_tokens_seen": 45889184, "step": 79045 }, { "epoch": 11.77390527256479, "grad_norm": 1.1007381677627563, "learning_rate": 2.1634103789847813e-05, "loss": 0.4613, "num_input_tokens_seen": 45892384, "step": 79050 }, { "epoch": 11.77464998510575, "grad_norm": 1.6951355934143066, "learning_rate": 2.1630883982344695e-05, "loss": 0.5247, "num_input_tokens_seen": 45895200, "step": 79055 }, { "epoch": 11.775394697646709, "grad_norm": 1.0169938802719116, "learning_rate": 2.1627664231759383e-05, "loss": 0.6023, "num_input_tokens_seen": 45898240, "step": 79060 }, { "epoch": 11.776139410187668, "grad_norm": 1.9192777872085571, "learning_rate": 2.1624444538146248e-05, "loss": 0.4908, "num_input_tokens_seen": 45900992, "step": 79065 }, { "epoch": 11.776884122728626, "grad_norm": 1.2545039653778076, "learning_rate": 2.1621224901559685e-05, "loss": 0.641, "num_input_tokens_seen": 45904064, "step": 79070 }, { "epoch": 11.777628835269585, "grad_norm": 1.8782857656478882, "learning_rate": 2.1618005322054103e-05, "loss": 0.6089, "num_input_tokens_seen": 45906784, "step": 79075 }, { "epoch": 11.778373547810546, "grad_norm": 1.0128264427185059, "learning_rate": 2.1614785799683877e-05, "loss": 0.42, "num_input_tokens_seen": 45909632, "step": 79080 }, { "epoch": 11.779118260351504, "grad_norm": 1.4517929553985596, "learning_rate": 2.1611566334503413e-05, "loss": 0.4654, "num_input_tokens_seen": 45912576, "step": 79085 }, { "epoch": 11.779862972892463, "grad_norm": 1.5033411979675293, "learning_rate": 2.160834692656708e-05, "loss": 0.6248, "num_input_tokens_seen": 45915520, "step": 79090 }, { "epoch": 11.780607685433424, "grad_norm": 1.4551832675933838, "learning_rate": 2.160512757592929e-05, "loss": 0.667, "num_input_tokens_seen": 45918752, "step": 79095 }, { "epoch": 11.781352397974382, "grad_norm": 0.7732240557670593, "learning_rate": 2.1601908282644418e-05, "loss": 0.5022, "num_input_tokens_seen": 45921600, "step": 79100 }, { "epoch": 11.782097110515341, "grad_norm": 1.1723500490188599, "learning_rate": 2.1598689046766848e-05, "loss": 0.587, "num_input_tokens_seen": 45924416, "step": 79105 }, { "epoch": 11.7828418230563, "grad_norm": 1.0677117109298706, "learning_rate": 2.1595469868350966e-05, "loss": 0.5644, "num_input_tokens_seen": 45927520, "step": 79110 }, { "epoch": 11.783586535597259, "grad_norm": 1.4378340244293213, "learning_rate": 2.1592250747451166e-05, "loss": 0.498, "num_input_tokens_seen": 45930464, "step": 79115 }, { "epoch": 11.78433124813822, "grad_norm": 1.1917279958724976, "learning_rate": 2.1589031684121828e-05, "loss": 0.6547, "num_input_tokens_seen": 45933248, "step": 79120 }, { "epoch": 11.785075960679178, "grad_norm": 1.2920345067977905, "learning_rate": 2.1585812678417323e-05, "loss": 0.6704, "num_input_tokens_seen": 45936320, "step": 79125 }, { "epoch": 11.785820673220137, "grad_norm": 2.1992027759552, "learning_rate": 2.1582593730392055e-05, "loss": 0.6548, "num_input_tokens_seen": 45938944, "step": 79130 }, { "epoch": 11.786565385761095, "grad_norm": 1.4189765453338623, "learning_rate": 2.1579374840100383e-05, "loss": 0.6136, "num_input_tokens_seen": 45941824, "step": 79135 }, { "epoch": 11.787310098302056, "grad_norm": 1.0698970556259155, "learning_rate": 2.1576156007596705e-05, "loss": 0.6878, "num_input_tokens_seen": 45944736, "step": 79140 }, { "epoch": 11.788054810843015, "grad_norm": 3.025099515914917, "learning_rate": 2.1572937232935385e-05, "loss": 0.5055, "num_input_tokens_seen": 45947520, "step": 79145 }, { "epoch": 11.788799523383974, "grad_norm": 1.0274512767791748, "learning_rate": 2.1569718516170806e-05, "loss": 0.5764, "num_input_tokens_seen": 45950208, "step": 79150 }, { "epoch": 11.789544235924932, "grad_norm": 2.526303291320801, "learning_rate": 2.1566499857357352e-05, "loss": 0.6257, "num_input_tokens_seen": 45953120, "step": 79155 }, { "epoch": 11.790288948465893, "grad_norm": 1.5209721326828003, "learning_rate": 2.1563281256549385e-05, "loss": 0.6204, "num_input_tokens_seen": 45956032, "step": 79160 }, { "epoch": 11.791033661006852, "grad_norm": 1.6222944259643555, "learning_rate": 2.15600627138013e-05, "loss": 0.5493, "num_input_tokens_seen": 45959136, "step": 79165 }, { "epoch": 11.79177837354781, "grad_norm": 1.9132167100906372, "learning_rate": 2.155684422916745e-05, "loss": 0.5351, "num_input_tokens_seen": 45962080, "step": 79170 }, { "epoch": 11.792523086088769, "grad_norm": 1.7262752056121826, "learning_rate": 2.1553625802702226e-05, "loss": 0.7913, "num_input_tokens_seen": 45965088, "step": 79175 }, { "epoch": 11.79326779862973, "grad_norm": 1.4349359273910522, "learning_rate": 2.155040743445999e-05, "loss": 0.604, "num_input_tokens_seen": 45967936, "step": 79180 }, { "epoch": 11.794012511170688, "grad_norm": 2.261584758758545, "learning_rate": 2.1547189124495103e-05, "loss": 0.7831, "num_input_tokens_seen": 45971072, "step": 79185 }, { "epoch": 11.794757223711647, "grad_norm": 1.300376057624817, "learning_rate": 2.1543970872861957e-05, "loss": 0.637, "num_input_tokens_seen": 45974176, "step": 79190 }, { "epoch": 11.795501936252606, "grad_norm": 0.9704028367996216, "learning_rate": 2.15407526796149e-05, "loss": 0.4108, "num_input_tokens_seen": 45976832, "step": 79195 }, { "epoch": 11.796246648793566, "grad_norm": 2.034463882446289, "learning_rate": 2.153753454480832e-05, "loss": 0.4567, "num_input_tokens_seen": 45979616, "step": 79200 }, { "epoch": 11.796991361334525, "grad_norm": 1.4340976476669312, "learning_rate": 2.1534316468496575e-05, "loss": 0.7009, "num_input_tokens_seen": 45982656, "step": 79205 }, { "epoch": 11.797736073875484, "grad_norm": 1.9081950187683105, "learning_rate": 2.153109845073403e-05, "loss": 0.663, "num_input_tokens_seen": 45985568, "step": 79210 }, { "epoch": 11.798480786416443, "grad_norm": 1.673884391784668, "learning_rate": 2.1527880491575042e-05, "loss": 0.6543, "num_input_tokens_seen": 45988544, "step": 79215 }, { "epoch": 11.799225498957403, "grad_norm": 1.9637662172317505, "learning_rate": 2.1524662591073997e-05, "loss": 0.5711, "num_input_tokens_seen": 45991232, "step": 79220 }, { "epoch": 11.799970211498362, "grad_norm": 2.466944932937622, "learning_rate": 2.1521444749285244e-05, "loss": 0.5612, "num_input_tokens_seen": 45994400, "step": 79225 }, { "epoch": 11.80071492403932, "grad_norm": 1.0474272966384888, "learning_rate": 2.1518226966263136e-05, "loss": 0.5441, "num_input_tokens_seen": 45997408, "step": 79230 }, { "epoch": 11.80145963658028, "grad_norm": 1.961087703704834, "learning_rate": 2.1515009242062055e-05, "loss": 0.6425, "num_input_tokens_seen": 46000480, "step": 79235 }, { "epoch": 11.80220434912124, "grad_norm": 1.0497846603393555, "learning_rate": 2.1511791576736346e-05, "loss": 0.5516, "num_input_tokens_seen": 46003360, "step": 79240 }, { "epoch": 11.802949061662199, "grad_norm": 1.464374303817749, "learning_rate": 2.1508573970340377e-05, "loss": 0.5553, "num_input_tokens_seen": 46006240, "step": 79245 }, { "epoch": 11.803693774203158, "grad_norm": 1.7750049829483032, "learning_rate": 2.1505356422928493e-05, "loss": 0.6882, "num_input_tokens_seen": 46009088, "step": 79250 }, { "epoch": 11.804438486744116, "grad_norm": 1.3441176414489746, "learning_rate": 2.1502138934555072e-05, "loss": 0.6901, "num_input_tokens_seen": 46012032, "step": 79255 }, { "epoch": 11.805183199285075, "grad_norm": 1.1007053852081299, "learning_rate": 2.1498921505274444e-05, "loss": 0.5947, "num_input_tokens_seen": 46015008, "step": 79260 }, { "epoch": 11.805927911826036, "grad_norm": 1.1660850048065186, "learning_rate": 2.1495704135140992e-05, "loss": 0.5906, "num_input_tokens_seen": 46017632, "step": 79265 }, { "epoch": 11.806672624366994, "grad_norm": 1.1392042636871338, "learning_rate": 2.1492486824209058e-05, "loss": 0.8149, "num_input_tokens_seen": 46020384, "step": 79270 }, { "epoch": 11.807417336907953, "grad_norm": 1.3567533493041992, "learning_rate": 2.1489269572532987e-05, "loss": 0.6123, "num_input_tokens_seen": 46023424, "step": 79275 }, { "epoch": 11.808162049448912, "grad_norm": 1.762372612953186, "learning_rate": 2.1486052380167146e-05, "loss": 0.4622, "num_input_tokens_seen": 46026208, "step": 79280 }, { "epoch": 11.808906761989872, "grad_norm": 1.180572271347046, "learning_rate": 2.1482835247165867e-05, "loss": 0.5321, "num_input_tokens_seen": 46028768, "step": 79285 }, { "epoch": 11.809651474530831, "grad_norm": 1.0007877349853516, "learning_rate": 2.1479618173583522e-05, "loss": 0.6531, "num_input_tokens_seen": 46031456, "step": 79290 }, { "epoch": 11.81039618707179, "grad_norm": 2.9968655109405518, "learning_rate": 2.147640115947444e-05, "loss": 0.7039, "num_input_tokens_seen": 46034304, "step": 79295 }, { "epoch": 11.811140899612749, "grad_norm": 1.0532209873199463, "learning_rate": 2.147318420489299e-05, "loss": 0.4604, "num_input_tokens_seen": 46037408, "step": 79300 }, { "epoch": 11.81188561215371, "grad_norm": 1.7967835664749146, "learning_rate": 2.1469967309893508e-05, "loss": 0.5092, "num_input_tokens_seen": 46040512, "step": 79305 }, { "epoch": 11.812630324694668, "grad_norm": 0.8887224197387695, "learning_rate": 2.1466750474530333e-05, "loss": 0.6382, "num_input_tokens_seen": 46043616, "step": 79310 }, { "epoch": 11.813375037235627, "grad_norm": 1.3685482740402222, "learning_rate": 2.1463533698857827e-05, "loss": 0.7346, "num_input_tokens_seen": 46046432, "step": 79315 }, { "epoch": 11.814119749776586, "grad_norm": 1.7640572786331177, "learning_rate": 2.1460316982930313e-05, "loss": 0.6288, "num_input_tokens_seen": 46049248, "step": 79320 }, { "epoch": 11.814864462317546, "grad_norm": 0.9572517275810242, "learning_rate": 2.1457100326802155e-05, "loss": 0.6883, "num_input_tokens_seen": 46052160, "step": 79325 }, { "epoch": 11.815609174858505, "grad_norm": 1.3226702213287354, "learning_rate": 2.1453883730527677e-05, "loss": 0.6206, "num_input_tokens_seen": 46055264, "step": 79330 }, { "epoch": 11.816353887399464, "grad_norm": 1.0596628189086914, "learning_rate": 2.145066719416124e-05, "loss": 0.5098, "num_input_tokens_seen": 46057952, "step": 79335 }, { "epoch": 11.817098599940422, "grad_norm": 2.453294277191162, "learning_rate": 2.1447450717757167e-05, "loss": 0.6708, "num_input_tokens_seen": 46061056, "step": 79340 }, { "epoch": 11.817843312481383, "grad_norm": 1.0599933862686157, "learning_rate": 2.144423430136981e-05, "loss": 0.5024, "num_input_tokens_seen": 46064000, "step": 79345 }, { "epoch": 11.818588025022342, "grad_norm": 1.1230260133743286, "learning_rate": 2.1441017945053497e-05, "loss": 0.6144, "num_input_tokens_seen": 46066816, "step": 79350 }, { "epoch": 11.8193327375633, "grad_norm": 1.259644627571106, "learning_rate": 2.143780164886256e-05, "loss": 0.4603, "num_input_tokens_seen": 46069728, "step": 79355 }, { "epoch": 11.82007745010426, "grad_norm": 1.3569425344467163, "learning_rate": 2.143458541285136e-05, "loss": 0.6715, "num_input_tokens_seen": 46072288, "step": 79360 }, { "epoch": 11.82082216264522, "grad_norm": 1.1295849084854126, "learning_rate": 2.1431369237074196e-05, "loss": 0.6215, "num_input_tokens_seen": 46075232, "step": 79365 }, { "epoch": 11.821566875186178, "grad_norm": 2.518129825592041, "learning_rate": 2.1428153121585438e-05, "loss": 0.6188, "num_input_tokens_seen": 46078112, "step": 79370 }, { "epoch": 11.822311587727137, "grad_norm": 1.106726884841919, "learning_rate": 2.1424937066439398e-05, "loss": 0.6574, "num_input_tokens_seen": 46081376, "step": 79375 }, { "epoch": 11.823056300268096, "grad_norm": 2.166952610015869, "learning_rate": 2.1421721071690415e-05, "loss": 0.594, "num_input_tokens_seen": 46084192, "step": 79380 }, { "epoch": 11.823801012809056, "grad_norm": 1.4569251537322998, "learning_rate": 2.141850513739282e-05, "loss": 0.5936, "num_input_tokens_seen": 46086816, "step": 79385 }, { "epoch": 11.824545725350015, "grad_norm": 1.486385703086853, "learning_rate": 2.1415289263600927e-05, "loss": 0.6032, "num_input_tokens_seen": 46089504, "step": 79390 }, { "epoch": 11.825290437890974, "grad_norm": 2.3359744548797607, "learning_rate": 2.1412073450369092e-05, "loss": 0.6571, "num_input_tokens_seen": 46092512, "step": 79395 }, { "epoch": 11.826035150431933, "grad_norm": 2.7450804710388184, "learning_rate": 2.1408857697751617e-05, "loss": 0.7119, "num_input_tokens_seen": 46095744, "step": 79400 }, { "epoch": 11.826779862972892, "grad_norm": 0.726097047328949, "learning_rate": 2.1405642005802852e-05, "loss": 0.5771, "num_input_tokens_seen": 46098720, "step": 79405 }, { "epoch": 11.827524575513852, "grad_norm": 2.088374376296997, "learning_rate": 2.1402426374577107e-05, "loss": 0.7498, "num_input_tokens_seen": 46101536, "step": 79410 }, { "epoch": 11.82826928805481, "grad_norm": 1.4845211505889893, "learning_rate": 2.139921080412872e-05, "loss": 0.6601, "num_input_tokens_seen": 46104608, "step": 79415 }, { "epoch": 11.82901400059577, "grad_norm": 2.0072813034057617, "learning_rate": 2.1395995294511993e-05, "loss": 0.7709, "num_input_tokens_seen": 46107584, "step": 79420 }, { "epoch": 11.82975871313673, "grad_norm": 0.8129653930664062, "learning_rate": 2.1392779845781275e-05, "loss": 0.6099, "num_input_tokens_seen": 46110592, "step": 79425 }, { "epoch": 11.830503425677689, "grad_norm": 1.65548574924469, "learning_rate": 2.1389564457990875e-05, "loss": 0.4717, "num_input_tokens_seen": 46113600, "step": 79430 }, { "epoch": 11.831248138218648, "grad_norm": 1.4559136629104614, "learning_rate": 2.1386349131195103e-05, "loss": 0.7372, "num_input_tokens_seen": 46116512, "step": 79435 }, { "epoch": 11.831992850759606, "grad_norm": 2.291902542114258, "learning_rate": 2.13831338654483e-05, "loss": 0.6456, "num_input_tokens_seen": 46119328, "step": 79440 }, { "epoch": 11.832737563300565, "grad_norm": 2.047653913497925, "learning_rate": 2.1379918660804766e-05, "loss": 0.7838, "num_input_tokens_seen": 46122144, "step": 79445 }, { "epoch": 11.833482275841526, "grad_norm": 1.1983771324157715, "learning_rate": 2.1376703517318837e-05, "loss": 0.5544, "num_input_tokens_seen": 46125216, "step": 79450 }, { "epoch": 11.834226988382484, "grad_norm": 1.3852925300598145, "learning_rate": 2.1373488435044804e-05, "loss": 0.5596, "num_input_tokens_seen": 46128032, "step": 79455 }, { "epoch": 11.834971700923443, "grad_norm": 0.8932711482048035, "learning_rate": 2.1370273414037013e-05, "loss": 0.5718, "num_input_tokens_seen": 46130816, "step": 79460 }, { "epoch": 11.835716413464402, "grad_norm": 0.8981162309646606, "learning_rate": 2.1367058454349763e-05, "loss": 0.5286, "num_input_tokens_seen": 46133952, "step": 79465 }, { "epoch": 11.836461126005362, "grad_norm": 0.6234611868858337, "learning_rate": 2.1363843556037365e-05, "loss": 0.6762, "num_input_tokens_seen": 46136864, "step": 79470 }, { "epoch": 11.837205838546321, "grad_norm": 0.9380226135253906, "learning_rate": 2.136062871915413e-05, "loss": 0.6173, "num_input_tokens_seen": 46139808, "step": 79475 }, { "epoch": 11.83795055108728, "grad_norm": 1.1510206460952759, "learning_rate": 2.1357413943754374e-05, "loss": 0.492, "num_input_tokens_seen": 46142848, "step": 79480 }, { "epoch": 11.838695263628239, "grad_norm": 1.484665870666504, "learning_rate": 2.1354199229892416e-05, "loss": 0.4688, "num_input_tokens_seen": 46145344, "step": 79485 }, { "epoch": 11.8394399761692, "grad_norm": 1.4508981704711914, "learning_rate": 2.1350984577622547e-05, "loss": 0.6837, "num_input_tokens_seen": 46148480, "step": 79490 }, { "epoch": 11.840184688710158, "grad_norm": 1.4445191621780396, "learning_rate": 2.1347769986999088e-05, "loss": 0.5006, "num_input_tokens_seen": 46151136, "step": 79495 }, { "epoch": 11.840929401251117, "grad_norm": 1.0384495258331299, "learning_rate": 2.1344555458076345e-05, "loss": 0.5826, "num_input_tokens_seen": 46154080, "step": 79500 }, { "epoch": 11.841674113792076, "grad_norm": 1.7382837533950806, "learning_rate": 2.1341340990908627e-05, "loss": 0.5559, "num_input_tokens_seen": 46156928, "step": 79505 }, { "epoch": 11.842418826333036, "grad_norm": 2.546950101852417, "learning_rate": 2.133812658555023e-05, "loss": 0.5512, "num_input_tokens_seen": 46159904, "step": 79510 }, { "epoch": 11.843163538873995, "grad_norm": 1.0332777500152588, "learning_rate": 2.1334912242055454e-05, "loss": 0.4624, "num_input_tokens_seen": 46162528, "step": 79515 }, { "epoch": 11.843908251414954, "grad_norm": 0.9958528876304626, "learning_rate": 2.1331697960478624e-05, "loss": 0.5922, "num_input_tokens_seen": 46165408, "step": 79520 }, { "epoch": 11.844652963955912, "grad_norm": 2.57863450050354, "learning_rate": 2.1328483740874014e-05, "loss": 0.8151, "num_input_tokens_seen": 46168384, "step": 79525 }, { "epoch": 11.845397676496873, "grad_norm": 1.2545826435089111, "learning_rate": 2.1325269583295953e-05, "loss": 0.5434, "num_input_tokens_seen": 46171552, "step": 79530 }, { "epoch": 11.846142389037832, "grad_norm": 3.265199661254883, "learning_rate": 2.132205548779872e-05, "loss": 0.8218, "num_input_tokens_seen": 46174496, "step": 79535 }, { "epoch": 11.84688710157879, "grad_norm": 0.9537932872772217, "learning_rate": 2.131884145443663e-05, "loss": 0.6765, "num_input_tokens_seen": 46177472, "step": 79540 }, { "epoch": 11.84763181411975, "grad_norm": 1.2636988162994385, "learning_rate": 2.131562748326397e-05, "loss": 0.8218, "num_input_tokens_seen": 46180192, "step": 79545 }, { "epoch": 11.84837652666071, "grad_norm": 2.9769656658172607, "learning_rate": 2.131241357433503e-05, "loss": 0.4754, "num_input_tokens_seen": 46183104, "step": 79550 }, { "epoch": 11.849121239201668, "grad_norm": 1.0291409492492676, "learning_rate": 2.1309199727704125e-05, "loss": 0.5946, "num_input_tokens_seen": 46185984, "step": 79555 }, { "epoch": 11.849865951742627, "grad_norm": 1.5826319456100464, "learning_rate": 2.130598594342553e-05, "loss": 0.5847, "num_input_tokens_seen": 46189120, "step": 79560 }, { "epoch": 11.850610664283586, "grad_norm": 0.8332474827766418, "learning_rate": 2.130277222155355e-05, "loss": 0.4727, "num_input_tokens_seen": 46191936, "step": 79565 }, { "epoch": 11.851355376824547, "grad_norm": 1.7557207345962524, "learning_rate": 2.129955856214248e-05, "loss": 0.6357, "num_input_tokens_seen": 46194592, "step": 79570 }, { "epoch": 11.852100089365505, "grad_norm": 2.1950106620788574, "learning_rate": 2.129634496524661e-05, "loss": 0.4402, "num_input_tokens_seen": 46197376, "step": 79575 }, { "epoch": 11.852844801906464, "grad_norm": 1.8447673320770264, "learning_rate": 2.1293131430920215e-05, "loss": 0.7126, "num_input_tokens_seen": 46200384, "step": 79580 }, { "epoch": 11.853589514447423, "grad_norm": 2.273792266845703, "learning_rate": 2.128991795921761e-05, "loss": 0.6183, "num_input_tokens_seen": 46203136, "step": 79585 }, { "epoch": 11.854334226988382, "grad_norm": 0.7509564161300659, "learning_rate": 2.128670455019307e-05, "loss": 0.6516, "num_input_tokens_seen": 46206016, "step": 79590 }, { "epoch": 11.855078939529342, "grad_norm": 1.2425909042358398, "learning_rate": 2.128349120390087e-05, "loss": 0.686, "num_input_tokens_seen": 46208736, "step": 79595 }, { "epoch": 11.8558236520703, "grad_norm": 1.5599277019500732, "learning_rate": 2.1280277920395322e-05, "loss": 0.6569, "num_input_tokens_seen": 46211808, "step": 79600 }, { "epoch": 11.85656836461126, "grad_norm": 1.4597474336624146, "learning_rate": 2.1277064699730694e-05, "loss": 0.4027, "num_input_tokens_seen": 46214848, "step": 79605 }, { "epoch": 11.85731307715222, "grad_norm": 1.181632399559021, "learning_rate": 2.1273851541961274e-05, "loss": 0.5782, "num_input_tokens_seen": 46217664, "step": 79610 }, { "epoch": 11.858057789693179, "grad_norm": 1.8220328092575073, "learning_rate": 2.1270638447141337e-05, "loss": 0.6888, "num_input_tokens_seen": 46220736, "step": 79615 }, { "epoch": 11.858802502234138, "grad_norm": 1.1476472616195679, "learning_rate": 2.1267425415325185e-05, "loss": 0.5948, "num_input_tokens_seen": 46223648, "step": 79620 }, { "epoch": 11.859547214775096, "grad_norm": 1.2595841884613037, "learning_rate": 2.1264212446567084e-05, "loss": 0.5489, "num_input_tokens_seen": 46226560, "step": 79625 }, { "epoch": 11.860291927316055, "grad_norm": 1.8241920471191406, "learning_rate": 2.1260999540921307e-05, "loss": 0.6226, "num_input_tokens_seen": 46229568, "step": 79630 }, { "epoch": 11.861036639857016, "grad_norm": 0.9536088705062866, "learning_rate": 2.1257786698442155e-05, "loss": 0.4888, "num_input_tokens_seen": 46232416, "step": 79635 }, { "epoch": 11.861781352397974, "grad_norm": 2.1590993404388428, "learning_rate": 2.125457391918389e-05, "loss": 0.7592, "num_input_tokens_seen": 46235392, "step": 79640 }, { "epoch": 11.862526064938933, "grad_norm": 1.0352511405944824, "learning_rate": 2.1251361203200793e-05, "loss": 0.4541, "num_input_tokens_seen": 46238400, "step": 79645 }, { "epoch": 11.863270777479892, "grad_norm": 1.961483359336853, "learning_rate": 2.124814855054713e-05, "loss": 0.5447, "num_input_tokens_seen": 46241408, "step": 79650 }, { "epoch": 11.864015490020853, "grad_norm": 0.904312014579773, "learning_rate": 2.1244935961277197e-05, "loss": 0.4686, "num_input_tokens_seen": 46244320, "step": 79655 }, { "epoch": 11.864760202561811, "grad_norm": 1.0963906049728394, "learning_rate": 2.124172343544524e-05, "loss": 0.5431, "num_input_tokens_seen": 46247008, "step": 79660 }, { "epoch": 11.86550491510277, "grad_norm": 1.2106446027755737, "learning_rate": 2.123851097310556e-05, "loss": 0.6333, "num_input_tokens_seen": 46250112, "step": 79665 }, { "epoch": 11.866249627643729, "grad_norm": 1.1983507871627808, "learning_rate": 2.1235298574312405e-05, "loss": 0.5822, "num_input_tokens_seen": 46253088, "step": 79670 }, { "epoch": 11.86699434018469, "grad_norm": 0.9906798601150513, "learning_rate": 2.123208623912006e-05, "loss": 0.8401, "num_input_tokens_seen": 46256128, "step": 79675 }, { "epoch": 11.867739052725648, "grad_norm": 2.1980137825012207, "learning_rate": 2.1228873967582787e-05, "loss": 0.7098, "num_input_tokens_seen": 46258880, "step": 79680 }, { "epoch": 11.868483765266607, "grad_norm": 1.1095718145370483, "learning_rate": 2.1225661759754848e-05, "loss": 0.5003, "num_input_tokens_seen": 46262112, "step": 79685 }, { "epoch": 11.869228477807566, "grad_norm": 0.7558479905128479, "learning_rate": 2.1222449615690525e-05, "loss": 0.5636, "num_input_tokens_seen": 46265120, "step": 79690 }, { "epoch": 11.869973190348526, "grad_norm": 2.1504156589508057, "learning_rate": 2.121923753544407e-05, "loss": 0.7146, "num_input_tokens_seen": 46267712, "step": 79695 }, { "epoch": 11.870717902889485, "grad_norm": 1.456697940826416, "learning_rate": 2.1216025519069766e-05, "loss": 0.5074, "num_input_tokens_seen": 46270560, "step": 79700 }, { "epoch": 11.871462615430444, "grad_norm": 1.9572612047195435, "learning_rate": 2.121281356662186e-05, "loss": 0.6553, "num_input_tokens_seen": 46273632, "step": 79705 }, { "epoch": 11.872207327971402, "grad_norm": 1.4988782405853271, "learning_rate": 2.1209601678154615e-05, "loss": 0.6328, "num_input_tokens_seen": 46276416, "step": 79710 }, { "epoch": 11.872952040512363, "grad_norm": 1.6069438457489014, "learning_rate": 2.1206389853722306e-05, "loss": 0.5151, "num_input_tokens_seen": 46279200, "step": 79715 }, { "epoch": 11.873696753053322, "grad_norm": 1.5777838230133057, "learning_rate": 2.1203178093379172e-05, "loss": 0.5551, "num_input_tokens_seen": 46282016, "step": 79720 }, { "epoch": 11.87444146559428, "grad_norm": 3.552769184112549, "learning_rate": 2.1199966397179492e-05, "loss": 0.6466, "num_input_tokens_seen": 46285024, "step": 79725 }, { "epoch": 11.87518617813524, "grad_norm": 1.1202892065048218, "learning_rate": 2.1196754765177514e-05, "loss": 0.5422, "num_input_tokens_seen": 46287872, "step": 79730 }, { "epoch": 11.8759308906762, "grad_norm": 1.0886775255203247, "learning_rate": 2.1193543197427507e-05, "loss": 0.63, "num_input_tokens_seen": 46291040, "step": 79735 }, { "epoch": 11.876675603217159, "grad_norm": 1.8161746263504028, "learning_rate": 2.119033169398371e-05, "loss": 0.5817, "num_input_tokens_seen": 46293920, "step": 79740 }, { "epoch": 11.877420315758117, "grad_norm": 0.9511037468910217, "learning_rate": 2.1187120254900397e-05, "loss": 0.5016, "num_input_tokens_seen": 46296768, "step": 79745 }, { "epoch": 11.878165028299076, "grad_norm": 2.206996202468872, "learning_rate": 2.118390888023181e-05, "loss": 0.5268, "num_input_tokens_seen": 46299648, "step": 79750 }, { "epoch": 11.878909740840037, "grad_norm": 1.3358601331710815, "learning_rate": 2.1180697570032195e-05, "loss": 0.475, "num_input_tokens_seen": 46302592, "step": 79755 }, { "epoch": 11.879654453380995, "grad_norm": 2.031369924545288, "learning_rate": 2.117748632435582e-05, "loss": 0.7196, "num_input_tokens_seen": 46305728, "step": 79760 }, { "epoch": 11.880399165921954, "grad_norm": 2.0854287147521973, "learning_rate": 2.1174275143256927e-05, "loss": 0.7686, "num_input_tokens_seen": 46308544, "step": 79765 }, { "epoch": 11.881143878462913, "grad_norm": 1.9553231000900269, "learning_rate": 2.1171064026789768e-05, "loss": 0.6977, "num_input_tokens_seen": 46311520, "step": 79770 }, { "epoch": 11.881888591003872, "grad_norm": 1.7703726291656494, "learning_rate": 2.1167852975008587e-05, "loss": 0.4467, "num_input_tokens_seen": 46314336, "step": 79775 }, { "epoch": 11.882633303544832, "grad_norm": 2.491654872894287, "learning_rate": 2.1164641987967638e-05, "loss": 0.667, "num_input_tokens_seen": 46317344, "step": 79780 }, { "epoch": 11.883378016085791, "grad_norm": 1.4319409132003784, "learning_rate": 2.116143106572117e-05, "loss": 0.535, "num_input_tokens_seen": 46320224, "step": 79785 }, { "epoch": 11.88412272862675, "grad_norm": 1.932652235031128, "learning_rate": 2.115822020832341e-05, "loss": 0.7162, "num_input_tokens_seen": 46323104, "step": 79790 }, { "epoch": 11.884867441167708, "grad_norm": 2.6632680892944336, "learning_rate": 2.1155009415828628e-05, "loss": 0.8278, "num_input_tokens_seen": 46326048, "step": 79795 }, { "epoch": 11.885612153708669, "grad_norm": 1.4094566106796265, "learning_rate": 2.1151798688291046e-05, "loss": 0.6561, "num_input_tokens_seen": 46329376, "step": 79800 }, { "epoch": 11.886356866249628, "grad_norm": 1.7840194702148438, "learning_rate": 2.1148588025764916e-05, "loss": 0.7167, "num_input_tokens_seen": 46332064, "step": 79805 }, { "epoch": 11.887101578790586, "grad_norm": 1.867262601852417, "learning_rate": 2.1145377428304476e-05, "loss": 0.6038, "num_input_tokens_seen": 46334944, "step": 79810 }, { "epoch": 11.887846291331545, "grad_norm": 1.3135316371917725, "learning_rate": 2.1142166895963973e-05, "loss": 0.7595, "num_input_tokens_seen": 46338016, "step": 79815 }, { "epoch": 11.888591003872506, "grad_norm": 1.7234619855880737, "learning_rate": 2.1138956428797624e-05, "loss": 0.6176, "num_input_tokens_seen": 46340960, "step": 79820 }, { "epoch": 11.889335716413465, "grad_norm": 1.16163969039917, "learning_rate": 2.1135746026859697e-05, "loss": 0.5617, "num_input_tokens_seen": 46343968, "step": 79825 }, { "epoch": 11.890080428954423, "grad_norm": 1.97053062915802, "learning_rate": 2.1132535690204415e-05, "loss": 0.4427, "num_input_tokens_seen": 46347168, "step": 79830 }, { "epoch": 11.890825141495382, "grad_norm": 1.4676024913787842, "learning_rate": 2.1129325418886e-05, "loss": 0.7551, "num_input_tokens_seen": 46349952, "step": 79835 }, { "epoch": 11.891569854036343, "grad_norm": 1.3203555345535278, "learning_rate": 2.1126115212958708e-05, "loss": 0.4509, "num_input_tokens_seen": 46352768, "step": 79840 }, { "epoch": 11.892314566577301, "grad_norm": 1.3278838396072388, "learning_rate": 2.112290507247675e-05, "loss": 0.6577, "num_input_tokens_seen": 46355776, "step": 79845 }, { "epoch": 11.89305927911826, "grad_norm": 2.4999194145202637, "learning_rate": 2.1119694997494382e-05, "loss": 0.7871, "num_input_tokens_seen": 46358624, "step": 79850 }, { "epoch": 11.893803991659219, "grad_norm": 1.7995185852050781, "learning_rate": 2.1116484988065813e-05, "loss": 0.6313, "num_input_tokens_seen": 46361504, "step": 79855 }, { "epoch": 11.89454870420018, "grad_norm": 1.4095391035079956, "learning_rate": 2.1113275044245293e-05, "loss": 0.6449, "num_input_tokens_seen": 46364384, "step": 79860 }, { "epoch": 11.895293416741138, "grad_norm": 2.3473939895629883, "learning_rate": 2.1110065166087037e-05, "loss": 0.6945, "num_input_tokens_seen": 46367168, "step": 79865 }, { "epoch": 11.896038129282097, "grad_norm": 1.2971930503845215, "learning_rate": 2.110685535364528e-05, "loss": 0.6977, "num_input_tokens_seen": 46370048, "step": 79870 }, { "epoch": 11.896782841823056, "grad_norm": 1.143734335899353, "learning_rate": 2.1103645606974244e-05, "loss": 0.4567, "num_input_tokens_seen": 46372608, "step": 79875 }, { "epoch": 11.897527554364016, "grad_norm": 1.00949227809906, "learning_rate": 2.1100435926128146e-05, "loss": 0.4414, "num_input_tokens_seen": 46375584, "step": 79880 }, { "epoch": 11.898272266904975, "grad_norm": 1.3623820543289185, "learning_rate": 2.1097226311161232e-05, "loss": 0.6508, "num_input_tokens_seen": 46378368, "step": 79885 }, { "epoch": 11.899016979445934, "grad_norm": 1.7184205055236816, "learning_rate": 2.1094016762127698e-05, "loss": 0.6475, "num_input_tokens_seen": 46381248, "step": 79890 }, { "epoch": 11.899761691986892, "grad_norm": 1.2585256099700928, "learning_rate": 2.10908072790818e-05, "loss": 0.6846, "num_input_tokens_seen": 46384160, "step": 79895 }, { "epoch": 11.900506404527853, "grad_norm": 1.6359502077102661, "learning_rate": 2.1087597862077726e-05, "loss": 0.5207, "num_input_tokens_seen": 46386880, "step": 79900 }, { "epoch": 11.901251117068812, "grad_norm": 2.1825811862945557, "learning_rate": 2.1084388511169718e-05, "loss": 0.6077, "num_input_tokens_seen": 46389408, "step": 79905 }, { "epoch": 11.90199582960977, "grad_norm": 2.1270248889923096, "learning_rate": 2.1081179226411985e-05, "loss": 0.6493, "num_input_tokens_seen": 46392576, "step": 79910 }, { "epoch": 11.90274054215073, "grad_norm": 1.8761850595474243, "learning_rate": 2.107797000785874e-05, "loss": 0.5862, "num_input_tokens_seen": 46395456, "step": 79915 }, { "epoch": 11.90348525469169, "grad_norm": 1.8317790031433105, "learning_rate": 2.107476085556421e-05, "loss": 0.65, "num_input_tokens_seen": 46398304, "step": 79920 }, { "epoch": 11.904229967232649, "grad_norm": 1.2904478311538696, "learning_rate": 2.10715517695826e-05, "loss": 0.6108, "num_input_tokens_seen": 46401120, "step": 79925 }, { "epoch": 11.904974679773607, "grad_norm": 1.3811472654342651, "learning_rate": 2.106834274996814e-05, "loss": 0.5994, "num_input_tokens_seen": 46404128, "step": 79930 }, { "epoch": 11.905719392314566, "grad_norm": 1.1068400144577026, "learning_rate": 2.1065133796775026e-05, "loss": 0.4504, "num_input_tokens_seen": 46406880, "step": 79935 }, { "epoch": 11.906464104855527, "grad_norm": 1.536880373954773, "learning_rate": 2.1061924910057485e-05, "loss": 0.615, "num_input_tokens_seen": 46409952, "step": 79940 }, { "epoch": 11.907208817396485, "grad_norm": 2.3740158081054688, "learning_rate": 2.1058716089869707e-05, "loss": 0.4625, "num_input_tokens_seen": 46413088, "step": 79945 }, { "epoch": 11.907953529937444, "grad_norm": 1.544098973274231, "learning_rate": 2.1055507336265925e-05, "loss": 0.5395, "num_input_tokens_seen": 46415968, "step": 79950 }, { "epoch": 11.908698242478403, "grad_norm": 2.1581404209136963, "learning_rate": 2.105229864930034e-05, "loss": 0.7015, "num_input_tokens_seen": 46418656, "step": 79955 }, { "epoch": 11.909442955019362, "grad_norm": 1.7379788160324097, "learning_rate": 2.1049090029027146e-05, "loss": 0.7231, "num_input_tokens_seen": 46421600, "step": 79960 }, { "epoch": 11.910187667560322, "grad_norm": 1.7013959884643555, "learning_rate": 2.104588147550057e-05, "loss": 0.5914, "num_input_tokens_seen": 46424512, "step": 79965 }, { "epoch": 11.910932380101281, "grad_norm": 1.1714673042297363, "learning_rate": 2.1042672988774805e-05, "loss": 0.4979, "num_input_tokens_seen": 46427488, "step": 79970 }, { "epoch": 11.91167709264224, "grad_norm": 1.954286813735962, "learning_rate": 2.103946456890406e-05, "loss": 0.5039, "num_input_tokens_seen": 46430112, "step": 79975 }, { "epoch": 11.912421805183198, "grad_norm": 2.6917433738708496, "learning_rate": 2.1036256215942526e-05, "loss": 0.5644, "num_input_tokens_seen": 46433024, "step": 79980 }, { "epoch": 11.913166517724159, "grad_norm": 1.2810322046279907, "learning_rate": 2.1033047929944427e-05, "loss": 0.5061, "num_input_tokens_seen": 46435904, "step": 79985 }, { "epoch": 11.913911230265118, "grad_norm": 2.319223165512085, "learning_rate": 2.102983971096395e-05, "loss": 0.6358, "num_input_tokens_seen": 46438688, "step": 79990 }, { "epoch": 11.914655942806077, "grad_norm": 1.3619805574417114, "learning_rate": 2.1026631559055285e-05, "loss": 0.5752, "num_input_tokens_seen": 46441312, "step": 79995 }, { "epoch": 11.915400655347035, "grad_norm": 2.127516031265259, "learning_rate": 2.1023423474272652e-05, "loss": 0.7339, "num_input_tokens_seen": 46444256, "step": 80000 }, { "epoch": 11.916145367887996, "grad_norm": 1.5907328128814697, "learning_rate": 2.1020215456670234e-05, "loss": 0.7334, "num_input_tokens_seen": 46447200, "step": 80005 }, { "epoch": 11.916890080428955, "grad_norm": 1.464626431465149, "learning_rate": 2.1017007506302233e-05, "loss": 0.7562, "num_input_tokens_seen": 46450048, "step": 80010 }, { "epoch": 11.917634792969913, "grad_norm": 1.2853963375091553, "learning_rate": 2.1013799623222833e-05, "loss": 0.5954, "num_input_tokens_seen": 46453088, "step": 80015 }, { "epoch": 11.918379505510872, "grad_norm": 1.6595098972320557, "learning_rate": 2.1010591807486253e-05, "loss": 0.5963, "num_input_tokens_seen": 46456256, "step": 80020 }, { "epoch": 11.919124218051833, "grad_norm": 1.2183326482772827, "learning_rate": 2.100738405914665e-05, "loss": 0.5711, "num_input_tokens_seen": 46459040, "step": 80025 }, { "epoch": 11.919868930592791, "grad_norm": 1.3733595609664917, "learning_rate": 2.1004176378258252e-05, "loss": 0.6043, "num_input_tokens_seen": 46461952, "step": 80030 }, { "epoch": 11.92061364313375, "grad_norm": 3.478071928024292, "learning_rate": 2.100096876487523e-05, "loss": 0.6814, "num_input_tokens_seen": 46464896, "step": 80035 }, { "epoch": 11.921358355674709, "grad_norm": 1.6770143508911133, "learning_rate": 2.0997761219051777e-05, "loss": 0.5478, "num_input_tokens_seen": 46467680, "step": 80040 }, { "epoch": 11.92210306821567, "grad_norm": 0.6689898371696472, "learning_rate": 2.099455374084208e-05, "loss": 0.4822, "num_input_tokens_seen": 46470432, "step": 80045 }, { "epoch": 11.922847780756628, "grad_norm": 1.7781355381011963, "learning_rate": 2.0991346330300314e-05, "loss": 0.4689, "num_input_tokens_seen": 46473408, "step": 80050 }, { "epoch": 11.923592493297587, "grad_norm": 0.8123664259910583, "learning_rate": 2.0988138987480694e-05, "loss": 0.6692, "num_input_tokens_seen": 46476096, "step": 80055 }, { "epoch": 11.924337205838546, "grad_norm": 0.8009713888168335, "learning_rate": 2.0984931712437377e-05, "loss": 0.5286, "num_input_tokens_seen": 46479168, "step": 80060 }, { "epoch": 11.925081918379506, "grad_norm": 2.9839818477630615, "learning_rate": 2.0981724505224563e-05, "loss": 0.6732, "num_input_tokens_seen": 46482240, "step": 80065 }, { "epoch": 11.925826630920465, "grad_norm": 1.4881268739700317, "learning_rate": 2.0978517365896433e-05, "loss": 0.5576, "num_input_tokens_seen": 46485088, "step": 80070 }, { "epoch": 11.926571343461424, "grad_norm": 3.2438104152679443, "learning_rate": 2.0975310294507162e-05, "loss": 0.7968, "num_input_tokens_seen": 46487840, "step": 80075 }, { "epoch": 11.927316056002383, "grad_norm": 1.6346299648284912, "learning_rate": 2.0972103291110933e-05, "loss": 0.5569, "num_input_tokens_seen": 46490976, "step": 80080 }, { "epoch": 11.928060768543343, "grad_norm": 1.0973007678985596, "learning_rate": 2.096889635576192e-05, "loss": 0.6377, "num_input_tokens_seen": 46493536, "step": 80085 }, { "epoch": 11.928805481084302, "grad_norm": 3.7487430572509766, "learning_rate": 2.0965689488514314e-05, "loss": 0.526, "num_input_tokens_seen": 46496416, "step": 80090 }, { "epoch": 11.92955019362526, "grad_norm": 1.9919935464859009, "learning_rate": 2.0962482689422276e-05, "loss": 0.5324, "num_input_tokens_seen": 46499264, "step": 80095 }, { "epoch": 11.93029490616622, "grad_norm": 1.3651530742645264, "learning_rate": 2.0959275958539996e-05, "loss": 0.7218, "num_input_tokens_seen": 46501920, "step": 80100 }, { "epoch": 11.931039618707178, "grad_norm": 1.9583216905593872, "learning_rate": 2.095606929592164e-05, "loss": 0.544, "num_input_tokens_seen": 46504704, "step": 80105 }, { "epoch": 11.931784331248139, "grad_norm": 0.9995017051696777, "learning_rate": 2.0952862701621385e-05, "loss": 0.5507, "num_input_tokens_seen": 46507552, "step": 80110 }, { "epoch": 11.932529043789097, "grad_norm": 1.5378566980361938, "learning_rate": 2.09496561756934e-05, "loss": 0.6709, "num_input_tokens_seen": 46510080, "step": 80115 }, { "epoch": 11.933273756330056, "grad_norm": 1.2660949230194092, "learning_rate": 2.094644971819185e-05, "loss": 0.4964, "num_input_tokens_seen": 46512736, "step": 80120 }, { "epoch": 11.934018468871017, "grad_norm": 1.189500331878662, "learning_rate": 2.0943243329170922e-05, "loss": 0.5789, "num_input_tokens_seen": 46515584, "step": 80125 }, { "epoch": 11.934763181411975, "grad_norm": 1.5133799314498901, "learning_rate": 2.0940037008684772e-05, "loss": 0.4673, "num_input_tokens_seen": 46518688, "step": 80130 }, { "epoch": 11.935507893952934, "grad_norm": 1.6281076669692993, "learning_rate": 2.0936830756787568e-05, "loss": 0.6202, "num_input_tokens_seen": 46521568, "step": 80135 }, { "epoch": 11.936252606493893, "grad_norm": 0.9584008455276489, "learning_rate": 2.0933624573533477e-05, "loss": 0.609, "num_input_tokens_seen": 46524480, "step": 80140 }, { "epoch": 11.936997319034852, "grad_norm": 1.2641690969467163, "learning_rate": 2.0930418458976676e-05, "loss": 0.4885, "num_input_tokens_seen": 46527648, "step": 80145 }, { "epoch": 11.937742031575812, "grad_norm": 1.9995379447937012, "learning_rate": 2.0927212413171316e-05, "loss": 0.5528, "num_input_tokens_seen": 46530784, "step": 80150 }, { "epoch": 11.938486744116771, "grad_norm": 1.4149514436721802, "learning_rate": 2.092400643617155e-05, "loss": 0.525, "num_input_tokens_seen": 46533664, "step": 80155 }, { "epoch": 11.93923145665773, "grad_norm": 1.1741864681243896, "learning_rate": 2.092080052803157e-05, "loss": 0.634, "num_input_tokens_seen": 46536512, "step": 80160 }, { "epoch": 11.939976169198689, "grad_norm": 1.9622585773468018, "learning_rate": 2.0917594688805507e-05, "loss": 0.4701, "num_input_tokens_seen": 46539584, "step": 80165 }, { "epoch": 11.940720881739649, "grad_norm": 1.442460536956787, "learning_rate": 2.091438891854754e-05, "loss": 0.6187, "num_input_tokens_seen": 46542624, "step": 80170 }, { "epoch": 11.941465594280608, "grad_norm": 1.6068830490112305, "learning_rate": 2.091118321731181e-05, "loss": 0.5734, "num_input_tokens_seen": 46545632, "step": 80175 }, { "epoch": 11.942210306821567, "grad_norm": 1.1337389945983887, "learning_rate": 2.0907977585152495e-05, "loss": 0.4915, "num_input_tokens_seen": 46548256, "step": 80180 }, { "epoch": 11.942955019362525, "grad_norm": 1.6990212202072144, "learning_rate": 2.0904772022123725e-05, "loss": 0.6391, "num_input_tokens_seen": 46551040, "step": 80185 }, { "epoch": 11.943699731903486, "grad_norm": 1.2250645160675049, "learning_rate": 2.0901566528279687e-05, "loss": 0.5903, "num_input_tokens_seen": 46553728, "step": 80190 }, { "epoch": 11.944444444444445, "grad_norm": 1.7463972568511963, "learning_rate": 2.089836110367451e-05, "loss": 0.576, "num_input_tokens_seen": 46556800, "step": 80195 }, { "epoch": 11.945189156985403, "grad_norm": 2.268146514892578, "learning_rate": 2.0895155748362353e-05, "loss": 0.6164, "num_input_tokens_seen": 46559840, "step": 80200 }, { "epoch": 11.945933869526362, "grad_norm": 1.6564316749572754, "learning_rate": 2.0891950462397372e-05, "loss": 0.7317, "num_input_tokens_seen": 46562336, "step": 80205 }, { "epoch": 11.946678582067323, "grad_norm": 0.9550554752349854, "learning_rate": 2.0888745245833703e-05, "loss": 0.5594, "num_input_tokens_seen": 46565056, "step": 80210 }, { "epoch": 11.947423294608281, "grad_norm": 3.192814588546753, "learning_rate": 2.0885540098725513e-05, "loss": 0.6197, "num_input_tokens_seen": 46568032, "step": 80215 }, { "epoch": 11.94816800714924, "grad_norm": 1.8450168371200562, "learning_rate": 2.088233502112693e-05, "loss": 0.4323, "num_input_tokens_seen": 46570656, "step": 80220 }, { "epoch": 11.948912719690199, "grad_norm": 1.3188424110412598, "learning_rate": 2.0879130013092124e-05, "loss": 0.5691, "num_input_tokens_seen": 46573568, "step": 80225 }, { "epoch": 11.94965743223116, "grad_norm": 1.6004523038864136, "learning_rate": 2.087592507467523e-05, "loss": 0.6983, "num_input_tokens_seen": 46576384, "step": 80230 }, { "epoch": 11.950402144772118, "grad_norm": 1.471946358680725, "learning_rate": 2.087272020593038e-05, "loss": 0.5491, "num_input_tokens_seen": 46579424, "step": 80235 }, { "epoch": 11.951146857313077, "grad_norm": 0.7221354246139526, "learning_rate": 2.086951540691174e-05, "loss": 0.3552, "num_input_tokens_seen": 46582048, "step": 80240 }, { "epoch": 11.951891569854036, "grad_norm": 1.4235892295837402, "learning_rate": 2.086631067767342e-05, "loss": 0.5299, "num_input_tokens_seen": 46584768, "step": 80245 }, { "epoch": 11.952636282394996, "grad_norm": 0.9213985204696655, "learning_rate": 2.0863106018269596e-05, "loss": 0.6455, "num_input_tokens_seen": 46587872, "step": 80250 }, { "epoch": 11.953380994935955, "grad_norm": 1.1981112957000732, "learning_rate": 2.085990142875438e-05, "loss": 0.7034, "num_input_tokens_seen": 46590880, "step": 80255 }, { "epoch": 11.954125707476914, "grad_norm": 2.221647262573242, "learning_rate": 2.0856696909181932e-05, "loss": 0.6676, "num_input_tokens_seen": 46593600, "step": 80260 }, { "epoch": 11.954870420017873, "grad_norm": 0.9341117143630981, "learning_rate": 2.0853492459606373e-05, "loss": 0.5701, "num_input_tokens_seen": 46596320, "step": 80265 }, { "epoch": 11.955615132558833, "grad_norm": 1.1275362968444824, "learning_rate": 2.085028808008185e-05, "loss": 0.6257, "num_input_tokens_seen": 46599008, "step": 80270 }, { "epoch": 11.956359845099792, "grad_norm": 1.2284802198410034, "learning_rate": 2.084708377066249e-05, "loss": 0.7295, "num_input_tokens_seen": 46602112, "step": 80275 }, { "epoch": 11.95710455764075, "grad_norm": 1.8508738279342651, "learning_rate": 2.084387953140242e-05, "loss": 0.7628, "num_input_tokens_seen": 46605216, "step": 80280 }, { "epoch": 11.95784927018171, "grad_norm": 0.975013017654419, "learning_rate": 2.0840675362355792e-05, "loss": 0.6756, "num_input_tokens_seen": 46608352, "step": 80285 }, { "epoch": 11.958593982722668, "grad_norm": 1.1567018032073975, "learning_rate": 2.0837471263576716e-05, "loss": 0.7579, "num_input_tokens_seen": 46611136, "step": 80290 }, { "epoch": 11.959338695263629, "grad_norm": 1.162761926651001, "learning_rate": 2.0834267235119342e-05, "loss": 0.5841, "num_input_tokens_seen": 46613824, "step": 80295 }, { "epoch": 11.960083407804587, "grad_norm": 1.4342460632324219, "learning_rate": 2.083106327703779e-05, "loss": 0.5407, "num_input_tokens_seen": 46616896, "step": 80300 }, { "epoch": 11.960828120345546, "grad_norm": 0.8160017132759094, "learning_rate": 2.0827859389386184e-05, "loss": 0.6924, "num_input_tokens_seen": 46619680, "step": 80305 }, { "epoch": 11.961572832886507, "grad_norm": 2.4975550174713135, "learning_rate": 2.0824655572218655e-05, "loss": 0.5291, "num_input_tokens_seen": 46622336, "step": 80310 }, { "epoch": 11.962317545427466, "grad_norm": 1.192315936088562, "learning_rate": 2.0821451825589315e-05, "loss": 0.7669, "num_input_tokens_seen": 46625184, "step": 80315 }, { "epoch": 11.963062257968424, "grad_norm": 2.3944287300109863, "learning_rate": 2.0818248149552315e-05, "loss": 0.7001, "num_input_tokens_seen": 46627840, "step": 80320 }, { "epoch": 11.963806970509383, "grad_norm": 1.2485274076461792, "learning_rate": 2.0815044544161748e-05, "loss": 0.5411, "num_input_tokens_seen": 46630816, "step": 80325 }, { "epoch": 11.964551683050342, "grad_norm": 1.1191288232803345, "learning_rate": 2.081184100947176e-05, "loss": 0.4564, "num_input_tokens_seen": 46634208, "step": 80330 }, { "epoch": 11.965296395591302, "grad_norm": 0.8839839696884155, "learning_rate": 2.080863754553646e-05, "loss": 0.3152, "num_input_tokens_seen": 46637024, "step": 80335 }, { "epoch": 11.966041108132261, "grad_norm": 1.3588443994522095, "learning_rate": 2.080543415240997e-05, "loss": 0.5674, "num_input_tokens_seen": 46639872, "step": 80340 }, { "epoch": 11.96678582067322, "grad_norm": 3.274462938308716, "learning_rate": 2.0802230830146398e-05, "loss": 0.5823, "num_input_tokens_seen": 46642624, "step": 80345 }, { "epoch": 11.967530533214179, "grad_norm": 1.6649476289749146, "learning_rate": 2.0799027578799882e-05, "loss": 0.5783, "num_input_tokens_seen": 46645216, "step": 80350 }, { "epoch": 11.96827524575514, "grad_norm": 1.4059913158416748, "learning_rate": 2.0795824398424523e-05, "loss": 0.5122, "num_input_tokens_seen": 46648096, "step": 80355 }, { "epoch": 11.969019958296098, "grad_norm": 1.8398911952972412, "learning_rate": 2.079262128907443e-05, "loss": 0.5336, "num_input_tokens_seen": 46650976, "step": 80360 }, { "epoch": 11.969764670837057, "grad_norm": 1.5549042224884033, "learning_rate": 2.0789418250803732e-05, "loss": 0.5647, "num_input_tokens_seen": 46654208, "step": 80365 }, { "epoch": 11.970509383378015, "grad_norm": 1.8861838579177856, "learning_rate": 2.078621528366653e-05, "loss": 0.4617, "num_input_tokens_seen": 46656768, "step": 80370 }, { "epoch": 11.971254095918976, "grad_norm": 1.156272292137146, "learning_rate": 2.078301238771694e-05, "loss": 0.4267, "num_input_tokens_seen": 46659424, "step": 80375 }, { "epoch": 11.971998808459935, "grad_norm": 1.4239709377288818, "learning_rate": 2.0779809563009063e-05, "loss": 0.6593, "num_input_tokens_seen": 46662336, "step": 80380 }, { "epoch": 11.972743521000893, "grad_norm": 2.495598077774048, "learning_rate": 2.0776606809597022e-05, "loss": 0.7246, "num_input_tokens_seen": 46665152, "step": 80385 }, { "epoch": 11.973488233541852, "grad_norm": 1.383339285850525, "learning_rate": 2.077340412753492e-05, "loss": 0.5799, "num_input_tokens_seen": 46668032, "step": 80390 }, { "epoch": 11.974232946082813, "grad_norm": 1.9588044881820679, "learning_rate": 2.077020151687684e-05, "loss": 0.8427, "num_input_tokens_seen": 46670944, "step": 80395 }, { "epoch": 11.974977658623772, "grad_norm": 2.283083200454712, "learning_rate": 2.0766998977676928e-05, "loss": 0.666, "num_input_tokens_seen": 46673664, "step": 80400 }, { "epoch": 11.97572237116473, "grad_norm": 2.0188775062561035, "learning_rate": 2.0763796509989252e-05, "loss": 0.5804, "num_input_tokens_seen": 46676640, "step": 80405 }, { "epoch": 11.976467083705689, "grad_norm": 1.0422582626342773, "learning_rate": 2.076059411386794e-05, "loss": 0.5803, "num_input_tokens_seen": 46679648, "step": 80410 }, { "epoch": 11.97721179624665, "grad_norm": 1.1680351495742798, "learning_rate": 2.075739178936707e-05, "loss": 0.6058, "num_input_tokens_seen": 46682528, "step": 80415 }, { "epoch": 11.977956508787608, "grad_norm": 1.530016303062439, "learning_rate": 2.0754189536540764e-05, "loss": 0.5918, "num_input_tokens_seen": 46685920, "step": 80420 }, { "epoch": 11.978701221328567, "grad_norm": 0.8791453242301941, "learning_rate": 2.07509873554431e-05, "loss": 0.6019, "num_input_tokens_seen": 46688672, "step": 80425 }, { "epoch": 11.979445933869526, "grad_norm": 1.1354819536209106, "learning_rate": 2.074778524612819e-05, "loss": 0.4266, "num_input_tokens_seen": 46691776, "step": 80430 }, { "epoch": 11.980190646410486, "grad_norm": 1.2865060567855835, "learning_rate": 2.0744583208650136e-05, "loss": 0.5094, "num_input_tokens_seen": 46694656, "step": 80435 }, { "epoch": 11.980935358951445, "grad_norm": 1.3026844263076782, "learning_rate": 2.0741381243063015e-05, "loss": 0.7108, "num_input_tokens_seen": 46697312, "step": 80440 }, { "epoch": 11.981680071492404, "grad_norm": 1.2894809246063232, "learning_rate": 2.0738179349420935e-05, "loss": 0.3898, "num_input_tokens_seen": 46700256, "step": 80445 }, { "epoch": 11.982424784033363, "grad_norm": 1.6827709674835205, "learning_rate": 2.0734977527777974e-05, "loss": 0.4664, "num_input_tokens_seen": 46703136, "step": 80450 }, { "epoch": 11.983169496574323, "grad_norm": 1.0872669219970703, "learning_rate": 2.0731775778188242e-05, "loss": 0.6367, "num_input_tokens_seen": 46706048, "step": 80455 }, { "epoch": 11.983914209115282, "grad_norm": 1.4288606643676758, "learning_rate": 2.0728574100705813e-05, "loss": 0.5478, "num_input_tokens_seen": 46708832, "step": 80460 }, { "epoch": 11.98465892165624, "grad_norm": 1.7705155611038208, "learning_rate": 2.0725372495384786e-05, "loss": 0.7042, "num_input_tokens_seen": 46711648, "step": 80465 }, { "epoch": 11.9854036341972, "grad_norm": 1.9533036947250366, "learning_rate": 2.0722170962279248e-05, "loss": 0.6814, "num_input_tokens_seen": 46714752, "step": 80470 }, { "epoch": 11.986148346738158, "grad_norm": 1.1426851749420166, "learning_rate": 2.0718969501443286e-05, "loss": 0.6108, "num_input_tokens_seen": 46717888, "step": 80475 }, { "epoch": 11.986893059279119, "grad_norm": 1.3722445964813232, "learning_rate": 2.0715768112930984e-05, "loss": 0.5349, "num_input_tokens_seen": 46720736, "step": 80480 }, { "epoch": 11.987637771820078, "grad_norm": 2.767773389816284, "learning_rate": 2.071256679679641e-05, "loss": 0.7005, "num_input_tokens_seen": 46723520, "step": 80485 }, { "epoch": 11.988382484361036, "grad_norm": 3.469905138015747, "learning_rate": 2.070936555309368e-05, "loss": 0.6537, "num_input_tokens_seen": 46727040, "step": 80490 }, { "epoch": 11.989127196901995, "grad_norm": 0.8766027092933655, "learning_rate": 2.0706164381876852e-05, "loss": 0.6267, "num_input_tokens_seen": 46729824, "step": 80495 }, { "epoch": 11.989871909442956, "grad_norm": 1.4619503021240234, "learning_rate": 2.0702963283200018e-05, "loss": 0.4857, "num_input_tokens_seen": 46732608, "step": 80500 }, { "epoch": 11.990616621983914, "grad_norm": 0.9940077066421509, "learning_rate": 2.0699762257117235e-05, "loss": 0.6004, "num_input_tokens_seen": 46735456, "step": 80505 }, { "epoch": 11.991361334524873, "grad_norm": 1.75916588306427, "learning_rate": 2.0696561303682617e-05, "loss": 0.559, "num_input_tokens_seen": 46738656, "step": 80510 }, { "epoch": 11.992106047065832, "grad_norm": 1.6060574054718018, "learning_rate": 2.0693360422950217e-05, "loss": 0.5409, "num_input_tokens_seen": 46741888, "step": 80515 }, { "epoch": 11.992850759606792, "grad_norm": 0.6281924843788147, "learning_rate": 2.0690159614974107e-05, "loss": 0.6798, "num_input_tokens_seen": 46744704, "step": 80520 }, { "epoch": 11.993595472147751, "grad_norm": 0.8357025384902954, "learning_rate": 2.068695887980838e-05, "loss": 0.4654, "num_input_tokens_seen": 46747264, "step": 80525 }, { "epoch": 11.99434018468871, "grad_norm": 1.6137696504592896, "learning_rate": 2.0683758217507092e-05, "loss": 0.4157, "num_input_tokens_seen": 46750240, "step": 80530 }, { "epoch": 11.995084897229669, "grad_norm": 1.0070202350616455, "learning_rate": 2.068055762812433e-05, "loss": 0.6, "num_input_tokens_seen": 46753344, "step": 80535 }, { "epoch": 11.99582960977063, "grad_norm": 2.953122138977051, "learning_rate": 2.0677357111714147e-05, "loss": 0.6192, "num_input_tokens_seen": 46756128, "step": 80540 }, { "epoch": 11.996574322311588, "grad_norm": 1.540874719619751, "learning_rate": 2.067415666833063e-05, "loss": 0.5796, "num_input_tokens_seen": 46759168, "step": 80545 }, { "epoch": 11.997319034852547, "grad_norm": 1.7141286134719849, "learning_rate": 2.0670956298027833e-05, "loss": 0.5846, "num_input_tokens_seen": 46762144, "step": 80550 }, { "epoch": 11.998063747393505, "grad_norm": 1.6155097484588623, "learning_rate": 2.0667756000859835e-05, "loss": 0.5644, "num_input_tokens_seen": 46765024, "step": 80555 }, { "epoch": 11.998808459934466, "grad_norm": 1.3877977132797241, "learning_rate": 2.06645557768807e-05, "loss": 0.6031, "num_input_tokens_seen": 46768192, "step": 80560 }, { "epoch": 11.999553172475425, "grad_norm": 0.9940255880355835, "learning_rate": 2.0661355626144483e-05, "loss": 0.6195, "num_input_tokens_seen": 46771104, "step": 80565 }, { "epoch": 12.0, "eval_loss": 0.6560980677604675, "eval_runtime": 47.0015, "eval_samples_per_second": 63.487, "eval_steps_per_second": 15.872, "num_input_tokens_seen": 46772480, "step": 80568 }, { "epoch": 12.000297885016384, "grad_norm": 1.4176629781723022, "learning_rate": 2.0658155548705258e-05, "loss": 0.6398, "num_input_tokens_seen": 46773440, "step": 80570 }, { "epoch": 12.001042597557342, "grad_norm": 1.750608205795288, "learning_rate": 2.065495554461707e-05, "loss": 0.645, "num_input_tokens_seen": 46776384, "step": 80575 }, { "epoch": 12.001787310098303, "grad_norm": 1.234788179397583, "learning_rate": 2.0651755613934005e-05, "loss": 0.5956, "num_input_tokens_seen": 46779200, "step": 80580 }, { "epoch": 12.002532022639262, "grad_norm": 3.11519455909729, "learning_rate": 2.0648555756710098e-05, "loss": 0.4043, "num_input_tokens_seen": 46782176, "step": 80585 }, { "epoch": 12.00327673518022, "grad_norm": 1.3108466863632202, "learning_rate": 2.064535597299943e-05, "loss": 0.5641, "num_input_tokens_seen": 46784992, "step": 80590 }, { "epoch": 12.004021447721179, "grad_norm": 1.4191175699234009, "learning_rate": 2.0642156262856045e-05, "loss": 0.5237, "num_input_tokens_seen": 46788032, "step": 80595 }, { "epoch": 12.00476616026214, "grad_norm": 1.8715341091156006, "learning_rate": 2.0638956626333993e-05, "loss": 0.602, "num_input_tokens_seen": 46790880, "step": 80600 }, { "epoch": 12.005510872803098, "grad_norm": 1.482488751411438, "learning_rate": 2.0635757063487348e-05, "loss": 0.6501, "num_input_tokens_seen": 46794464, "step": 80605 }, { "epoch": 12.006255585344057, "grad_norm": 1.8386518955230713, "learning_rate": 2.0632557574370137e-05, "loss": 0.5884, "num_input_tokens_seen": 46797536, "step": 80610 }, { "epoch": 12.007000297885016, "grad_norm": 2.827678680419922, "learning_rate": 2.0629358159036437e-05, "loss": 0.6596, "num_input_tokens_seen": 46800896, "step": 80615 }, { "epoch": 12.007745010425976, "grad_norm": 1.4487367868423462, "learning_rate": 2.0626158817540284e-05, "loss": 0.661, "num_input_tokens_seen": 46803648, "step": 80620 }, { "epoch": 12.008489722966935, "grad_norm": 1.6238011121749878, "learning_rate": 2.0622959549935738e-05, "loss": 0.6701, "num_input_tokens_seen": 46806400, "step": 80625 }, { "epoch": 12.009234435507894, "grad_norm": 1.4102946519851685, "learning_rate": 2.061976035627684e-05, "loss": 0.6168, "num_input_tokens_seen": 46809088, "step": 80630 }, { "epoch": 12.009979148048853, "grad_norm": 1.3883816003799438, "learning_rate": 2.061656123661764e-05, "loss": 0.5355, "num_input_tokens_seen": 46812096, "step": 80635 }, { "epoch": 12.010723860589811, "grad_norm": 1.5382825136184692, "learning_rate": 2.0613362191012185e-05, "loss": 0.5357, "num_input_tokens_seen": 46814720, "step": 80640 }, { "epoch": 12.011468573130772, "grad_norm": 0.6489426493644714, "learning_rate": 2.0610163219514504e-05, "loss": 0.5297, "num_input_tokens_seen": 46817600, "step": 80645 }, { "epoch": 12.01221328567173, "grad_norm": 1.9708057641983032, "learning_rate": 2.0606964322178667e-05, "loss": 0.6303, "num_input_tokens_seen": 46820288, "step": 80650 }, { "epoch": 12.01295799821269, "grad_norm": 1.3411712646484375, "learning_rate": 2.0603765499058695e-05, "loss": 0.398, "num_input_tokens_seen": 46823104, "step": 80655 }, { "epoch": 12.013702710753648, "grad_norm": 2.68772554397583, "learning_rate": 2.0600566750208642e-05, "loss": 0.673, "num_input_tokens_seen": 46825888, "step": 80660 }, { "epoch": 12.014447423294609, "grad_norm": 1.3935987949371338, "learning_rate": 2.0597368075682542e-05, "loss": 0.608, "num_input_tokens_seen": 46828864, "step": 80665 }, { "epoch": 12.015192135835568, "grad_norm": 1.1399455070495605, "learning_rate": 2.0594169475534436e-05, "loss": 0.4756, "num_input_tokens_seen": 46831712, "step": 80670 }, { "epoch": 12.015936848376526, "grad_norm": 2.0048463344573975, "learning_rate": 2.0590970949818357e-05, "loss": 0.6741, "num_input_tokens_seen": 46834720, "step": 80675 }, { "epoch": 12.016681560917485, "grad_norm": 1.072216510772705, "learning_rate": 2.0587772498588336e-05, "loss": 0.4359, "num_input_tokens_seen": 46837728, "step": 80680 }, { "epoch": 12.017426273458446, "grad_norm": 2.2716569900512695, "learning_rate": 2.0584574121898424e-05, "loss": 0.5672, "num_input_tokens_seen": 46840640, "step": 80685 }, { "epoch": 12.018170985999404, "grad_norm": 1.5087265968322754, "learning_rate": 2.0581375819802635e-05, "loss": 0.6774, "num_input_tokens_seen": 46844128, "step": 80690 }, { "epoch": 12.018915698540363, "grad_norm": 1.1606017351150513, "learning_rate": 2.057817759235502e-05, "loss": 0.5577, "num_input_tokens_seen": 46846912, "step": 80695 }, { "epoch": 12.019660411081322, "grad_norm": 1.636210322380066, "learning_rate": 2.0574979439609593e-05, "loss": 0.5001, "num_input_tokens_seen": 46849632, "step": 80700 }, { "epoch": 12.020405123622282, "grad_norm": 1.3899402618408203, "learning_rate": 2.0571781361620398e-05, "loss": 0.5692, "num_input_tokens_seen": 46852416, "step": 80705 }, { "epoch": 12.021149836163241, "grad_norm": 1.2966434955596924, "learning_rate": 2.0568583358441445e-05, "loss": 0.4344, "num_input_tokens_seen": 46855520, "step": 80710 }, { "epoch": 12.0218945487042, "grad_norm": 1.337348461151123, "learning_rate": 2.0565385430126783e-05, "loss": 0.5808, "num_input_tokens_seen": 46858240, "step": 80715 }, { "epoch": 12.022639261245159, "grad_norm": 2.358491897583008, "learning_rate": 2.0562187576730428e-05, "loss": 0.6138, "num_input_tokens_seen": 46861152, "step": 80720 }, { "epoch": 12.02338397378612, "grad_norm": 1.654481053352356, "learning_rate": 2.0558989798306395e-05, "loss": 0.6654, "num_input_tokens_seen": 46863904, "step": 80725 }, { "epoch": 12.024128686327078, "grad_norm": 1.145735740661621, "learning_rate": 2.0555792094908722e-05, "loss": 0.6633, "num_input_tokens_seen": 46867200, "step": 80730 }, { "epoch": 12.024873398868037, "grad_norm": 1.184159278869629, "learning_rate": 2.055259446659142e-05, "loss": 0.5869, "num_input_tokens_seen": 46870240, "step": 80735 }, { "epoch": 12.025618111408996, "grad_norm": 0.9964027404785156, "learning_rate": 2.0549396913408522e-05, "loss": 0.631, "num_input_tokens_seen": 46873184, "step": 80740 }, { "epoch": 12.026362823949956, "grad_norm": 2.205186128616333, "learning_rate": 2.0546199435414028e-05, "loss": 0.5255, "num_input_tokens_seen": 46876544, "step": 80745 }, { "epoch": 12.027107536490915, "grad_norm": 1.3976696729660034, "learning_rate": 2.054300203266198e-05, "loss": 0.4675, "num_input_tokens_seen": 46879520, "step": 80750 }, { "epoch": 12.027852249031874, "grad_norm": 3.1544065475463867, "learning_rate": 2.0539804705206378e-05, "loss": 0.6122, "num_input_tokens_seen": 46882688, "step": 80755 }, { "epoch": 12.028596961572832, "grad_norm": 1.262864112854004, "learning_rate": 2.0536607453101236e-05, "loss": 0.4988, "num_input_tokens_seen": 46886720, "step": 80760 }, { "epoch": 12.029341674113793, "grad_norm": 1.866336464881897, "learning_rate": 2.0533410276400582e-05, "loss": 0.7229, "num_input_tokens_seen": 46889760, "step": 80765 }, { "epoch": 12.030086386654752, "grad_norm": 1.6615409851074219, "learning_rate": 2.053021317515842e-05, "loss": 0.5132, "num_input_tokens_seen": 46892448, "step": 80770 }, { "epoch": 12.03083109919571, "grad_norm": 1.540877103805542, "learning_rate": 2.0527016149428767e-05, "loss": 0.8688, "num_input_tokens_seen": 46895168, "step": 80775 }, { "epoch": 12.03157581173667, "grad_norm": 1.9448719024658203, "learning_rate": 2.052381919926562e-05, "loss": 0.653, "num_input_tokens_seen": 46898336, "step": 80780 }, { "epoch": 12.03232052427763, "grad_norm": 1.862999677658081, "learning_rate": 2.052062232472301e-05, "loss": 0.5134, "num_input_tokens_seen": 46901184, "step": 80785 }, { "epoch": 12.033065236818588, "grad_norm": 1.0992300510406494, "learning_rate": 2.0517425525854926e-05, "loss": 0.5158, "num_input_tokens_seen": 46904448, "step": 80790 }, { "epoch": 12.033809949359547, "grad_norm": 0.793030321598053, "learning_rate": 2.051422880271538e-05, "loss": 0.5422, "num_input_tokens_seen": 46907360, "step": 80795 }, { "epoch": 12.034554661900506, "grad_norm": 4.471041202545166, "learning_rate": 2.051103215535839e-05, "loss": 0.8206, "num_input_tokens_seen": 46910240, "step": 80800 }, { "epoch": 12.035299374441466, "grad_norm": 2.1668918132781982, "learning_rate": 2.0507835583837943e-05, "loss": 0.5973, "num_input_tokens_seen": 46913024, "step": 80805 }, { "epoch": 12.036044086982425, "grad_norm": 2.125854730606079, "learning_rate": 2.050463908820805e-05, "loss": 0.3932, "num_input_tokens_seen": 46915840, "step": 80810 }, { "epoch": 12.036788799523384, "grad_norm": 1.7067248821258545, "learning_rate": 2.0501442668522703e-05, "loss": 0.6133, "num_input_tokens_seen": 46918848, "step": 80815 }, { "epoch": 12.037533512064343, "grad_norm": 2.2576661109924316, "learning_rate": 2.0498246324835918e-05, "loss": 0.654, "num_input_tokens_seen": 46921888, "step": 80820 }, { "epoch": 12.038278224605302, "grad_norm": 1.3967247009277344, "learning_rate": 2.0495050057201683e-05, "loss": 0.6587, "num_input_tokens_seen": 46924896, "step": 80825 }, { "epoch": 12.039022937146262, "grad_norm": 1.9218761920928955, "learning_rate": 2.0491853865674002e-05, "loss": 0.6356, "num_input_tokens_seen": 46927872, "step": 80830 }, { "epoch": 12.03976764968722, "grad_norm": 1.766144037246704, "learning_rate": 2.0488657750306865e-05, "loss": 0.747, "num_input_tokens_seen": 46930848, "step": 80835 }, { "epoch": 12.04051236222818, "grad_norm": 1.1139130592346191, "learning_rate": 2.0485461711154265e-05, "loss": 0.6158, "num_input_tokens_seen": 46933984, "step": 80840 }, { "epoch": 12.041257074769138, "grad_norm": 2.265669822692871, "learning_rate": 2.0482265748270213e-05, "loss": 0.5814, "num_input_tokens_seen": 46936736, "step": 80845 }, { "epoch": 12.042001787310099, "grad_norm": 1.375116229057312, "learning_rate": 2.0479069861708674e-05, "loss": 0.75, "num_input_tokens_seen": 46939552, "step": 80850 }, { "epoch": 12.042746499851058, "grad_norm": 1.1187247037887573, "learning_rate": 2.047587405152367e-05, "loss": 0.5949, "num_input_tokens_seen": 46942304, "step": 80855 }, { "epoch": 12.043491212392016, "grad_norm": 1.3300631046295166, "learning_rate": 2.0472678317769168e-05, "loss": 0.501, "num_input_tokens_seen": 46945088, "step": 80860 }, { "epoch": 12.044235924932975, "grad_norm": 2.9358696937561035, "learning_rate": 2.0469482660499167e-05, "loss": 0.5886, "num_input_tokens_seen": 46947680, "step": 80865 }, { "epoch": 12.044980637473936, "grad_norm": 1.5604883432388306, "learning_rate": 2.0466287079767646e-05, "loss": 0.4422, "num_input_tokens_seen": 46950336, "step": 80870 }, { "epoch": 12.045725350014894, "grad_norm": 2.725968599319458, "learning_rate": 2.0463091575628608e-05, "loss": 0.7438, "num_input_tokens_seen": 46953280, "step": 80875 }, { "epoch": 12.046470062555853, "grad_norm": 1.297990322113037, "learning_rate": 2.045989614813602e-05, "loss": 0.5206, "num_input_tokens_seen": 46956064, "step": 80880 }, { "epoch": 12.047214775096812, "grad_norm": 1.431033968925476, "learning_rate": 2.0456700797343867e-05, "loss": 0.771, "num_input_tokens_seen": 46958848, "step": 80885 }, { "epoch": 12.047959487637772, "grad_norm": 1.4367173910140991, "learning_rate": 2.0453505523306147e-05, "loss": 0.6923, "num_input_tokens_seen": 46961760, "step": 80890 }, { "epoch": 12.048704200178731, "grad_norm": 1.1233736276626587, "learning_rate": 2.0450310326076823e-05, "loss": 0.6883, "num_input_tokens_seen": 46964672, "step": 80895 }, { "epoch": 12.04944891271969, "grad_norm": 1.5019947290420532, "learning_rate": 2.0447115205709887e-05, "loss": 0.5367, "num_input_tokens_seen": 46967328, "step": 80900 }, { "epoch": 12.050193625260649, "grad_norm": 0.9397425651550293, "learning_rate": 2.0443920162259306e-05, "loss": 0.5048, "num_input_tokens_seen": 46970112, "step": 80905 }, { "epoch": 12.05093833780161, "grad_norm": 1.7699341773986816, "learning_rate": 2.044072519577907e-05, "loss": 0.5907, "num_input_tokens_seen": 46972800, "step": 80910 }, { "epoch": 12.051683050342568, "grad_norm": 0.8222412467002869, "learning_rate": 2.043753030632315e-05, "loss": 0.5671, "num_input_tokens_seen": 46976032, "step": 80915 }, { "epoch": 12.052427762883527, "grad_norm": 1.3540548086166382, "learning_rate": 2.0434335493945506e-05, "loss": 0.5943, "num_input_tokens_seen": 46979232, "step": 80920 }, { "epoch": 12.053172475424486, "grad_norm": 0.9351268410682678, "learning_rate": 2.043114075870013e-05, "loss": 0.5466, "num_input_tokens_seen": 46981984, "step": 80925 }, { "epoch": 12.053917187965446, "grad_norm": 1.793322205543518, "learning_rate": 2.042794610064099e-05, "loss": 0.6327, "num_input_tokens_seen": 46985056, "step": 80930 }, { "epoch": 12.054661900506405, "grad_norm": 1.3384913206100464, "learning_rate": 2.0424751519822054e-05, "loss": 0.571, "num_input_tokens_seen": 46987968, "step": 80935 }, { "epoch": 12.055406613047364, "grad_norm": 4.14703369140625, "learning_rate": 2.0421557016297283e-05, "loss": 0.6945, "num_input_tokens_seen": 46990752, "step": 80940 }, { "epoch": 12.056151325588322, "grad_norm": 1.2035220861434937, "learning_rate": 2.041836259012066e-05, "loss": 0.7786, "num_input_tokens_seen": 46993888, "step": 80945 }, { "epoch": 12.056896038129283, "grad_norm": 3.122171640396118, "learning_rate": 2.0415168241346138e-05, "loss": 0.5826, "num_input_tokens_seen": 46996736, "step": 80950 }, { "epoch": 12.057640750670242, "grad_norm": 1.5090270042419434, "learning_rate": 2.0411973970027698e-05, "loss": 0.6258, "num_input_tokens_seen": 46999584, "step": 80955 }, { "epoch": 12.0583854632112, "grad_norm": 1.9158564805984497, "learning_rate": 2.0408779776219295e-05, "loss": 0.614, "num_input_tokens_seen": 47002208, "step": 80960 }, { "epoch": 12.05913017575216, "grad_norm": 1.5730232000350952, "learning_rate": 2.0405585659974885e-05, "loss": 0.721, "num_input_tokens_seen": 47005088, "step": 80965 }, { "epoch": 12.05987488829312, "grad_norm": 1.6104116439819336, "learning_rate": 2.0402391621348444e-05, "loss": 0.6072, "num_input_tokens_seen": 47007872, "step": 80970 }, { "epoch": 12.060619600834078, "grad_norm": 2.376138210296631, "learning_rate": 2.039919766039391e-05, "loss": 0.7959, "num_input_tokens_seen": 47010656, "step": 80975 }, { "epoch": 12.061364313375037, "grad_norm": 1.3134044408798218, "learning_rate": 2.0396003777165266e-05, "loss": 0.586, "num_input_tokens_seen": 47013536, "step": 80980 }, { "epoch": 12.062109025915996, "grad_norm": 4.038566589355469, "learning_rate": 2.0392809971716448e-05, "loss": 0.5502, "num_input_tokens_seen": 47016576, "step": 80985 }, { "epoch": 12.062853738456955, "grad_norm": 1.5600950717926025, "learning_rate": 2.0389616244101437e-05, "loss": 0.7029, "num_input_tokens_seen": 47019424, "step": 80990 }, { "epoch": 12.063598450997915, "grad_norm": 1.2303279638290405, "learning_rate": 2.038642259437417e-05, "loss": 0.5352, "num_input_tokens_seen": 47022528, "step": 80995 }, { "epoch": 12.064343163538874, "grad_norm": 1.346898078918457, "learning_rate": 2.03832290225886e-05, "loss": 0.6668, "num_input_tokens_seen": 47025440, "step": 81000 }, { "epoch": 12.065087876079833, "grad_norm": 0.8423542380332947, "learning_rate": 2.0380035528798692e-05, "loss": 0.6388, "num_input_tokens_seen": 47028288, "step": 81005 }, { "epoch": 12.065832588620792, "grad_norm": 0.9688662886619568, "learning_rate": 2.0376842113058372e-05, "loss": 0.5508, "num_input_tokens_seen": 47031360, "step": 81010 }, { "epoch": 12.066577301161752, "grad_norm": 2.0196292400360107, "learning_rate": 2.037364877542162e-05, "loss": 0.5528, "num_input_tokens_seen": 47034208, "step": 81015 }, { "epoch": 12.06732201370271, "grad_norm": 2.316391944885254, "learning_rate": 2.037045551594236e-05, "loss": 0.4181, "num_input_tokens_seen": 47036864, "step": 81020 }, { "epoch": 12.06806672624367, "grad_norm": 1.6601591110229492, "learning_rate": 2.0367262334674556e-05, "loss": 0.5135, "num_input_tokens_seen": 47039456, "step": 81025 }, { "epoch": 12.068811438784628, "grad_norm": 1.8773266077041626, "learning_rate": 2.0364069231672143e-05, "loss": 0.5757, "num_input_tokens_seen": 47042528, "step": 81030 }, { "epoch": 12.069556151325589, "grad_norm": 4.594247817993164, "learning_rate": 2.0360876206989073e-05, "loss": 0.7222, "num_input_tokens_seen": 47045440, "step": 81035 }, { "epoch": 12.070300863866548, "grad_norm": 1.7818588018417358, "learning_rate": 2.0357683260679285e-05, "loss": 0.642, "num_input_tokens_seen": 47048384, "step": 81040 }, { "epoch": 12.071045576407506, "grad_norm": 3.5683116912841797, "learning_rate": 2.035449039279671e-05, "loss": 0.7739, "num_input_tokens_seen": 47051328, "step": 81045 }, { "epoch": 12.071790288948465, "grad_norm": 1.5804234743118286, "learning_rate": 2.035129760339531e-05, "loss": 0.6078, "num_input_tokens_seen": 47054304, "step": 81050 }, { "epoch": 12.072535001489426, "grad_norm": 1.952786922454834, "learning_rate": 2.0348104892528998e-05, "loss": 0.6658, "num_input_tokens_seen": 47056960, "step": 81055 }, { "epoch": 12.073279714030384, "grad_norm": 1.3989903926849365, "learning_rate": 2.0344912260251742e-05, "loss": 0.4948, "num_input_tokens_seen": 47060224, "step": 81060 }, { "epoch": 12.074024426571343, "grad_norm": 0.9563128352165222, "learning_rate": 2.034171970661745e-05, "loss": 0.5293, "num_input_tokens_seen": 47063360, "step": 81065 }, { "epoch": 12.074769139112302, "grad_norm": 1.6953892707824707, "learning_rate": 2.0338527231680078e-05, "loss": 0.5523, "num_input_tokens_seen": 47066080, "step": 81070 }, { "epoch": 12.075513851653263, "grad_norm": 1.5943825244903564, "learning_rate": 2.033533483549354e-05, "loss": 0.4137, "num_input_tokens_seen": 47068864, "step": 81075 }, { "epoch": 12.076258564194221, "grad_norm": 1.4458353519439697, "learning_rate": 2.033214251811179e-05, "loss": 0.6994, "num_input_tokens_seen": 47071680, "step": 81080 }, { "epoch": 12.07700327673518, "grad_norm": 1.357093095779419, "learning_rate": 2.0328950279588748e-05, "loss": 0.5376, "num_input_tokens_seen": 47074688, "step": 81085 }, { "epoch": 12.077747989276139, "grad_norm": 1.1365323066711426, "learning_rate": 2.0325758119978334e-05, "loss": 0.5385, "num_input_tokens_seen": 47077760, "step": 81090 }, { "epoch": 12.0784927018171, "grad_norm": 1.052261471748352, "learning_rate": 2.0322566039334497e-05, "loss": 0.4923, "num_input_tokens_seen": 47080864, "step": 81095 }, { "epoch": 12.079237414358058, "grad_norm": 1.236649513244629, "learning_rate": 2.0319374037711143e-05, "loss": 0.5381, "num_input_tokens_seen": 47083744, "step": 81100 }, { "epoch": 12.079982126899017, "grad_norm": 1.2399276494979858, "learning_rate": 2.0316182115162218e-05, "loss": 0.5203, "num_input_tokens_seen": 47086720, "step": 81105 }, { "epoch": 12.080726839439976, "grad_norm": 1.4833658933639526, "learning_rate": 2.031299027174162e-05, "loss": 0.5346, "num_input_tokens_seen": 47089536, "step": 81110 }, { "epoch": 12.081471551980936, "grad_norm": 1.5767403841018677, "learning_rate": 2.03097985075033e-05, "loss": 0.6333, "num_input_tokens_seen": 47092288, "step": 81115 }, { "epoch": 12.082216264521895, "grad_norm": 1.3834973573684692, "learning_rate": 2.030660682250117e-05, "loss": 0.8745, "num_input_tokens_seen": 47094976, "step": 81120 }, { "epoch": 12.082960977062854, "grad_norm": 0.9744743704795837, "learning_rate": 2.0303415216789135e-05, "loss": 0.6479, "num_input_tokens_seen": 47098016, "step": 81125 }, { "epoch": 12.083705689603812, "grad_norm": 2.10827374458313, "learning_rate": 2.0300223690421135e-05, "loss": 0.6271, "num_input_tokens_seen": 47100832, "step": 81130 }, { "epoch": 12.084450402144773, "grad_norm": 1.9200997352600098, "learning_rate": 2.029703224345108e-05, "loss": 0.5699, "num_input_tokens_seen": 47103744, "step": 81135 }, { "epoch": 12.085195114685732, "grad_norm": 1.2955944538116455, "learning_rate": 2.0293840875932886e-05, "loss": 0.6168, "num_input_tokens_seen": 47106560, "step": 81140 }, { "epoch": 12.08593982722669, "grad_norm": 1.2163397073745728, "learning_rate": 2.029064958792046e-05, "loss": 0.4611, "num_input_tokens_seen": 47109568, "step": 81145 }, { "epoch": 12.08668453976765, "grad_norm": 1.4451168775558472, "learning_rate": 2.0287458379467728e-05, "loss": 0.7201, "num_input_tokens_seen": 47112288, "step": 81150 }, { "epoch": 12.08742925230861, "grad_norm": 1.0004972219467163, "learning_rate": 2.028426725062859e-05, "loss": 0.5678, "num_input_tokens_seen": 47115424, "step": 81155 }, { "epoch": 12.088173964849569, "grad_norm": 1.7318589687347412, "learning_rate": 2.0281076201456977e-05, "loss": 0.6906, "num_input_tokens_seen": 47118336, "step": 81160 }, { "epoch": 12.088918677390527, "grad_norm": 1.680574655532837, "learning_rate": 2.0277885232006776e-05, "loss": 0.4931, "num_input_tokens_seen": 47121248, "step": 81165 }, { "epoch": 12.089663389931486, "grad_norm": 1.438392162322998, "learning_rate": 2.0274694342331907e-05, "loss": 0.8479, "num_input_tokens_seen": 47124128, "step": 81170 }, { "epoch": 12.090408102472445, "grad_norm": 1.0094177722930908, "learning_rate": 2.027150353248628e-05, "loss": 0.6212, "num_input_tokens_seen": 47127008, "step": 81175 }, { "epoch": 12.091152815013405, "grad_norm": 2.1702704429626465, "learning_rate": 2.026831280252378e-05, "loss": 0.5635, "num_input_tokens_seen": 47129504, "step": 81180 }, { "epoch": 12.091897527554364, "grad_norm": 1.6652674674987793, "learning_rate": 2.026512215249834e-05, "loss": 0.6809, "num_input_tokens_seen": 47132352, "step": 81185 }, { "epoch": 12.092642240095323, "grad_norm": 2.047090768814087, "learning_rate": 2.0261931582463844e-05, "loss": 0.6283, "num_input_tokens_seen": 47135200, "step": 81190 }, { "epoch": 12.093386952636282, "grad_norm": 1.8314322233200073, "learning_rate": 2.0258741092474204e-05, "loss": 0.5484, "num_input_tokens_seen": 47137888, "step": 81195 }, { "epoch": 12.094131665177242, "grad_norm": 2.4905097484588623, "learning_rate": 2.0255550682583313e-05, "loss": 0.6355, "num_input_tokens_seen": 47140960, "step": 81200 }, { "epoch": 12.094876377718201, "grad_norm": 2.5328292846679688, "learning_rate": 2.025236035284506e-05, "loss": 0.6985, "num_input_tokens_seen": 47143904, "step": 81205 }, { "epoch": 12.09562109025916, "grad_norm": 1.6223381757736206, "learning_rate": 2.0249170103313365e-05, "loss": 0.7125, "num_input_tokens_seen": 47147008, "step": 81210 }, { "epoch": 12.096365802800118, "grad_norm": 2.0662801265716553, "learning_rate": 2.0245979934042104e-05, "loss": 0.6484, "num_input_tokens_seen": 47149504, "step": 81215 }, { "epoch": 12.097110515341079, "grad_norm": 1.6340627670288086, "learning_rate": 2.0242789845085187e-05, "loss": 0.58, "num_input_tokens_seen": 47152544, "step": 81220 }, { "epoch": 12.097855227882038, "grad_norm": 3.4357573986053467, "learning_rate": 2.0239599836496497e-05, "loss": 0.6912, "num_input_tokens_seen": 47155680, "step": 81225 }, { "epoch": 12.098599940422996, "grad_norm": 1.3231470584869385, "learning_rate": 2.0236409908329933e-05, "loss": 0.6654, "num_input_tokens_seen": 47158784, "step": 81230 }, { "epoch": 12.099344652963955, "grad_norm": 1.0696890354156494, "learning_rate": 2.0233220060639373e-05, "loss": 0.5036, "num_input_tokens_seen": 47161536, "step": 81235 }, { "epoch": 12.100089365504916, "grad_norm": 1.1277645826339722, "learning_rate": 2.023003029347873e-05, "loss": 0.4453, "num_input_tokens_seen": 47164448, "step": 81240 }, { "epoch": 12.100834078045875, "grad_norm": 1.292578101158142, "learning_rate": 2.0226840606901872e-05, "loss": 0.6794, "num_input_tokens_seen": 47167424, "step": 81245 }, { "epoch": 12.101578790586833, "grad_norm": 1.023213505744934, "learning_rate": 2.022365100096268e-05, "loss": 0.5221, "num_input_tokens_seen": 47170336, "step": 81250 }, { "epoch": 12.102323503127792, "grad_norm": 1.5975542068481445, "learning_rate": 2.0220461475715063e-05, "loss": 0.6935, "num_input_tokens_seen": 47173152, "step": 81255 }, { "epoch": 12.103068215668753, "grad_norm": 1.6004968881607056, "learning_rate": 2.0217272031212887e-05, "loss": 0.6604, "num_input_tokens_seen": 47176192, "step": 81260 }, { "epoch": 12.103812928209711, "grad_norm": 2.083000659942627, "learning_rate": 2.021408266751004e-05, "loss": 0.5099, "num_input_tokens_seen": 47179104, "step": 81265 }, { "epoch": 12.10455764075067, "grad_norm": 2.0275208950042725, "learning_rate": 2.0210893384660396e-05, "loss": 0.5866, "num_input_tokens_seen": 47181792, "step": 81270 }, { "epoch": 12.105302353291629, "grad_norm": 1.665464997291565, "learning_rate": 2.0207704182717852e-05, "loss": 0.444, "num_input_tokens_seen": 47184640, "step": 81275 }, { "epoch": 12.10604706583259, "grad_norm": 1.399874210357666, "learning_rate": 2.0204515061736275e-05, "loss": 0.6834, "num_input_tokens_seen": 47187936, "step": 81280 }, { "epoch": 12.106791778373548, "grad_norm": 1.7153791189193726, "learning_rate": 2.0201326021769526e-05, "loss": 0.5467, "num_input_tokens_seen": 47191200, "step": 81285 }, { "epoch": 12.107536490914507, "grad_norm": 2.7484257221221924, "learning_rate": 2.0198137062871512e-05, "loss": 0.598, "num_input_tokens_seen": 47194112, "step": 81290 }, { "epoch": 12.108281203455466, "grad_norm": 1.7450551986694336, "learning_rate": 2.0194948185096086e-05, "loss": 0.6456, "num_input_tokens_seen": 47197248, "step": 81295 }, { "epoch": 12.109025915996426, "grad_norm": 1.4673044681549072, "learning_rate": 2.019175938849713e-05, "loss": 0.5126, "num_input_tokens_seen": 47199968, "step": 81300 }, { "epoch": 12.109770628537385, "grad_norm": 1.5335313081741333, "learning_rate": 2.0188570673128504e-05, "loss": 0.7224, "num_input_tokens_seen": 47203136, "step": 81305 }, { "epoch": 12.110515341078344, "grad_norm": 2.204747200012207, "learning_rate": 2.0185382039044094e-05, "loss": 0.8878, "num_input_tokens_seen": 47206176, "step": 81310 }, { "epoch": 12.111260053619302, "grad_norm": 0.9657362699508667, "learning_rate": 2.0182193486297755e-05, "loss": 0.6768, "num_input_tokens_seen": 47209088, "step": 81315 }, { "epoch": 12.112004766160263, "grad_norm": 1.519765853881836, "learning_rate": 2.017900501494337e-05, "loss": 0.5746, "num_input_tokens_seen": 47211936, "step": 81320 }, { "epoch": 12.112749478701222, "grad_norm": 2.096788167953491, "learning_rate": 2.0175816625034795e-05, "loss": 0.5747, "num_input_tokens_seen": 47215072, "step": 81325 }, { "epoch": 12.11349419124218, "grad_norm": 1.1072328090667725, "learning_rate": 2.0172628316625887e-05, "loss": 0.4874, "num_input_tokens_seen": 47217920, "step": 81330 }, { "epoch": 12.11423890378314, "grad_norm": 1.6718231439590454, "learning_rate": 2.0169440089770523e-05, "loss": 0.5689, "num_input_tokens_seen": 47220672, "step": 81335 }, { "epoch": 12.114983616324098, "grad_norm": 2.1240744590759277, "learning_rate": 2.0166251944522553e-05, "loss": 0.6705, "num_input_tokens_seen": 47223584, "step": 81340 }, { "epoch": 12.115728328865059, "grad_norm": 1.5728936195373535, "learning_rate": 2.016306388093585e-05, "loss": 0.6013, "num_input_tokens_seen": 47226528, "step": 81345 }, { "epoch": 12.116473041406017, "grad_norm": 1.0725808143615723, "learning_rate": 2.0159875899064258e-05, "loss": 0.581, "num_input_tokens_seen": 47229472, "step": 81350 }, { "epoch": 12.117217753946976, "grad_norm": 1.1555362939834595, "learning_rate": 2.0156687998961653e-05, "loss": 0.8243, "num_input_tokens_seen": 47232544, "step": 81355 }, { "epoch": 12.117962466487935, "grad_norm": 1.2847003936767578, "learning_rate": 2.015350018068188e-05, "loss": 0.4915, "num_input_tokens_seen": 47235488, "step": 81360 }, { "epoch": 12.118707179028895, "grad_norm": 1.435474157333374, "learning_rate": 2.0150312444278795e-05, "loss": 0.6198, "num_input_tokens_seen": 47238400, "step": 81365 }, { "epoch": 12.119451891569854, "grad_norm": 1.547492265701294, "learning_rate": 2.0147124789806254e-05, "loss": 0.6463, "num_input_tokens_seen": 47241152, "step": 81370 }, { "epoch": 12.120196604110813, "grad_norm": 0.8026980757713318, "learning_rate": 2.01439372173181e-05, "loss": 0.499, "num_input_tokens_seen": 47244000, "step": 81375 }, { "epoch": 12.120941316651772, "grad_norm": 1.6147035360336304, "learning_rate": 2.0140749726868197e-05, "loss": 0.6733, "num_input_tokens_seen": 47246720, "step": 81380 }, { "epoch": 12.121686029192732, "grad_norm": 1.4366246461868286, "learning_rate": 2.013756231851038e-05, "loss": 0.5913, "num_input_tokens_seen": 47249472, "step": 81385 }, { "epoch": 12.122430741733691, "grad_norm": 1.5434352159500122, "learning_rate": 2.0134374992298515e-05, "loss": 0.7152, "num_input_tokens_seen": 47252768, "step": 81390 }, { "epoch": 12.12317545427465, "grad_norm": 1.6637340784072876, "learning_rate": 2.0131187748286438e-05, "loss": 0.5827, "num_input_tokens_seen": 47255872, "step": 81395 }, { "epoch": 12.123920166815608, "grad_norm": 0.8366915583610535, "learning_rate": 2.0128000586528e-05, "loss": 0.4801, "num_input_tokens_seen": 47258816, "step": 81400 }, { "epoch": 12.124664879356569, "grad_norm": 3.123039960861206, "learning_rate": 2.012481350707704e-05, "loss": 0.6029, "num_input_tokens_seen": 47262048, "step": 81405 }, { "epoch": 12.125409591897528, "grad_norm": 1.5637277364730835, "learning_rate": 2.012162650998739e-05, "loss": 0.6595, "num_input_tokens_seen": 47264896, "step": 81410 }, { "epoch": 12.126154304438487, "grad_norm": 1.3239997625350952, "learning_rate": 2.011843959531291e-05, "loss": 0.6682, "num_input_tokens_seen": 47267776, "step": 81415 }, { "epoch": 12.126899016979445, "grad_norm": 1.1856389045715332, "learning_rate": 2.0115252763107424e-05, "loss": 0.7154, "num_input_tokens_seen": 47270912, "step": 81420 }, { "epoch": 12.127643729520406, "grad_norm": 2.2104132175445557, "learning_rate": 2.0112066013424785e-05, "loss": 0.6556, "num_input_tokens_seen": 47273888, "step": 81425 }, { "epoch": 12.128388442061365, "grad_norm": 1.696358561515808, "learning_rate": 2.010887934631882e-05, "loss": 0.5856, "num_input_tokens_seen": 47276672, "step": 81430 }, { "epoch": 12.129133154602323, "grad_norm": 1.540825605392456, "learning_rate": 2.0105692761843375e-05, "loss": 0.5271, "num_input_tokens_seen": 47279424, "step": 81435 }, { "epoch": 12.129877867143282, "grad_norm": 2.7628471851348877, "learning_rate": 2.0102506260052273e-05, "loss": 0.5137, "num_input_tokens_seen": 47282432, "step": 81440 }, { "epoch": 12.130622579684243, "grad_norm": 1.1612974405288696, "learning_rate": 2.0099319840999343e-05, "loss": 0.5652, "num_input_tokens_seen": 47285472, "step": 81445 }, { "epoch": 12.131367292225201, "grad_norm": 1.2804522514343262, "learning_rate": 2.0096133504738428e-05, "loss": 0.7303, "num_input_tokens_seen": 47288224, "step": 81450 }, { "epoch": 12.13211200476616, "grad_norm": 1.8629794120788574, "learning_rate": 2.009294725132335e-05, "loss": 0.6247, "num_input_tokens_seen": 47290976, "step": 81455 }, { "epoch": 12.132856717307119, "grad_norm": 0.555899441242218, "learning_rate": 2.0089761080807945e-05, "loss": 0.4557, "num_input_tokens_seen": 47294048, "step": 81460 }, { "epoch": 12.13360142984808, "grad_norm": 1.7589097023010254, "learning_rate": 2.0086574993246034e-05, "loss": 0.5612, "num_input_tokens_seen": 47296800, "step": 81465 }, { "epoch": 12.134346142389038, "grad_norm": 2.44280743598938, "learning_rate": 2.008338898869145e-05, "loss": 0.6414, "num_input_tokens_seen": 47299648, "step": 81470 }, { "epoch": 12.135090854929997, "grad_norm": 0.8429486751556396, "learning_rate": 2.0080203067198003e-05, "loss": 0.5925, "num_input_tokens_seen": 47302528, "step": 81475 }, { "epoch": 12.135835567470956, "grad_norm": 1.4165363311767578, "learning_rate": 2.0077017228819534e-05, "loss": 0.6615, "num_input_tokens_seen": 47305280, "step": 81480 }, { "epoch": 12.136580280011916, "grad_norm": 1.519932508468628, "learning_rate": 2.0073831473609855e-05, "loss": 0.6953, "num_input_tokens_seen": 47308032, "step": 81485 }, { "epoch": 12.137324992552875, "grad_norm": 1.5785484313964844, "learning_rate": 2.007064580162278e-05, "loss": 0.6939, "num_input_tokens_seen": 47310944, "step": 81490 }, { "epoch": 12.138069705093834, "grad_norm": 1.6837010383605957, "learning_rate": 2.0067460212912137e-05, "loss": 0.529, "num_input_tokens_seen": 47314336, "step": 81495 }, { "epoch": 12.138814417634793, "grad_norm": 1.1462293863296509, "learning_rate": 2.006427470753174e-05, "loss": 0.5633, "num_input_tokens_seen": 47317312, "step": 81500 }, { "epoch": 12.139559130175751, "grad_norm": 1.388168454170227, "learning_rate": 2.006108928553541e-05, "loss": 0.7014, "num_input_tokens_seen": 47320480, "step": 81505 }, { "epoch": 12.140303842716712, "grad_norm": 2.437725305557251, "learning_rate": 2.0057903946976944e-05, "loss": 0.8751, "num_input_tokens_seen": 47323648, "step": 81510 }, { "epoch": 12.14104855525767, "grad_norm": 1.1915708780288696, "learning_rate": 2.0054718691910178e-05, "loss": 0.517, "num_input_tokens_seen": 47326688, "step": 81515 }, { "epoch": 12.14179326779863, "grad_norm": 0.9420854449272156, "learning_rate": 2.0051533520388918e-05, "loss": 0.471, "num_input_tokens_seen": 47329568, "step": 81520 }, { "epoch": 12.142537980339588, "grad_norm": 1.876585602760315, "learning_rate": 2.0048348432466963e-05, "loss": 0.536, "num_input_tokens_seen": 47332352, "step": 81525 }, { "epoch": 12.143282692880549, "grad_norm": 1.2121508121490479, "learning_rate": 2.004516342819813e-05, "loss": 0.4085, "num_input_tokens_seen": 47335200, "step": 81530 }, { "epoch": 12.144027405421507, "grad_norm": 0.7967005372047424, "learning_rate": 2.0041978507636222e-05, "loss": 0.391, "num_input_tokens_seen": 47337984, "step": 81535 }, { "epoch": 12.144772117962466, "grad_norm": 2.0481321811676025, "learning_rate": 2.0038793670835054e-05, "loss": 0.7331, "num_input_tokens_seen": 47340864, "step": 81540 }, { "epoch": 12.145516830503425, "grad_norm": 1.7820682525634766, "learning_rate": 2.0035608917848415e-05, "loss": 0.619, "num_input_tokens_seen": 47343584, "step": 81545 }, { "epoch": 12.146261543044385, "grad_norm": 2.037396192550659, "learning_rate": 2.0032424248730124e-05, "loss": 0.5708, "num_input_tokens_seen": 47346368, "step": 81550 }, { "epoch": 12.147006255585344, "grad_norm": 1.581576943397522, "learning_rate": 2.0029239663533977e-05, "loss": 0.5796, "num_input_tokens_seen": 47349120, "step": 81555 }, { "epoch": 12.147750968126303, "grad_norm": 1.4230527877807617, "learning_rate": 2.0026055162313778e-05, "loss": 0.5854, "num_input_tokens_seen": 47352032, "step": 81560 }, { "epoch": 12.148495680667262, "grad_norm": 1.421838402748108, "learning_rate": 2.002287074512332e-05, "loss": 0.6017, "num_input_tokens_seen": 47354688, "step": 81565 }, { "epoch": 12.149240393208222, "grad_norm": 0.7668560147285461, "learning_rate": 2.001968641201639e-05, "loss": 0.6838, "num_input_tokens_seen": 47357568, "step": 81570 }, { "epoch": 12.149985105749181, "grad_norm": 1.1514785289764404, "learning_rate": 2.0016502163046815e-05, "loss": 0.5863, "num_input_tokens_seen": 47360960, "step": 81575 }, { "epoch": 12.15072981829014, "grad_norm": 1.5453307628631592, "learning_rate": 2.0013317998268352e-05, "loss": 0.5902, "num_input_tokens_seen": 47363712, "step": 81580 }, { "epoch": 12.151474530831099, "grad_norm": 1.9363229274749756, "learning_rate": 2.0010133917734825e-05, "loss": 0.5768, "num_input_tokens_seen": 47366432, "step": 81585 }, { "epoch": 12.152219243372059, "grad_norm": 1.740958571434021, "learning_rate": 2.0006949921500012e-05, "loss": 0.6157, "num_input_tokens_seen": 47369088, "step": 81590 }, { "epoch": 12.152963955913018, "grad_norm": 2.087132215499878, "learning_rate": 2.0003766009617707e-05, "loss": 0.5949, "num_input_tokens_seen": 47371936, "step": 81595 }, { "epoch": 12.153708668453977, "grad_norm": 1.109649419784546, "learning_rate": 2.00005821821417e-05, "loss": 0.5518, "num_input_tokens_seen": 47374752, "step": 81600 }, { "epoch": 12.154453380994935, "grad_norm": 1.2465052604675293, "learning_rate": 1.9997398439125763e-05, "loss": 0.5949, "num_input_tokens_seen": 47377504, "step": 81605 }, { "epoch": 12.155198093535896, "grad_norm": 1.2266652584075928, "learning_rate": 1.999421478062371e-05, "loss": 0.6892, "num_input_tokens_seen": 47380384, "step": 81610 }, { "epoch": 12.155942806076855, "grad_norm": 2.2497241497039795, "learning_rate": 1.9991031206689294e-05, "loss": 0.5038, "num_input_tokens_seen": 47383456, "step": 81615 }, { "epoch": 12.156687518617813, "grad_norm": 1.512585163116455, "learning_rate": 1.998784771737633e-05, "loss": 0.5764, "num_input_tokens_seen": 47386464, "step": 81620 }, { "epoch": 12.157432231158772, "grad_norm": 1.8124682903289795, "learning_rate": 1.9984664312738578e-05, "loss": 0.5808, "num_input_tokens_seen": 47389504, "step": 81625 }, { "epoch": 12.158176943699733, "grad_norm": 1.610541820526123, "learning_rate": 1.9981480992829832e-05, "loss": 0.6806, "num_input_tokens_seen": 47392192, "step": 81630 }, { "epoch": 12.158921656240691, "grad_norm": 1.6068228483200073, "learning_rate": 1.997829775770385e-05, "loss": 0.6774, "num_input_tokens_seen": 47395168, "step": 81635 }, { "epoch": 12.15966636878165, "grad_norm": 1.4695155620574951, "learning_rate": 1.9975114607414434e-05, "loss": 0.5818, "num_input_tokens_seen": 47398208, "step": 81640 }, { "epoch": 12.160411081322609, "grad_norm": 2.1011507511138916, "learning_rate": 1.9971931542015355e-05, "loss": 0.748, "num_input_tokens_seen": 47401216, "step": 81645 }, { "epoch": 12.16115579386357, "grad_norm": 1.8539808988571167, "learning_rate": 1.9968748561560366e-05, "loss": 0.5757, "num_input_tokens_seen": 47404320, "step": 81650 }, { "epoch": 12.161900506404528, "grad_norm": 2.342689275741577, "learning_rate": 1.996556566610327e-05, "loss": 0.5393, "num_input_tokens_seen": 47407296, "step": 81655 }, { "epoch": 12.162645218945487, "grad_norm": 1.4717727899551392, "learning_rate": 1.996238285569782e-05, "loss": 0.6551, "num_input_tokens_seen": 47410048, "step": 81660 }, { "epoch": 12.163389931486446, "grad_norm": 1.5657951831817627, "learning_rate": 1.9959200130397795e-05, "loss": 0.7188, "num_input_tokens_seen": 47412896, "step": 81665 }, { "epoch": 12.164134644027406, "grad_norm": 1.6443746089935303, "learning_rate": 1.995601749025695e-05, "loss": 0.4886, "num_input_tokens_seen": 47415840, "step": 81670 }, { "epoch": 12.164879356568365, "grad_norm": 1.6619441509246826, "learning_rate": 1.9952834935329077e-05, "loss": 0.5347, "num_input_tokens_seen": 47418528, "step": 81675 }, { "epoch": 12.165624069109324, "grad_norm": 2.423558473587036, "learning_rate": 1.9949652465667915e-05, "loss": 0.7633, "num_input_tokens_seen": 47421376, "step": 81680 }, { "epoch": 12.166368781650283, "grad_norm": 1.0916962623596191, "learning_rate": 1.9946470081327253e-05, "loss": 0.4894, "num_input_tokens_seen": 47424768, "step": 81685 }, { "epoch": 12.167113494191241, "grad_norm": 1.4659931659698486, "learning_rate": 1.9943287782360844e-05, "loss": 0.6435, "num_input_tokens_seen": 47427744, "step": 81690 }, { "epoch": 12.167858206732202, "grad_norm": 1.1615310907363892, "learning_rate": 1.9940105568822437e-05, "loss": 0.6418, "num_input_tokens_seen": 47430464, "step": 81695 }, { "epoch": 12.16860291927316, "grad_norm": 1.1904296875, "learning_rate": 1.9936923440765813e-05, "loss": 0.5152, "num_input_tokens_seen": 47433184, "step": 81700 }, { "epoch": 12.16934763181412, "grad_norm": 1.5392519235610962, "learning_rate": 1.9933741398244714e-05, "loss": 0.4532, "num_input_tokens_seen": 47435968, "step": 81705 }, { "epoch": 12.170092344355078, "grad_norm": 2.209907293319702, "learning_rate": 1.9930559441312913e-05, "loss": 0.9075, "num_input_tokens_seen": 47438880, "step": 81710 }, { "epoch": 12.170837056896039, "grad_norm": 1.1552485227584839, "learning_rate": 1.9927377570024146e-05, "loss": 0.4923, "num_input_tokens_seen": 47441824, "step": 81715 }, { "epoch": 12.171581769436997, "grad_norm": 1.671122670173645, "learning_rate": 1.9924195784432192e-05, "loss": 0.5914, "num_input_tokens_seen": 47444672, "step": 81720 }, { "epoch": 12.172326481977956, "grad_norm": 1.0747236013412476, "learning_rate": 1.992101408459079e-05, "loss": 0.6592, "num_input_tokens_seen": 47447360, "step": 81725 }, { "epoch": 12.173071194518915, "grad_norm": 0.6906753182411194, "learning_rate": 1.9917832470553692e-05, "loss": 0.3652, "num_input_tokens_seen": 47450176, "step": 81730 }, { "epoch": 12.173815907059875, "grad_norm": 1.843654990196228, "learning_rate": 1.9914650942374648e-05, "loss": 0.5998, "num_input_tokens_seen": 47453088, "step": 81735 }, { "epoch": 12.174560619600834, "grad_norm": 2.046602249145508, "learning_rate": 1.9911469500107398e-05, "loss": 0.6862, "num_input_tokens_seen": 47455936, "step": 81740 }, { "epoch": 12.175305332141793, "grad_norm": 1.8359261751174927, "learning_rate": 1.9908288143805714e-05, "loss": 0.5863, "num_input_tokens_seen": 47458880, "step": 81745 }, { "epoch": 12.176050044682752, "grad_norm": 1.7794196605682373, "learning_rate": 1.9905106873523316e-05, "loss": 0.6171, "num_input_tokens_seen": 47462112, "step": 81750 }, { "epoch": 12.176794757223712, "grad_norm": 1.233906865119934, "learning_rate": 1.9901925689313967e-05, "loss": 0.7453, "num_input_tokens_seen": 47464832, "step": 81755 }, { "epoch": 12.177539469764671, "grad_norm": 1.314401388168335, "learning_rate": 1.9898744591231396e-05, "loss": 0.6491, "num_input_tokens_seen": 47467808, "step": 81760 }, { "epoch": 12.17828418230563, "grad_norm": 1.3979637622833252, "learning_rate": 1.989556357932936e-05, "loss": 0.6368, "num_input_tokens_seen": 47470752, "step": 81765 }, { "epoch": 12.179028894846589, "grad_norm": 1.7094672918319702, "learning_rate": 1.9892382653661584e-05, "loss": 0.6564, "num_input_tokens_seen": 47473696, "step": 81770 }, { "epoch": 12.179773607387549, "grad_norm": 1.0335367918014526, "learning_rate": 1.9889201814281804e-05, "loss": 0.5708, "num_input_tokens_seen": 47476864, "step": 81775 }, { "epoch": 12.180518319928508, "grad_norm": 1.3496694564819336, "learning_rate": 1.9886021061243775e-05, "loss": 0.5577, "num_input_tokens_seen": 47479744, "step": 81780 }, { "epoch": 12.181263032469467, "grad_norm": 1.776803731918335, "learning_rate": 1.9882840394601213e-05, "loss": 0.5495, "num_input_tokens_seen": 47482464, "step": 81785 }, { "epoch": 12.182007745010425, "grad_norm": 2.878448009490967, "learning_rate": 1.987965981440787e-05, "loss": 0.5204, "num_input_tokens_seen": 47485408, "step": 81790 }, { "epoch": 12.182752457551386, "grad_norm": 1.0797933340072632, "learning_rate": 1.987647932071747e-05, "loss": 0.559, "num_input_tokens_seen": 47488192, "step": 81795 }, { "epoch": 12.183497170092345, "grad_norm": 1.3420281410217285, "learning_rate": 1.9873298913583746e-05, "loss": 0.5291, "num_input_tokens_seen": 47491040, "step": 81800 }, { "epoch": 12.184241882633303, "grad_norm": 2.7473740577697754, "learning_rate": 1.987011859306043e-05, "loss": 0.5251, "num_input_tokens_seen": 47493856, "step": 81805 }, { "epoch": 12.184986595174262, "grad_norm": 2.035234212875366, "learning_rate": 1.986693835920123e-05, "loss": 0.5862, "num_input_tokens_seen": 47496608, "step": 81810 }, { "epoch": 12.185731307715223, "grad_norm": 1.1676056385040283, "learning_rate": 1.9863758212059902e-05, "loss": 0.5369, "num_input_tokens_seen": 47499616, "step": 81815 }, { "epoch": 12.186476020256181, "grad_norm": 1.5409928560256958, "learning_rate": 1.9860578151690154e-05, "loss": 0.5883, "num_input_tokens_seen": 47502528, "step": 81820 }, { "epoch": 12.18722073279714, "grad_norm": 1.2028154134750366, "learning_rate": 1.9857398178145718e-05, "loss": 0.5336, "num_input_tokens_seen": 47505376, "step": 81825 }, { "epoch": 12.187965445338099, "grad_norm": 1.5742316246032715, "learning_rate": 1.985421829148031e-05, "loss": 0.5637, "num_input_tokens_seen": 47508448, "step": 81830 }, { "epoch": 12.18871015787906, "grad_norm": 1.8045599460601807, "learning_rate": 1.985103849174766e-05, "loss": 0.7415, "num_input_tokens_seen": 47511296, "step": 81835 }, { "epoch": 12.189454870420018, "grad_norm": 1.2883682250976562, "learning_rate": 1.984785877900147e-05, "loss": 0.475, "num_input_tokens_seen": 47514176, "step": 81840 }, { "epoch": 12.190199582960977, "grad_norm": 1.020601749420166, "learning_rate": 1.984467915329548e-05, "loss": 0.553, "num_input_tokens_seen": 47517216, "step": 81845 }, { "epoch": 12.190944295501936, "grad_norm": 2.090512752532959, "learning_rate": 1.9841499614683394e-05, "loss": 0.5825, "num_input_tokens_seen": 47520256, "step": 81850 }, { "epoch": 12.191689008042895, "grad_norm": 2.0465798377990723, "learning_rate": 1.9838320163218927e-05, "loss": 0.6119, "num_input_tokens_seen": 47523008, "step": 81855 }, { "epoch": 12.192433720583855, "grad_norm": 0.9631640911102295, "learning_rate": 1.98351407989558e-05, "loss": 0.4378, "num_input_tokens_seen": 47526048, "step": 81860 }, { "epoch": 12.193178433124814, "grad_norm": 1.9913346767425537, "learning_rate": 1.983196152194771e-05, "loss": 0.5816, "num_input_tokens_seen": 47528928, "step": 81865 }, { "epoch": 12.193923145665773, "grad_norm": 1.2955546379089355, "learning_rate": 1.9828782332248385e-05, "loss": 0.5879, "num_input_tokens_seen": 47531904, "step": 81870 }, { "epoch": 12.194667858206731, "grad_norm": 1.8888579607009888, "learning_rate": 1.982560322991152e-05, "loss": 0.7752, "num_input_tokens_seen": 47534848, "step": 81875 }, { "epoch": 12.195412570747692, "grad_norm": 0.8366089463233948, "learning_rate": 1.9822424214990837e-05, "loss": 0.5627, "num_input_tokens_seen": 47537824, "step": 81880 }, { "epoch": 12.19615728328865, "grad_norm": 1.3049614429473877, "learning_rate": 1.9819245287540036e-05, "loss": 0.5659, "num_input_tokens_seen": 47540768, "step": 81885 }, { "epoch": 12.19690199582961, "grad_norm": 1.2499287128448486, "learning_rate": 1.9816066447612815e-05, "loss": 0.5277, "num_input_tokens_seen": 47543968, "step": 81890 }, { "epoch": 12.197646708370568, "grad_norm": 1.1178134679794312, "learning_rate": 1.9812887695262887e-05, "loss": 0.64, "num_input_tokens_seen": 47546592, "step": 81895 }, { "epoch": 12.198391420911529, "grad_norm": 1.2354460954666138, "learning_rate": 1.980970903054394e-05, "loss": 0.4703, "num_input_tokens_seen": 47549728, "step": 81900 }, { "epoch": 12.199136133452487, "grad_norm": 1.9107295274734497, "learning_rate": 1.9806530453509693e-05, "loss": 0.6278, "num_input_tokens_seen": 47554112, "step": 81905 }, { "epoch": 12.199880845993446, "grad_norm": 1.9169702529907227, "learning_rate": 1.9803351964213827e-05, "loss": 0.7344, "num_input_tokens_seen": 47556992, "step": 81910 }, { "epoch": 12.200625558534405, "grad_norm": 1.5702950954437256, "learning_rate": 1.9800173562710055e-05, "loss": 0.4517, "num_input_tokens_seen": 47559808, "step": 81915 }, { "epoch": 12.201370271075366, "grad_norm": 1.9213887453079224, "learning_rate": 1.9796995249052064e-05, "loss": 0.6313, "num_input_tokens_seen": 47562976, "step": 81920 }, { "epoch": 12.202114983616324, "grad_norm": 1.5436447858810425, "learning_rate": 1.9793817023293555e-05, "loss": 0.6448, "num_input_tokens_seen": 47566016, "step": 81925 }, { "epoch": 12.202859696157283, "grad_norm": 1.8817700147628784, "learning_rate": 1.9790638885488216e-05, "loss": 0.5598, "num_input_tokens_seen": 47569088, "step": 81930 }, { "epoch": 12.203604408698242, "grad_norm": 1.5064945220947266, "learning_rate": 1.9787460835689726e-05, "loss": 0.5905, "num_input_tokens_seen": 47572000, "step": 81935 }, { "epoch": 12.204349121239202, "grad_norm": 2.5853381156921387, "learning_rate": 1.9784282873951797e-05, "loss": 0.7599, "num_input_tokens_seen": 47574752, "step": 81940 }, { "epoch": 12.205093833780161, "grad_norm": 1.345345139503479, "learning_rate": 1.97811050003281e-05, "loss": 0.582, "num_input_tokens_seen": 47577920, "step": 81945 }, { "epoch": 12.20583854632112, "grad_norm": 1.5359712839126587, "learning_rate": 1.977792721487234e-05, "loss": 0.5988, "num_input_tokens_seen": 47580992, "step": 81950 }, { "epoch": 12.206583258862079, "grad_norm": 1.8419548273086548, "learning_rate": 1.9774749517638188e-05, "loss": 0.5398, "num_input_tokens_seen": 47583808, "step": 81955 }, { "epoch": 12.20732797140304, "grad_norm": 1.468344807624817, "learning_rate": 1.9771571908679337e-05, "loss": 0.7204, "num_input_tokens_seen": 47586784, "step": 81960 }, { "epoch": 12.208072683943998, "grad_norm": 1.4117789268493652, "learning_rate": 1.976839438804946e-05, "loss": 0.5589, "num_input_tokens_seen": 47589728, "step": 81965 }, { "epoch": 12.208817396484957, "grad_norm": 0.9124284386634827, "learning_rate": 1.976521695580224e-05, "loss": 0.56, "num_input_tokens_seen": 47592672, "step": 81970 }, { "epoch": 12.209562109025915, "grad_norm": 1.1693673133850098, "learning_rate": 1.9762039611991365e-05, "loss": 0.7049, "num_input_tokens_seen": 47595808, "step": 81975 }, { "epoch": 12.210306821566876, "grad_norm": 1.5464779138565063, "learning_rate": 1.9758862356670498e-05, "loss": 0.4633, "num_input_tokens_seen": 47598816, "step": 81980 }, { "epoch": 12.211051534107835, "grad_norm": 1.650213360786438, "learning_rate": 1.9755685189893332e-05, "loss": 0.591, "num_input_tokens_seen": 47601952, "step": 81985 }, { "epoch": 12.211796246648793, "grad_norm": 1.1497286558151245, "learning_rate": 1.9752508111713532e-05, "loss": 0.7202, "num_input_tokens_seen": 47604736, "step": 81990 }, { "epoch": 12.212540959189752, "grad_norm": 1.3118840456008911, "learning_rate": 1.974933112218478e-05, "loss": 0.5957, "num_input_tokens_seen": 47607680, "step": 81995 }, { "epoch": 12.213285671730713, "grad_norm": 1.937381386756897, "learning_rate": 1.9746154221360732e-05, "loss": 0.616, "num_input_tokens_seen": 47610560, "step": 82000 }, { "epoch": 12.214030384271672, "grad_norm": 2.2972137928009033, "learning_rate": 1.9742977409295076e-05, "loss": 0.6659, "num_input_tokens_seen": 47613376, "step": 82005 }, { "epoch": 12.21477509681263, "grad_norm": 1.1386549472808838, "learning_rate": 1.9739800686041478e-05, "loss": 0.4246, "num_input_tokens_seen": 47616128, "step": 82010 }, { "epoch": 12.215519809353589, "grad_norm": 2.038139581680298, "learning_rate": 1.973662405165359e-05, "loss": 0.6221, "num_input_tokens_seen": 47619136, "step": 82015 }, { "epoch": 12.216264521894548, "grad_norm": 1.5050642490386963, "learning_rate": 1.9733447506185095e-05, "loss": 0.7416, "num_input_tokens_seen": 47622080, "step": 82020 }, { "epoch": 12.217009234435508, "grad_norm": 1.1660078763961792, "learning_rate": 1.973027104968965e-05, "loss": 0.5531, "num_input_tokens_seen": 47624832, "step": 82025 }, { "epoch": 12.217753946976467, "grad_norm": 1.925214171409607, "learning_rate": 1.9727094682220925e-05, "loss": 0.5193, "num_input_tokens_seen": 47627712, "step": 82030 }, { "epoch": 12.218498659517426, "grad_norm": 1.467390537261963, "learning_rate": 1.9723918403832565e-05, "loss": 0.5675, "num_input_tokens_seen": 47630688, "step": 82035 }, { "epoch": 12.219243372058385, "grad_norm": 2.1239736080169678, "learning_rate": 1.9720742214578254e-05, "loss": 0.7147, "num_input_tokens_seen": 47633824, "step": 82040 }, { "epoch": 12.219988084599345, "grad_norm": 1.3511450290679932, "learning_rate": 1.9717566114511634e-05, "loss": 0.4902, "num_input_tokens_seen": 47636576, "step": 82045 }, { "epoch": 12.220732797140304, "grad_norm": 1.1619460582733154, "learning_rate": 1.9714390103686355e-05, "loss": 0.6946, "num_input_tokens_seen": 47639232, "step": 82050 }, { "epoch": 12.221477509681263, "grad_norm": 2.4201838970184326, "learning_rate": 1.9711214182156096e-05, "loss": 0.4163, "num_input_tokens_seen": 47642240, "step": 82055 }, { "epoch": 12.222222222222221, "grad_norm": 1.8672748804092407, "learning_rate": 1.9708038349974494e-05, "loss": 0.6508, "num_input_tokens_seen": 47644960, "step": 82060 }, { "epoch": 12.222966934763182, "grad_norm": 1.8442686796188354, "learning_rate": 1.9704862607195207e-05, "loss": 0.7521, "num_input_tokens_seen": 47647904, "step": 82065 }, { "epoch": 12.22371164730414, "grad_norm": 1.2310954332351685, "learning_rate": 1.970168695387188e-05, "loss": 0.6188, "num_input_tokens_seen": 47650752, "step": 82070 }, { "epoch": 12.2244563598451, "grad_norm": 1.929387092590332, "learning_rate": 1.9698511390058172e-05, "loss": 0.6743, "num_input_tokens_seen": 47653952, "step": 82075 }, { "epoch": 12.225201072386058, "grad_norm": 1.0916143655776978, "learning_rate": 1.9695335915807716e-05, "loss": 0.6273, "num_input_tokens_seen": 47656832, "step": 82080 }, { "epoch": 12.225945784927019, "grad_norm": 1.6405407190322876, "learning_rate": 1.969216053117418e-05, "loss": 0.7996, "num_input_tokens_seen": 47659584, "step": 82085 }, { "epoch": 12.226690497467978, "grad_norm": 1.367045521736145, "learning_rate": 1.9688985236211197e-05, "loss": 0.5126, "num_input_tokens_seen": 47662400, "step": 82090 }, { "epoch": 12.227435210008936, "grad_norm": 2.690114974975586, "learning_rate": 1.9685810030972405e-05, "loss": 0.8304, "num_input_tokens_seen": 47665600, "step": 82095 }, { "epoch": 12.228179922549895, "grad_norm": 1.1013820171356201, "learning_rate": 1.9682634915511455e-05, "loss": 0.58, "num_input_tokens_seen": 47668768, "step": 82100 }, { "epoch": 12.228924635090856, "grad_norm": 1.0763511657714844, "learning_rate": 1.9679459889881977e-05, "loss": 0.5172, "num_input_tokens_seen": 47671616, "step": 82105 }, { "epoch": 12.229669347631814, "grad_norm": 1.7190673351287842, "learning_rate": 1.9676284954137624e-05, "loss": 0.6424, "num_input_tokens_seen": 47674432, "step": 82110 }, { "epoch": 12.230414060172773, "grad_norm": 1.9380362033843994, "learning_rate": 1.9673110108332014e-05, "loss": 0.7235, "num_input_tokens_seen": 47677152, "step": 82115 }, { "epoch": 12.231158772713732, "grad_norm": 2.006706953048706, "learning_rate": 1.966993535251881e-05, "loss": 0.5323, "num_input_tokens_seen": 47680160, "step": 82120 }, { "epoch": 12.231903485254692, "grad_norm": 0.9102163910865784, "learning_rate": 1.966676068675163e-05, "loss": 0.5681, "num_input_tokens_seen": 47682848, "step": 82125 }, { "epoch": 12.232648197795651, "grad_norm": 1.5032001733779907, "learning_rate": 1.96635861110841e-05, "loss": 0.6692, "num_input_tokens_seen": 47685728, "step": 82130 }, { "epoch": 12.23339291033661, "grad_norm": 0.9790263175964355, "learning_rate": 1.9660411625569867e-05, "loss": 0.6048, "num_input_tokens_seen": 47688384, "step": 82135 }, { "epoch": 12.234137622877569, "grad_norm": 1.2772544622421265, "learning_rate": 1.965723723026254e-05, "loss": 0.4935, "num_input_tokens_seen": 47691136, "step": 82140 }, { "epoch": 12.23488233541853, "grad_norm": 1.81697678565979, "learning_rate": 1.965406292521577e-05, "loss": 0.5445, "num_input_tokens_seen": 47694144, "step": 82145 }, { "epoch": 12.235627047959488, "grad_norm": 2.902151107788086, "learning_rate": 1.965088871048317e-05, "loss": 0.6142, "num_input_tokens_seen": 47697408, "step": 82150 }, { "epoch": 12.236371760500447, "grad_norm": 1.8818660974502563, "learning_rate": 1.964771458611837e-05, "loss": 0.8249, "num_input_tokens_seen": 47700384, "step": 82155 }, { "epoch": 12.237116473041405, "grad_norm": 1.8209576606750488, "learning_rate": 1.964454055217499e-05, "loss": 0.6647, "num_input_tokens_seen": 47704032, "step": 82160 }, { "epoch": 12.237861185582366, "grad_norm": 0.8739141225814819, "learning_rate": 1.9641366608706656e-05, "loss": 0.6502, "num_input_tokens_seen": 47706880, "step": 82165 }, { "epoch": 12.238605898123325, "grad_norm": 1.696750283241272, "learning_rate": 1.9638192755766993e-05, "loss": 0.6282, "num_input_tokens_seen": 47709696, "step": 82170 }, { "epoch": 12.239350610664284, "grad_norm": 0.8625497221946716, "learning_rate": 1.9635018993409602e-05, "loss": 0.6576, "num_input_tokens_seen": 47712448, "step": 82175 }, { "epoch": 12.240095323205242, "grad_norm": 0.9016672372817993, "learning_rate": 1.963184532168812e-05, "loss": 0.6722, "num_input_tokens_seen": 47715328, "step": 82180 }, { "epoch": 12.240840035746203, "grad_norm": 1.4064210653305054, "learning_rate": 1.9628671740656154e-05, "loss": 0.4224, "num_input_tokens_seen": 47718272, "step": 82185 }, { "epoch": 12.241584748287162, "grad_norm": 1.0567982196807861, "learning_rate": 1.962549825036732e-05, "loss": 0.6476, "num_input_tokens_seen": 47721248, "step": 82190 }, { "epoch": 12.24232946082812, "grad_norm": 1.7002333402633667, "learning_rate": 1.9622324850875227e-05, "loss": 0.5449, "num_input_tokens_seen": 47724000, "step": 82195 }, { "epoch": 12.243074173369079, "grad_norm": 1.0940401554107666, "learning_rate": 1.9619151542233494e-05, "loss": 0.7798, "num_input_tokens_seen": 47726848, "step": 82200 }, { "epoch": 12.243818885910038, "grad_norm": 0.8943315148353577, "learning_rate": 1.9615978324495733e-05, "loss": 0.611, "num_input_tokens_seen": 47730048, "step": 82205 }, { "epoch": 12.244563598450998, "grad_norm": 1.2829725742340088, "learning_rate": 1.961280519771553e-05, "loss": 0.4953, "num_input_tokens_seen": 47732608, "step": 82210 }, { "epoch": 12.245308310991957, "grad_norm": 0.6863623261451721, "learning_rate": 1.960963216194652e-05, "loss": 0.5381, "num_input_tokens_seen": 47735840, "step": 82215 }, { "epoch": 12.246053023532916, "grad_norm": 1.0504904985427856, "learning_rate": 1.960645921724229e-05, "loss": 0.4093, "num_input_tokens_seen": 47738816, "step": 82220 }, { "epoch": 12.246797736073875, "grad_norm": 1.3118282556533813, "learning_rate": 1.960328636365646e-05, "loss": 0.5665, "num_input_tokens_seen": 47741952, "step": 82225 }, { "epoch": 12.247542448614835, "grad_norm": 1.1387856006622314, "learning_rate": 1.9600113601242605e-05, "loss": 0.5709, "num_input_tokens_seen": 47744928, "step": 82230 }, { "epoch": 12.248287161155794, "grad_norm": 2.4156675338745117, "learning_rate": 1.9596940930054358e-05, "loss": 0.532, "num_input_tokens_seen": 47747616, "step": 82235 }, { "epoch": 12.249031873696753, "grad_norm": 2.9317309856414795, "learning_rate": 1.9593768350145288e-05, "loss": 0.5219, "num_input_tokens_seen": 47750656, "step": 82240 }, { "epoch": 12.249776586237711, "grad_norm": 2.6361641883850098, "learning_rate": 1.9590595861569023e-05, "loss": 0.5835, "num_input_tokens_seen": 47753504, "step": 82245 }, { "epoch": 12.250521298778672, "grad_norm": 1.4487051963806152, "learning_rate": 1.9587423464379136e-05, "loss": 0.555, "num_input_tokens_seen": 47756256, "step": 82250 }, { "epoch": 12.25126601131963, "grad_norm": 1.487475872039795, "learning_rate": 1.9584251158629228e-05, "loss": 0.4655, "num_input_tokens_seen": 47759200, "step": 82255 }, { "epoch": 12.25201072386059, "grad_norm": 2.7333662509918213, "learning_rate": 1.9581078944372897e-05, "loss": 0.7192, "num_input_tokens_seen": 47762240, "step": 82260 }, { "epoch": 12.252755436401548, "grad_norm": 0.8323240280151367, "learning_rate": 1.957790682166372e-05, "loss": 0.487, "num_input_tokens_seen": 47765408, "step": 82265 }, { "epoch": 12.253500148942509, "grad_norm": 1.547951579093933, "learning_rate": 1.9574734790555305e-05, "loss": 0.6436, "num_input_tokens_seen": 47768224, "step": 82270 }, { "epoch": 12.254244861483468, "grad_norm": 1.1018075942993164, "learning_rate": 1.9571562851101223e-05, "loss": 0.5768, "num_input_tokens_seen": 47771296, "step": 82275 }, { "epoch": 12.254989574024426, "grad_norm": 2.1929163932800293, "learning_rate": 1.9568391003355073e-05, "loss": 0.6859, "num_input_tokens_seen": 47774432, "step": 82280 }, { "epoch": 12.255734286565385, "grad_norm": 2.6633944511413574, "learning_rate": 1.956521924737044e-05, "loss": 0.76, "num_input_tokens_seen": 47777216, "step": 82285 }, { "epoch": 12.256478999106346, "grad_norm": 1.7144726514816284, "learning_rate": 1.9562047583200906e-05, "loss": 0.6372, "num_input_tokens_seen": 47779872, "step": 82290 }, { "epoch": 12.257223711647304, "grad_norm": 2.436722755432129, "learning_rate": 1.955887601090005e-05, "loss": 0.7543, "num_input_tokens_seen": 47782720, "step": 82295 }, { "epoch": 12.257968424188263, "grad_norm": 1.6275968551635742, "learning_rate": 1.9555704530521445e-05, "loss": 0.5895, "num_input_tokens_seen": 47785280, "step": 82300 }, { "epoch": 12.258713136729222, "grad_norm": 2.004194498062134, "learning_rate": 1.955253314211869e-05, "loss": 0.5631, "num_input_tokens_seen": 47788096, "step": 82305 }, { "epoch": 12.259457849270182, "grad_norm": 1.3319154977798462, "learning_rate": 1.9549361845745338e-05, "loss": 0.5934, "num_input_tokens_seen": 47791232, "step": 82310 }, { "epoch": 12.260202561811141, "grad_norm": 1.8869457244873047, "learning_rate": 1.9546190641454993e-05, "loss": 0.6772, "num_input_tokens_seen": 47794336, "step": 82315 }, { "epoch": 12.2609472743521, "grad_norm": 2.2269341945648193, "learning_rate": 1.9543019529301203e-05, "loss": 0.857, "num_input_tokens_seen": 47797280, "step": 82320 }, { "epoch": 12.261691986893059, "grad_norm": 2.0869529247283936, "learning_rate": 1.953984850933756e-05, "loss": 0.7398, "num_input_tokens_seen": 47800224, "step": 82325 }, { "epoch": 12.26243669943402, "grad_norm": 0.950422465801239, "learning_rate": 1.953667758161763e-05, "loss": 0.6468, "num_input_tokens_seen": 47802880, "step": 82330 }, { "epoch": 12.263181411974978, "grad_norm": 1.2286226749420166, "learning_rate": 1.9533506746194964e-05, "loss": 0.6451, "num_input_tokens_seen": 47805664, "step": 82335 }, { "epoch": 12.263926124515937, "grad_norm": 1.2325726747512817, "learning_rate": 1.9530336003123156e-05, "loss": 0.5257, "num_input_tokens_seen": 47808384, "step": 82340 }, { "epoch": 12.264670837056896, "grad_norm": 1.36738920211792, "learning_rate": 1.9527165352455755e-05, "loss": 0.4143, "num_input_tokens_seen": 47811296, "step": 82345 }, { "epoch": 12.265415549597856, "grad_norm": 2.2806038856506348, "learning_rate": 1.9523994794246344e-05, "loss": 0.5864, "num_input_tokens_seen": 47814048, "step": 82350 }, { "epoch": 12.266160262138815, "grad_norm": 1.4009400606155396, "learning_rate": 1.9520824328548465e-05, "loss": 0.5712, "num_input_tokens_seen": 47816800, "step": 82355 }, { "epoch": 12.266904974679774, "grad_norm": 0.9674972891807556, "learning_rate": 1.9517653955415698e-05, "loss": 0.6105, "num_input_tokens_seen": 47819456, "step": 82360 }, { "epoch": 12.267649687220732, "grad_norm": 0.9378535151481628, "learning_rate": 1.9514483674901586e-05, "loss": 0.7105, "num_input_tokens_seen": 47822176, "step": 82365 }, { "epoch": 12.268394399761693, "grad_norm": 1.158941388130188, "learning_rate": 1.9511313487059706e-05, "loss": 0.5898, "num_input_tokens_seen": 47825088, "step": 82370 }, { "epoch": 12.269139112302652, "grad_norm": 2.440748453140259, "learning_rate": 1.950814339194361e-05, "loss": 0.6833, "num_input_tokens_seen": 47827840, "step": 82375 }, { "epoch": 12.26988382484361, "grad_norm": 0.9466033577919006, "learning_rate": 1.9504973389606834e-05, "loss": 0.5066, "num_input_tokens_seen": 47830912, "step": 82380 }, { "epoch": 12.27062853738457, "grad_norm": 1.4355461597442627, "learning_rate": 1.9501803480102962e-05, "loss": 0.6074, "num_input_tokens_seen": 47833984, "step": 82385 }, { "epoch": 12.271373249925528, "grad_norm": 1.8151278495788574, "learning_rate": 1.9498633663485526e-05, "loss": 0.6218, "num_input_tokens_seen": 47836736, "step": 82390 }, { "epoch": 12.272117962466488, "grad_norm": 2.4089221954345703, "learning_rate": 1.9495463939808085e-05, "loss": 0.6151, "num_input_tokens_seen": 47839552, "step": 82395 }, { "epoch": 12.272862675007447, "grad_norm": 3.768872022628784, "learning_rate": 1.9492294309124183e-05, "loss": 0.6169, "num_input_tokens_seen": 47842304, "step": 82400 }, { "epoch": 12.273607387548406, "grad_norm": 4.488533020019531, "learning_rate": 1.9489124771487375e-05, "loss": 0.7816, "num_input_tokens_seen": 47845184, "step": 82405 }, { "epoch": 12.274352100089365, "grad_norm": 1.305804967880249, "learning_rate": 1.9485955326951204e-05, "loss": 0.609, "num_input_tokens_seen": 47848160, "step": 82410 }, { "epoch": 12.275096812630325, "grad_norm": 1.7180367708206177, "learning_rate": 1.9482785975569202e-05, "loss": 0.7417, "num_input_tokens_seen": 47851360, "step": 82415 }, { "epoch": 12.275841525171284, "grad_norm": 1.8734430074691772, "learning_rate": 1.9479616717394937e-05, "loss": 0.5223, "num_input_tokens_seen": 47854176, "step": 82420 }, { "epoch": 12.276586237712243, "grad_norm": 1.794216513633728, "learning_rate": 1.947644755248193e-05, "loss": 0.5636, "num_input_tokens_seen": 47857088, "step": 82425 }, { "epoch": 12.277330950253202, "grad_norm": 1.1662009954452515, "learning_rate": 1.9473278480883733e-05, "loss": 0.4573, "num_input_tokens_seen": 47859712, "step": 82430 }, { "epoch": 12.278075662794162, "grad_norm": 1.46829092502594, "learning_rate": 1.947010950265387e-05, "loss": 0.474, "num_input_tokens_seen": 47862880, "step": 82435 }, { "epoch": 12.27882037533512, "grad_norm": 2.598261833190918, "learning_rate": 1.9466940617845897e-05, "loss": 0.6793, "num_input_tokens_seen": 47866080, "step": 82440 }, { "epoch": 12.27956508787608, "grad_norm": 1.7739546298980713, "learning_rate": 1.9463771826513326e-05, "loss": 0.6773, "num_input_tokens_seen": 47868832, "step": 82445 }, { "epoch": 12.280309800417038, "grad_norm": 1.3885372877120972, "learning_rate": 1.9460603128709715e-05, "loss": 0.388, "num_input_tokens_seen": 47871648, "step": 82450 }, { "epoch": 12.281054512957999, "grad_norm": 1.4427634477615356, "learning_rate": 1.9457434524488582e-05, "loss": 0.5451, "num_input_tokens_seen": 47874592, "step": 82455 }, { "epoch": 12.281799225498958, "grad_norm": 1.2718851566314697, "learning_rate": 1.9454266013903458e-05, "loss": 0.6403, "num_input_tokens_seen": 47877952, "step": 82460 }, { "epoch": 12.282543938039916, "grad_norm": 2.087792158126831, "learning_rate": 1.945109759700788e-05, "loss": 0.7643, "num_input_tokens_seen": 47881152, "step": 82465 }, { "epoch": 12.283288650580875, "grad_norm": 1.4304778575897217, "learning_rate": 1.9447929273855354e-05, "loss": 0.6645, "num_input_tokens_seen": 47883904, "step": 82470 }, { "epoch": 12.284033363121836, "grad_norm": 1.3274610042572021, "learning_rate": 1.944476104449943e-05, "loss": 0.5468, "num_input_tokens_seen": 47886880, "step": 82475 }, { "epoch": 12.284778075662794, "grad_norm": 1.414838194847107, "learning_rate": 1.9441592908993616e-05, "loss": 0.719, "num_input_tokens_seen": 47890048, "step": 82480 }, { "epoch": 12.285522788203753, "grad_norm": 1.3067618608474731, "learning_rate": 1.9438424867391444e-05, "loss": 0.5606, "num_input_tokens_seen": 47892960, "step": 82485 }, { "epoch": 12.286267500744712, "grad_norm": 1.383410096168518, "learning_rate": 1.9435256919746436e-05, "loss": 0.8206, "num_input_tokens_seen": 47895872, "step": 82490 }, { "epoch": 12.287012213285673, "grad_norm": 1.14431893825531, "learning_rate": 1.94320890661121e-05, "loss": 0.4915, "num_input_tokens_seen": 47898912, "step": 82495 }, { "epoch": 12.287756925826631, "grad_norm": 1.2039674520492554, "learning_rate": 1.9428921306541963e-05, "loss": 0.6183, "num_input_tokens_seen": 47901760, "step": 82500 }, { "epoch": 12.28850163836759, "grad_norm": 2.016223669052124, "learning_rate": 1.9425753641089535e-05, "loss": 0.5324, "num_input_tokens_seen": 47904640, "step": 82505 }, { "epoch": 12.289246350908549, "grad_norm": 1.9512357711791992, "learning_rate": 1.9422586069808337e-05, "loss": 0.7806, "num_input_tokens_seen": 47907680, "step": 82510 }, { "epoch": 12.28999106344951, "grad_norm": 1.4171353578567505, "learning_rate": 1.941941859275188e-05, "loss": 0.6367, "num_input_tokens_seen": 47910848, "step": 82515 }, { "epoch": 12.290735775990468, "grad_norm": 1.0594029426574707, "learning_rate": 1.9416251209973672e-05, "loss": 0.4321, "num_input_tokens_seen": 47913792, "step": 82520 }, { "epoch": 12.291480488531427, "grad_norm": 2.569662094116211, "learning_rate": 1.941308392152722e-05, "loss": 0.7435, "num_input_tokens_seen": 47916672, "step": 82525 }, { "epoch": 12.292225201072386, "grad_norm": 1.5706104040145874, "learning_rate": 1.9409916727466047e-05, "loss": 0.5862, "num_input_tokens_seen": 47919712, "step": 82530 }, { "epoch": 12.292969913613344, "grad_norm": 0.7451274991035461, "learning_rate": 1.9406749627843645e-05, "loss": 0.7544, "num_input_tokens_seen": 47922592, "step": 82535 }, { "epoch": 12.293714626154305, "grad_norm": 1.1214025020599365, "learning_rate": 1.940358262271352e-05, "loss": 0.6025, "num_input_tokens_seen": 47925376, "step": 82540 }, { "epoch": 12.294459338695264, "grad_norm": 2.088618040084839, "learning_rate": 1.9400415712129184e-05, "loss": 0.5933, "num_input_tokens_seen": 47928256, "step": 82545 }, { "epoch": 12.295204051236222, "grad_norm": 1.4800161123275757, "learning_rate": 1.9397248896144127e-05, "loss": 0.5645, "num_input_tokens_seen": 47931072, "step": 82550 }, { "epoch": 12.295948763777181, "grad_norm": 1.6582846641540527, "learning_rate": 1.939408217481186e-05, "loss": 0.5888, "num_input_tokens_seen": 47933760, "step": 82555 }, { "epoch": 12.296693476318142, "grad_norm": 1.3045654296875, "learning_rate": 1.939091554818587e-05, "loss": 0.6775, "num_input_tokens_seen": 47936832, "step": 82560 }, { "epoch": 12.2974381888591, "grad_norm": 1.3022642135620117, "learning_rate": 1.9387749016319673e-05, "loss": 0.5957, "num_input_tokens_seen": 47939872, "step": 82565 }, { "epoch": 12.29818290140006, "grad_norm": 1.7544941902160645, "learning_rate": 1.938458257926675e-05, "loss": 0.7811, "num_input_tokens_seen": 47943072, "step": 82570 }, { "epoch": 12.298927613941018, "grad_norm": 0.7967035174369812, "learning_rate": 1.9381416237080586e-05, "loss": 0.5192, "num_input_tokens_seen": 47945888, "step": 82575 }, { "epoch": 12.299672326481979, "grad_norm": 1.399993896484375, "learning_rate": 1.9378249989814697e-05, "loss": 0.675, "num_input_tokens_seen": 47948864, "step": 82580 }, { "epoch": 12.300417039022937, "grad_norm": 1.4013921022415161, "learning_rate": 1.937508383752255e-05, "loss": 0.4434, "num_input_tokens_seen": 47951648, "step": 82585 }, { "epoch": 12.301161751563896, "grad_norm": 1.766470193862915, "learning_rate": 1.9371917780257657e-05, "loss": 0.6357, "num_input_tokens_seen": 47954560, "step": 82590 }, { "epoch": 12.301906464104855, "grad_norm": 2.33624005317688, "learning_rate": 1.9368751818073476e-05, "loss": 0.6034, "num_input_tokens_seen": 47957536, "step": 82595 }, { "epoch": 12.302651176645815, "grad_norm": 0.7337892651557922, "learning_rate": 1.9365585951023523e-05, "loss": 0.5259, "num_input_tokens_seen": 47960256, "step": 82600 }, { "epoch": 12.303395889186774, "grad_norm": 1.7289003133773804, "learning_rate": 1.9362420179161262e-05, "loss": 0.6026, "num_input_tokens_seen": 47963008, "step": 82605 }, { "epoch": 12.304140601727733, "grad_norm": 2.012004852294922, "learning_rate": 1.935925450254019e-05, "loss": 0.6532, "num_input_tokens_seen": 47965824, "step": 82610 }, { "epoch": 12.304885314268692, "grad_norm": 1.6435976028442383, "learning_rate": 1.935608892121378e-05, "loss": 0.593, "num_input_tokens_seen": 47968576, "step": 82615 }, { "epoch": 12.305630026809652, "grad_norm": 1.6003062725067139, "learning_rate": 1.9352923435235505e-05, "loss": 0.6797, "num_input_tokens_seen": 47971712, "step": 82620 }, { "epoch": 12.30637473935061, "grad_norm": 1.326083779335022, "learning_rate": 1.9349758044658854e-05, "loss": 0.5487, "num_input_tokens_seen": 47974464, "step": 82625 }, { "epoch": 12.30711945189157, "grad_norm": 1.978827953338623, "learning_rate": 1.934659274953729e-05, "loss": 0.6764, "num_input_tokens_seen": 47977408, "step": 82630 }, { "epoch": 12.307864164432528, "grad_norm": 1.869823694229126, "learning_rate": 1.9343427549924302e-05, "loss": 0.6969, "num_input_tokens_seen": 47980320, "step": 82635 }, { "epoch": 12.308608876973489, "grad_norm": 0.9655501246452332, "learning_rate": 1.9340262445873354e-05, "loss": 0.6529, "num_input_tokens_seen": 47983328, "step": 82640 }, { "epoch": 12.309353589514448, "grad_norm": 1.874637484550476, "learning_rate": 1.9337097437437924e-05, "loss": 0.5216, "num_input_tokens_seen": 47986240, "step": 82645 }, { "epoch": 12.310098302055406, "grad_norm": 1.3756400346755981, "learning_rate": 1.9333932524671478e-05, "loss": 0.5083, "num_input_tokens_seen": 47989312, "step": 82650 }, { "epoch": 12.310843014596365, "grad_norm": 1.0254161357879639, "learning_rate": 1.933076770762748e-05, "loss": 0.7276, "num_input_tokens_seen": 47992224, "step": 82655 }, { "epoch": 12.311587727137326, "grad_norm": 2.397759199142456, "learning_rate": 1.93276029863594e-05, "loss": 0.8698, "num_input_tokens_seen": 47995136, "step": 82660 }, { "epoch": 12.312332439678285, "grad_norm": 1.435232162475586, "learning_rate": 1.9324438360920696e-05, "loss": 0.4618, "num_input_tokens_seen": 47998016, "step": 82665 }, { "epoch": 12.313077152219243, "grad_norm": 2.0981838703155518, "learning_rate": 1.9321273831364847e-05, "loss": 0.6392, "num_input_tokens_seen": 48000864, "step": 82670 }, { "epoch": 12.313821864760202, "grad_norm": 1.1941043138504028, "learning_rate": 1.9318109397745295e-05, "loss": 0.6022, "num_input_tokens_seen": 48004000, "step": 82675 }, { "epoch": 12.314566577301163, "grad_norm": 2.315551280975342, "learning_rate": 1.9314945060115517e-05, "loss": 0.7461, "num_input_tokens_seen": 48006848, "step": 82680 }, { "epoch": 12.315311289842121, "grad_norm": 1.5989667177200317, "learning_rate": 1.9311780818528966e-05, "loss": 0.694, "num_input_tokens_seen": 48009728, "step": 82685 }, { "epoch": 12.31605600238308, "grad_norm": 1.441907286643982, "learning_rate": 1.93086166730391e-05, "loss": 0.6772, "num_input_tokens_seen": 48012480, "step": 82690 }, { "epoch": 12.316800714924039, "grad_norm": 1.5843207836151123, "learning_rate": 1.9305452623699367e-05, "loss": 0.5251, "num_input_tokens_seen": 48015296, "step": 82695 }, { "epoch": 12.317545427465, "grad_norm": 1.1818569898605347, "learning_rate": 1.9302288670563215e-05, "loss": 0.5307, "num_input_tokens_seen": 48018048, "step": 82700 }, { "epoch": 12.318290140005958, "grad_norm": 1.6116483211517334, "learning_rate": 1.9299124813684117e-05, "loss": 0.9858, "num_input_tokens_seen": 48020928, "step": 82705 }, { "epoch": 12.319034852546917, "grad_norm": 1.3193695545196533, "learning_rate": 1.92959610531155e-05, "loss": 0.5715, "num_input_tokens_seen": 48023616, "step": 82710 }, { "epoch": 12.319779565087876, "grad_norm": 1.6325407028198242, "learning_rate": 1.9292797388910832e-05, "loss": 0.8144, "num_input_tokens_seen": 48026656, "step": 82715 }, { "epoch": 12.320524277628834, "grad_norm": 1.0339919328689575, "learning_rate": 1.928963382112355e-05, "loss": 0.5186, "num_input_tokens_seen": 48029440, "step": 82720 }, { "epoch": 12.321268990169795, "grad_norm": 1.5026301145553589, "learning_rate": 1.9286470349807108e-05, "loss": 0.6443, "num_input_tokens_seen": 48032512, "step": 82725 }, { "epoch": 12.322013702710754, "grad_norm": 1.7408289909362793, "learning_rate": 1.9283306975014935e-05, "loss": 0.4558, "num_input_tokens_seen": 48035392, "step": 82730 }, { "epoch": 12.322758415251712, "grad_norm": 1.1205854415893555, "learning_rate": 1.9280143696800473e-05, "loss": 0.6161, "num_input_tokens_seen": 48038336, "step": 82735 }, { "epoch": 12.323503127792671, "grad_norm": 2.2404630184173584, "learning_rate": 1.9276980515217183e-05, "loss": 0.6498, "num_input_tokens_seen": 48041600, "step": 82740 }, { "epoch": 12.324247840333632, "grad_norm": 0.9162212610244751, "learning_rate": 1.927381743031848e-05, "loss": 0.6829, "num_input_tokens_seen": 48044448, "step": 82745 }, { "epoch": 12.32499255287459, "grad_norm": 1.020107626914978, "learning_rate": 1.927065444215782e-05, "loss": 0.4093, "num_input_tokens_seen": 48047168, "step": 82750 }, { "epoch": 12.32573726541555, "grad_norm": 2.6282668113708496, "learning_rate": 1.9267491550788626e-05, "loss": 0.6292, "num_input_tokens_seen": 48050016, "step": 82755 }, { "epoch": 12.326481977956508, "grad_norm": 1.8933062553405762, "learning_rate": 1.926432875626434e-05, "loss": 0.6894, "num_input_tokens_seen": 48052992, "step": 82760 }, { "epoch": 12.327226690497469, "grad_norm": 1.205168604850769, "learning_rate": 1.926116605863838e-05, "loss": 0.6078, "num_input_tokens_seen": 48055776, "step": 82765 }, { "epoch": 12.327971403038427, "grad_norm": 1.124545931816101, "learning_rate": 1.9258003457964198e-05, "loss": 0.5703, "num_input_tokens_seen": 48058720, "step": 82770 }, { "epoch": 12.328716115579386, "grad_norm": 1.3470721244812012, "learning_rate": 1.925484095429521e-05, "loss": 0.3311, "num_input_tokens_seen": 48061600, "step": 82775 }, { "epoch": 12.329460828120345, "grad_norm": 1.0427950620651245, "learning_rate": 1.9251678547684836e-05, "loss": 0.6286, "num_input_tokens_seen": 48064352, "step": 82780 }, { "epoch": 12.330205540661305, "grad_norm": 2.200892925262451, "learning_rate": 1.924851623818652e-05, "loss": 0.6046, "num_input_tokens_seen": 48067040, "step": 82785 }, { "epoch": 12.330950253202264, "grad_norm": 1.1773626804351807, "learning_rate": 1.9245354025853673e-05, "loss": 0.6345, "num_input_tokens_seen": 48070208, "step": 82790 }, { "epoch": 12.331694965743223, "grad_norm": 1.8476929664611816, "learning_rate": 1.9242191910739727e-05, "loss": 0.5952, "num_input_tokens_seen": 48073280, "step": 82795 }, { "epoch": 12.332439678284182, "grad_norm": 3.0573384761810303, "learning_rate": 1.9239029892898083e-05, "loss": 0.5727, "num_input_tokens_seen": 48076096, "step": 82800 }, { "epoch": 12.333184390825142, "grad_norm": 1.417218565940857, "learning_rate": 1.9235867972382188e-05, "loss": 0.5705, "num_input_tokens_seen": 48078944, "step": 82805 }, { "epoch": 12.333929103366101, "grad_norm": 1.7568542957305908, "learning_rate": 1.9232706149245443e-05, "loss": 0.7076, "num_input_tokens_seen": 48081952, "step": 82810 }, { "epoch": 12.33467381590706, "grad_norm": 2.189199686050415, "learning_rate": 1.9229544423541254e-05, "loss": 0.5299, "num_input_tokens_seen": 48084768, "step": 82815 }, { "epoch": 12.335418528448018, "grad_norm": 1.5834976434707642, "learning_rate": 1.922638279532306e-05, "loss": 0.5411, "num_input_tokens_seen": 48087584, "step": 82820 }, { "epoch": 12.336163240988979, "grad_norm": 1.034375786781311, "learning_rate": 1.9223221264644253e-05, "loss": 0.5777, "num_input_tokens_seen": 48090656, "step": 82825 }, { "epoch": 12.336907953529938, "grad_norm": 1.233601450920105, "learning_rate": 1.922005983155826e-05, "loss": 0.5378, "num_input_tokens_seen": 48093696, "step": 82830 }, { "epoch": 12.337652666070897, "grad_norm": 1.3075966835021973, "learning_rate": 1.921689849611847e-05, "loss": 0.5939, "num_input_tokens_seen": 48096448, "step": 82835 }, { "epoch": 12.338397378611855, "grad_norm": 1.4318830966949463, "learning_rate": 1.921373725837831e-05, "loss": 0.7282, "num_input_tokens_seen": 48099712, "step": 82840 }, { "epoch": 12.339142091152816, "grad_norm": 1.4609078168869019, "learning_rate": 1.9210576118391177e-05, "loss": 0.4824, "num_input_tokens_seen": 48102592, "step": 82845 }, { "epoch": 12.339886803693775, "grad_norm": 1.4059354066848755, "learning_rate": 1.920741507621048e-05, "loss": 0.6806, "num_input_tokens_seen": 48105632, "step": 82850 }, { "epoch": 12.340631516234733, "grad_norm": 0.976235032081604, "learning_rate": 1.9204254131889612e-05, "loss": 0.4934, "num_input_tokens_seen": 48108768, "step": 82855 }, { "epoch": 12.341376228775692, "grad_norm": 1.2188149690628052, "learning_rate": 1.920109328548198e-05, "loss": 0.6375, "num_input_tokens_seen": 48111872, "step": 82860 }, { "epoch": 12.342120941316653, "grad_norm": 1.1895933151245117, "learning_rate": 1.919793253704099e-05, "loss": 0.5133, "num_input_tokens_seen": 48114976, "step": 82865 }, { "epoch": 12.342865653857611, "grad_norm": 1.5890520811080933, "learning_rate": 1.9194771886620023e-05, "loss": 0.5988, "num_input_tokens_seen": 48117728, "step": 82870 }, { "epoch": 12.34361036639857, "grad_norm": 2.183450698852539, "learning_rate": 1.919161133427249e-05, "loss": 0.653, "num_input_tokens_seen": 48120576, "step": 82875 }, { "epoch": 12.344355078939529, "grad_norm": 1.3627777099609375, "learning_rate": 1.918845088005178e-05, "loss": 0.6434, "num_input_tokens_seen": 48123456, "step": 82880 }, { "epoch": 12.34509979148049, "grad_norm": 1.9112073183059692, "learning_rate": 1.918529052401129e-05, "loss": 0.5809, "num_input_tokens_seen": 48126400, "step": 82885 }, { "epoch": 12.345844504021448, "grad_norm": 1.101853609085083, "learning_rate": 1.9182130266204396e-05, "loss": 0.4941, "num_input_tokens_seen": 48129280, "step": 82890 }, { "epoch": 12.346589216562407, "grad_norm": 1.3901151418685913, "learning_rate": 1.9178970106684506e-05, "loss": 0.527, "num_input_tokens_seen": 48131968, "step": 82895 }, { "epoch": 12.347333929103366, "grad_norm": 1.073232889175415, "learning_rate": 1.9175810045505006e-05, "loss": 0.6332, "num_input_tokens_seen": 48134784, "step": 82900 }, { "epoch": 12.348078641644324, "grad_norm": 1.1844427585601807, "learning_rate": 1.917265008271926e-05, "loss": 0.5032, "num_input_tokens_seen": 48137600, "step": 82905 }, { "epoch": 12.348823354185285, "grad_norm": 1.5272057056427002, "learning_rate": 1.916949021838068e-05, "loss": 0.5933, "num_input_tokens_seen": 48140288, "step": 82910 }, { "epoch": 12.349568066726244, "grad_norm": 1.5998116731643677, "learning_rate": 1.916633045254263e-05, "loss": 0.5428, "num_input_tokens_seen": 48143424, "step": 82915 }, { "epoch": 12.350312779267203, "grad_norm": 1.3562935590744019, "learning_rate": 1.9163170785258507e-05, "loss": 0.567, "num_input_tokens_seen": 48146656, "step": 82920 }, { "epoch": 12.351057491808161, "grad_norm": 1.3914525508880615, "learning_rate": 1.916001121658167e-05, "loss": 0.4746, "num_input_tokens_seen": 48149472, "step": 82925 }, { "epoch": 12.351802204349122, "grad_norm": 2.271366596221924, "learning_rate": 1.9156851746565514e-05, "loss": 0.5975, "num_input_tokens_seen": 48152576, "step": 82930 }, { "epoch": 12.35254691689008, "grad_norm": 1.6509851217269897, "learning_rate": 1.9153692375263413e-05, "loss": 0.5582, "num_input_tokens_seen": 48155648, "step": 82935 }, { "epoch": 12.35329162943104, "grad_norm": 1.0549489259719849, "learning_rate": 1.9150533102728728e-05, "loss": 0.576, "num_input_tokens_seen": 48158752, "step": 82940 }, { "epoch": 12.354036341971998, "grad_norm": 0.9476318955421448, "learning_rate": 1.914737392901485e-05, "loss": 0.5955, "num_input_tokens_seen": 48161568, "step": 82945 }, { "epoch": 12.354781054512959, "grad_norm": 2.8427438735961914, "learning_rate": 1.9144214854175136e-05, "loss": 0.6648, "num_input_tokens_seen": 48164352, "step": 82950 }, { "epoch": 12.355525767053917, "grad_norm": 1.9316866397857666, "learning_rate": 1.9141055878262963e-05, "loss": 0.6305, "num_input_tokens_seen": 48167328, "step": 82955 }, { "epoch": 12.356270479594876, "grad_norm": 1.09422767162323, "learning_rate": 1.913789700133169e-05, "loss": 0.6536, "num_input_tokens_seen": 48170432, "step": 82960 }, { "epoch": 12.357015192135835, "grad_norm": 2.7659361362457275, "learning_rate": 1.9134738223434697e-05, "loss": 0.5931, "num_input_tokens_seen": 48173408, "step": 82965 }, { "epoch": 12.357759904676795, "grad_norm": 1.0020575523376465, "learning_rate": 1.913157954462533e-05, "loss": 0.5471, "num_input_tokens_seen": 48176512, "step": 82970 }, { "epoch": 12.358504617217754, "grad_norm": 1.160705327987671, "learning_rate": 1.9128420964956972e-05, "loss": 0.6474, "num_input_tokens_seen": 48179552, "step": 82975 }, { "epoch": 12.359249329758713, "grad_norm": 1.6333582401275635, "learning_rate": 1.912526248448298e-05, "loss": 0.5432, "num_input_tokens_seen": 48182368, "step": 82980 }, { "epoch": 12.359994042299672, "grad_norm": 1.6721713542938232, "learning_rate": 1.9122104103256693e-05, "loss": 0.5594, "num_input_tokens_seen": 48185504, "step": 82985 }, { "epoch": 12.360738754840632, "grad_norm": 2.1643013954162598, "learning_rate": 1.9118945821331495e-05, "loss": 0.6679, "num_input_tokens_seen": 48188256, "step": 82990 }, { "epoch": 12.361483467381591, "grad_norm": 2.6105599403381348, "learning_rate": 1.9115787638760717e-05, "loss": 0.5596, "num_input_tokens_seen": 48190912, "step": 82995 }, { "epoch": 12.36222817992255, "grad_norm": 1.8395612239837646, "learning_rate": 1.911262955559774e-05, "loss": 0.6587, "num_input_tokens_seen": 48193696, "step": 83000 }, { "epoch": 12.362972892463509, "grad_norm": 1.6216939687728882, "learning_rate": 1.910947157189589e-05, "loss": 0.5995, "num_input_tokens_seen": 48196544, "step": 83005 }, { "epoch": 12.363717605004469, "grad_norm": 1.1812013387680054, "learning_rate": 1.9106313687708543e-05, "loss": 0.5688, "num_input_tokens_seen": 48199264, "step": 83010 }, { "epoch": 12.364462317545428, "grad_norm": 1.7633053064346313, "learning_rate": 1.9103155903089036e-05, "loss": 0.6189, "num_input_tokens_seen": 48202016, "step": 83015 }, { "epoch": 12.365207030086387, "grad_norm": 1.1717162132263184, "learning_rate": 1.9099998218090707e-05, "loss": 0.4986, "num_input_tokens_seen": 48205024, "step": 83020 }, { "epoch": 12.365951742627345, "grad_norm": 1.8987468481063843, "learning_rate": 1.9096840632766923e-05, "loss": 0.7141, "num_input_tokens_seen": 48208192, "step": 83025 }, { "epoch": 12.366696455168306, "grad_norm": 1.652745008468628, "learning_rate": 1.9093683147171002e-05, "loss": 0.6916, "num_input_tokens_seen": 48210720, "step": 83030 }, { "epoch": 12.367441167709265, "grad_norm": 1.404241919517517, "learning_rate": 1.9090525761356315e-05, "loss": 0.8324, "num_input_tokens_seen": 48213504, "step": 83035 }, { "epoch": 12.368185880250223, "grad_norm": 1.7662668228149414, "learning_rate": 1.9087368475376176e-05, "loss": 0.5293, "num_input_tokens_seen": 48216320, "step": 83040 }, { "epoch": 12.368930592791182, "grad_norm": 0.9927001595497131, "learning_rate": 1.908421128928395e-05, "loss": 0.4765, "num_input_tokens_seen": 48219360, "step": 83045 }, { "epoch": 12.36967530533214, "grad_norm": 1.8180954456329346, "learning_rate": 1.9081054203132955e-05, "loss": 0.5236, "num_input_tokens_seen": 48222240, "step": 83050 }, { "epoch": 12.370420017873101, "grad_norm": 1.1052250862121582, "learning_rate": 1.9077897216976537e-05, "loss": 0.5356, "num_input_tokens_seen": 48225216, "step": 83055 }, { "epoch": 12.37116473041406, "grad_norm": 1.4070827960968018, "learning_rate": 1.907474033086803e-05, "loss": 0.6726, "num_input_tokens_seen": 48227840, "step": 83060 }, { "epoch": 12.371909442955019, "grad_norm": 0.9135267734527588, "learning_rate": 1.907158354486075e-05, "loss": 0.6336, "num_input_tokens_seen": 48230656, "step": 83065 }, { "epoch": 12.37265415549598, "grad_norm": 1.6869996786117554, "learning_rate": 1.9068426859008055e-05, "loss": 0.708, "num_input_tokens_seen": 48233408, "step": 83070 }, { "epoch": 12.373398868036938, "grad_norm": 1.9821443557739258, "learning_rate": 1.9065270273363244e-05, "loss": 0.6857, "num_input_tokens_seen": 48236416, "step": 83075 }, { "epoch": 12.374143580577897, "grad_norm": 1.5101784467697144, "learning_rate": 1.9062113787979674e-05, "loss": 0.6476, "num_input_tokens_seen": 48239200, "step": 83080 }, { "epoch": 12.374888293118856, "grad_norm": 1.3910523653030396, "learning_rate": 1.905895740291065e-05, "loss": 0.5011, "num_input_tokens_seen": 48241952, "step": 83085 }, { "epoch": 12.375633005659815, "grad_norm": 1.8503239154815674, "learning_rate": 1.9055801118209507e-05, "loss": 0.5728, "num_input_tokens_seen": 48244864, "step": 83090 }, { "epoch": 12.376377718200775, "grad_norm": 0.8283455967903137, "learning_rate": 1.9052644933929564e-05, "loss": 0.5431, "num_input_tokens_seen": 48247680, "step": 83095 }, { "epoch": 12.377122430741734, "grad_norm": 1.621392846107483, "learning_rate": 1.9049488850124128e-05, "loss": 0.7606, "num_input_tokens_seen": 48250912, "step": 83100 }, { "epoch": 12.377867143282693, "grad_norm": 1.0960737466812134, "learning_rate": 1.9046332866846544e-05, "loss": 0.4955, "num_input_tokens_seen": 48253696, "step": 83105 }, { "epoch": 12.378611855823651, "grad_norm": 1.0143330097198486, "learning_rate": 1.90431769841501e-05, "loss": 0.6898, "num_input_tokens_seen": 48256320, "step": 83110 }, { "epoch": 12.379356568364612, "grad_norm": 3.154447317123413, "learning_rate": 1.904002120208814e-05, "loss": 0.6624, "num_input_tokens_seen": 48259360, "step": 83115 }, { "epoch": 12.38010128090557, "grad_norm": 1.098853349685669, "learning_rate": 1.903686552071396e-05, "loss": 0.4091, "num_input_tokens_seen": 48262336, "step": 83120 }, { "epoch": 12.38084599344653, "grad_norm": 1.407607078552246, "learning_rate": 1.903370994008088e-05, "loss": 0.3798, "num_input_tokens_seen": 48265248, "step": 83125 }, { "epoch": 12.381590705987488, "grad_norm": 1.3420922756195068, "learning_rate": 1.9030554460242194e-05, "loss": 0.6571, "num_input_tokens_seen": 48268128, "step": 83130 }, { "epoch": 12.382335418528449, "grad_norm": 0.7992682456970215, "learning_rate": 1.902739908125124e-05, "loss": 0.6303, "num_input_tokens_seen": 48271008, "step": 83135 }, { "epoch": 12.383080131069407, "grad_norm": 1.0405806303024292, "learning_rate": 1.9024243803161304e-05, "loss": 0.6111, "num_input_tokens_seen": 48274368, "step": 83140 }, { "epoch": 12.383824843610366, "grad_norm": 2.101832866668701, "learning_rate": 1.9021088626025684e-05, "loss": 0.6446, "num_input_tokens_seen": 48277088, "step": 83145 }, { "epoch": 12.384569556151325, "grad_norm": 1.5942164659500122, "learning_rate": 1.9017933549897706e-05, "loss": 0.616, "num_input_tokens_seen": 48280000, "step": 83150 }, { "epoch": 12.385314268692285, "grad_norm": 3.0644402503967285, "learning_rate": 1.901477857483066e-05, "loss": 0.7309, "num_input_tokens_seen": 48282752, "step": 83155 }, { "epoch": 12.386058981233244, "grad_norm": 2.284703493118286, "learning_rate": 1.9011623700877845e-05, "loss": 0.548, "num_input_tokens_seen": 48285728, "step": 83160 }, { "epoch": 12.386803693774203, "grad_norm": 1.9857637882232666, "learning_rate": 1.9008468928092555e-05, "loss": 0.568, "num_input_tokens_seen": 48289408, "step": 83165 }, { "epoch": 12.387548406315162, "grad_norm": 1.236213207244873, "learning_rate": 1.9005314256528103e-05, "loss": 0.4768, "num_input_tokens_seen": 48292576, "step": 83170 }, { "epoch": 12.388293118856122, "grad_norm": 2.692121744155884, "learning_rate": 1.9002159686237776e-05, "loss": 0.8989, "num_input_tokens_seen": 48295872, "step": 83175 }, { "epoch": 12.389037831397081, "grad_norm": 1.6676234006881714, "learning_rate": 1.8999005217274857e-05, "loss": 0.694, "num_input_tokens_seen": 48298336, "step": 83180 }, { "epoch": 12.38978254393804, "grad_norm": 1.654592514038086, "learning_rate": 1.8995850849692646e-05, "loss": 0.6015, "num_input_tokens_seen": 48300960, "step": 83185 }, { "epoch": 12.390527256478999, "grad_norm": 0.6035280823707581, "learning_rate": 1.8992696583544434e-05, "loss": 0.4733, "num_input_tokens_seen": 48303616, "step": 83190 }, { "epoch": 12.391271969019959, "grad_norm": 0.973148763179779, "learning_rate": 1.898954241888351e-05, "loss": 0.5437, "num_input_tokens_seen": 48306656, "step": 83195 }, { "epoch": 12.392016681560918, "grad_norm": 1.2672685384750366, "learning_rate": 1.8986388355763147e-05, "loss": 0.6441, "num_input_tokens_seen": 48309696, "step": 83200 }, { "epoch": 12.392761394101877, "grad_norm": 1.6454877853393555, "learning_rate": 1.8983234394236657e-05, "loss": 0.6765, "num_input_tokens_seen": 48312608, "step": 83205 }, { "epoch": 12.393506106642835, "grad_norm": 2.4975221157073975, "learning_rate": 1.8980080534357298e-05, "loss": 0.6407, "num_input_tokens_seen": 48315456, "step": 83210 }, { "epoch": 12.394250819183796, "grad_norm": 1.9581382274627686, "learning_rate": 1.8976926776178366e-05, "loss": 0.6653, "num_input_tokens_seen": 48318720, "step": 83215 }, { "epoch": 12.394995531724755, "grad_norm": 1.3706313371658325, "learning_rate": 1.8973773119753132e-05, "loss": 0.6307, "num_input_tokens_seen": 48321792, "step": 83220 }, { "epoch": 12.395740244265713, "grad_norm": 1.955476999282837, "learning_rate": 1.8970619565134866e-05, "loss": 0.5662, "num_input_tokens_seen": 48324544, "step": 83225 }, { "epoch": 12.396484956806672, "grad_norm": 1.0988937616348267, "learning_rate": 1.896746611237687e-05, "loss": 0.5769, "num_input_tokens_seen": 48327520, "step": 83230 }, { "epoch": 12.397229669347631, "grad_norm": 1.0267523527145386, "learning_rate": 1.8964312761532388e-05, "loss": 0.5779, "num_input_tokens_seen": 48330080, "step": 83235 }, { "epoch": 12.397974381888591, "grad_norm": 1.367073655128479, "learning_rate": 1.896115951265472e-05, "loss": 0.4705, "num_input_tokens_seen": 48333184, "step": 83240 }, { "epoch": 12.39871909442955, "grad_norm": 1.2886452674865723, "learning_rate": 1.8958006365797118e-05, "loss": 0.8237, "num_input_tokens_seen": 48336160, "step": 83245 }, { "epoch": 12.399463806970509, "grad_norm": 2.746795177459717, "learning_rate": 1.8954853321012865e-05, "loss": 0.5857, "num_input_tokens_seen": 48339616, "step": 83250 }, { "epoch": 12.400208519511468, "grad_norm": 2.3546221256256104, "learning_rate": 1.8951700378355218e-05, "loss": 0.7437, "num_input_tokens_seen": 48342592, "step": 83255 }, { "epoch": 12.400953232052428, "grad_norm": 1.3594552278518677, "learning_rate": 1.8948547537877436e-05, "loss": 0.7279, "num_input_tokens_seen": 48345440, "step": 83260 }, { "epoch": 12.401697944593387, "grad_norm": 1.742384433746338, "learning_rate": 1.8945394799632804e-05, "loss": 0.6443, "num_input_tokens_seen": 48348448, "step": 83265 }, { "epoch": 12.402442657134346, "grad_norm": 1.745800495147705, "learning_rate": 1.8942242163674563e-05, "loss": 0.5929, "num_input_tokens_seen": 48351072, "step": 83270 }, { "epoch": 12.403187369675305, "grad_norm": 1.0406842231750488, "learning_rate": 1.8939089630055994e-05, "loss": 0.6341, "num_input_tokens_seen": 48353760, "step": 83275 }, { "epoch": 12.403932082216265, "grad_norm": 2.0170466899871826, "learning_rate": 1.8935937198830343e-05, "loss": 0.6084, "num_input_tokens_seen": 48356768, "step": 83280 }, { "epoch": 12.404676794757224, "grad_norm": 2.559095621109009, "learning_rate": 1.893278487005087e-05, "loss": 0.6035, "num_input_tokens_seen": 48359744, "step": 83285 }, { "epoch": 12.405421507298183, "grad_norm": 1.9833866357803345, "learning_rate": 1.8929632643770824e-05, "loss": 0.4906, "num_input_tokens_seen": 48362720, "step": 83290 }, { "epoch": 12.406166219839141, "grad_norm": 2.340756416320801, "learning_rate": 1.8926480520043472e-05, "loss": 0.5826, "num_input_tokens_seen": 48366016, "step": 83295 }, { "epoch": 12.406910932380102, "grad_norm": 1.281738519668579, "learning_rate": 1.892332849892206e-05, "loss": 0.5436, "num_input_tokens_seen": 48369056, "step": 83300 }, { "epoch": 12.40765564492106, "grad_norm": 1.7021185159683228, "learning_rate": 1.8920176580459827e-05, "loss": 0.6537, "num_input_tokens_seen": 48371968, "step": 83305 }, { "epoch": 12.40840035746202, "grad_norm": 1.0064643621444702, "learning_rate": 1.8917024764710043e-05, "loss": 0.5978, "num_input_tokens_seen": 48375008, "step": 83310 }, { "epoch": 12.409145070002978, "grad_norm": 1.1632832288742065, "learning_rate": 1.8913873051725935e-05, "loss": 0.7163, "num_input_tokens_seen": 48377792, "step": 83315 }, { "epoch": 12.409889782543939, "grad_norm": 1.649099588394165, "learning_rate": 1.8910721441560765e-05, "loss": 0.6605, "num_input_tokens_seen": 48380832, "step": 83320 }, { "epoch": 12.410634495084897, "grad_norm": 0.9733962416648865, "learning_rate": 1.8907569934267756e-05, "loss": 0.6122, "num_input_tokens_seen": 48383616, "step": 83325 }, { "epoch": 12.411379207625856, "grad_norm": 1.3540244102478027, "learning_rate": 1.890441852990017e-05, "loss": 0.8395, "num_input_tokens_seen": 48386112, "step": 83330 }, { "epoch": 12.412123920166815, "grad_norm": 1.168689489364624, "learning_rate": 1.890126722851124e-05, "loss": 0.7672, "num_input_tokens_seen": 48388896, "step": 83335 }, { "epoch": 12.412868632707776, "grad_norm": 1.3485829830169678, "learning_rate": 1.8898116030154185e-05, "loss": 0.5657, "num_input_tokens_seen": 48391616, "step": 83340 }, { "epoch": 12.413613345248734, "grad_norm": 1.6388170719146729, "learning_rate": 1.8894964934882274e-05, "loss": 0.5698, "num_input_tokens_seen": 48394400, "step": 83345 }, { "epoch": 12.414358057789693, "grad_norm": 1.1733694076538086, "learning_rate": 1.8891813942748717e-05, "loss": 0.579, "num_input_tokens_seen": 48397152, "step": 83350 }, { "epoch": 12.415102770330652, "grad_norm": 1.985368013381958, "learning_rate": 1.8888663053806765e-05, "loss": 0.6582, "num_input_tokens_seen": 48400000, "step": 83355 }, { "epoch": 12.415847482871612, "grad_norm": 1.510223150253296, "learning_rate": 1.8885512268109625e-05, "loss": 0.6991, "num_input_tokens_seen": 48402688, "step": 83360 }, { "epoch": 12.416592195412571, "grad_norm": 0.9460591673851013, "learning_rate": 1.8882361585710554e-05, "loss": 0.6227, "num_input_tokens_seen": 48405536, "step": 83365 }, { "epoch": 12.41733690795353, "grad_norm": 2.232841730117798, "learning_rate": 1.887921100666275e-05, "loss": 0.745, "num_input_tokens_seen": 48408448, "step": 83370 }, { "epoch": 12.418081620494489, "grad_norm": 1.327526330947876, "learning_rate": 1.8876060531019474e-05, "loss": 0.6255, "num_input_tokens_seen": 48411264, "step": 83375 }, { "epoch": 12.41882633303545, "grad_norm": 1.275809407234192, "learning_rate": 1.887291015883393e-05, "loss": 0.5996, "num_input_tokens_seen": 48414304, "step": 83380 }, { "epoch": 12.419571045576408, "grad_norm": 1.3410875797271729, "learning_rate": 1.8869759890159333e-05, "loss": 0.7372, "num_input_tokens_seen": 48417376, "step": 83385 }, { "epoch": 12.420315758117367, "grad_norm": 1.641373872756958, "learning_rate": 1.8866609725048918e-05, "loss": 0.3653, "num_input_tokens_seen": 48420288, "step": 83390 }, { "epoch": 12.421060470658325, "grad_norm": 1.7668179273605347, "learning_rate": 1.8863459663555885e-05, "loss": 0.6259, "num_input_tokens_seen": 48423296, "step": 83395 }, { "epoch": 12.421805183199286, "grad_norm": 1.8693102598190308, "learning_rate": 1.8860309705733477e-05, "loss": 0.5419, "num_input_tokens_seen": 48426560, "step": 83400 }, { "epoch": 12.422549895740245, "grad_norm": 1.552695870399475, "learning_rate": 1.8857159851634888e-05, "loss": 0.5738, "num_input_tokens_seen": 48429504, "step": 83405 }, { "epoch": 12.423294608281203, "grad_norm": 1.2193514108657837, "learning_rate": 1.885401010131335e-05, "loss": 0.6317, "num_input_tokens_seen": 48432288, "step": 83410 }, { "epoch": 12.424039320822162, "grad_norm": 2.2266674041748047, "learning_rate": 1.8850860454822056e-05, "loss": 0.6925, "num_input_tokens_seen": 48435072, "step": 83415 }, { "epoch": 12.424784033363121, "grad_norm": 1.3398457765579224, "learning_rate": 1.8847710912214233e-05, "loss": 0.411, "num_input_tokens_seen": 48437728, "step": 83420 }, { "epoch": 12.425528745904082, "grad_norm": 0.9278979301452637, "learning_rate": 1.8844561473543082e-05, "loss": 0.5855, "num_input_tokens_seen": 48440672, "step": 83425 }, { "epoch": 12.42627345844504, "grad_norm": 1.690307855606079, "learning_rate": 1.8841412138861797e-05, "loss": 0.668, "num_input_tokens_seen": 48443552, "step": 83430 }, { "epoch": 12.427018170985999, "grad_norm": 2.1327977180480957, "learning_rate": 1.8838262908223602e-05, "loss": 0.6394, "num_input_tokens_seen": 48446368, "step": 83435 }, { "epoch": 12.427762883526958, "grad_norm": 2.364372491836548, "learning_rate": 1.8835113781681686e-05, "loss": 0.6837, "num_input_tokens_seen": 48449248, "step": 83440 }, { "epoch": 12.428507596067918, "grad_norm": 1.5624034404754639, "learning_rate": 1.8831964759289265e-05, "loss": 0.3912, "num_input_tokens_seen": 48452320, "step": 83445 }, { "epoch": 12.429252308608877, "grad_norm": 2.408837080001831, "learning_rate": 1.8828815841099528e-05, "loss": 0.6365, "num_input_tokens_seen": 48455008, "step": 83450 }, { "epoch": 12.429997021149836, "grad_norm": 0.8168818354606628, "learning_rate": 1.882566702716568e-05, "loss": 0.7626, "num_input_tokens_seen": 48458080, "step": 83455 }, { "epoch": 12.430741733690795, "grad_norm": 1.7791078090667725, "learning_rate": 1.8822518317540913e-05, "loss": 0.5891, "num_input_tokens_seen": 48460928, "step": 83460 }, { "epoch": 12.431486446231755, "grad_norm": 1.808053731918335, "learning_rate": 1.8819369712278408e-05, "loss": 0.7715, "num_input_tokens_seen": 48464448, "step": 83465 }, { "epoch": 12.432231158772714, "grad_norm": 1.758975863456726, "learning_rate": 1.8816221211431382e-05, "loss": 0.6271, "num_input_tokens_seen": 48467264, "step": 83470 }, { "epoch": 12.432975871313673, "grad_norm": 1.121956467628479, "learning_rate": 1.8813072815053003e-05, "loss": 0.6808, "num_input_tokens_seen": 48470080, "step": 83475 }, { "epoch": 12.433720583854631, "grad_norm": 1.448242425918579, "learning_rate": 1.880992452319648e-05, "loss": 0.3088, "num_input_tokens_seen": 48473152, "step": 83480 }, { "epoch": 12.434465296395592, "grad_norm": 2.032696008682251, "learning_rate": 1.8806776335914986e-05, "loss": 0.7665, "num_input_tokens_seen": 48475840, "step": 83485 }, { "epoch": 12.43521000893655, "grad_norm": 1.5465829372406006, "learning_rate": 1.8803628253261717e-05, "loss": 0.5473, "num_input_tokens_seen": 48479104, "step": 83490 }, { "epoch": 12.43595472147751, "grad_norm": 1.4572951793670654, "learning_rate": 1.880048027528984e-05, "loss": 0.6332, "num_input_tokens_seen": 48482144, "step": 83495 }, { "epoch": 12.436699434018468, "grad_norm": 1.2908234596252441, "learning_rate": 1.879733240205256e-05, "loss": 0.6571, "num_input_tokens_seen": 48484992, "step": 83500 }, { "epoch": 12.437444146559429, "grad_norm": 0.8890805244445801, "learning_rate": 1.879418463360304e-05, "loss": 0.6487, "num_input_tokens_seen": 48488032, "step": 83505 }, { "epoch": 12.438188859100388, "grad_norm": 1.4819332361221313, "learning_rate": 1.8791036969994462e-05, "loss": 0.6748, "num_input_tokens_seen": 48491040, "step": 83510 }, { "epoch": 12.438933571641346, "grad_norm": 1.2063610553741455, "learning_rate": 1.8787889411280005e-05, "loss": 0.5092, "num_input_tokens_seen": 48493792, "step": 83515 }, { "epoch": 12.439678284182305, "grad_norm": 2.4800283908843994, "learning_rate": 1.8784741957512842e-05, "loss": 0.6475, "num_input_tokens_seen": 48496576, "step": 83520 }, { "epoch": 12.440422996723266, "grad_norm": 2.2474966049194336, "learning_rate": 1.878159460874615e-05, "loss": 0.8137, "num_input_tokens_seen": 48499296, "step": 83525 }, { "epoch": 12.441167709264224, "grad_norm": 1.644385576248169, "learning_rate": 1.8778447365033085e-05, "loss": 0.9503, "num_input_tokens_seen": 48501984, "step": 83530 }, { "epoch": 12.441912421805183, "grad_norm": 1.2878799438476562, "learning_rate": 1.877530022642684e-05, "loss": 0.4947, "num_input_tokens_seen": 48504800, "step": 83535 }, { "epoch": 12.442657134346142, "grad_norm": 1.7720705270767212, "learning_rate": 1.8772153192980578e-05, "loss": 0.5682, "num_input_tokens_seen": 48507520, "step": 83540 }, { "epoch": 12.443401846887102, "grad_norm": 2.852508068084717, "learning_rate": 1.8769006264747445e-05, "loss": 0.7251, "num_input_tokens_seen": 48510560, "step": 83545 }, { "epoch": 12.444146559428061, "grad_norm": 1.3102352619171143, "learning_rate": 1.8765859441780625e-05, "loss": 0.57, "num_input_tokens_seen": 48513440, "step": 83550 }, { "epoch": 12.44489127196902, "grad_norm": 1.149430513381958, "learning_rate": 1.8762712724133266e-05, "loss": 0.5015, "num_input_tokens_seen": 48516352, "step": 83555 }, { "epoch": 12.445635984509979, "grad_norm": 1.2926901578903198, "learning_rate": 1.8759566111858544e-05, "loss": 0.5889, "num_input_tokens_seen": 48519584, "step": 83560 }, { "epoch": 12.44638069705094, "grad_norm": 1.4516398906707764, "learning_rate": 1.87564196050096e-05, "loss": 0.6819, "num_input_tokens_seen": 48522240, "step": 83565 }, { "epoch": 12.447125409591898, "grad_norm": 1.6669931411743164, "learning_rate": 1.8753273203639614e-05, "loss": 0.557, "num_input_tokens_seen": 48525248, "step": 83570 }, { "epoch": 12.447870122132857, "grad_norm": 0.7434579133987427, "learning_rate": 1.875012690780172e-05, "loss": 0.5702, "num_input_tokens_seen": 48528032, "step": 83575 }, { "epoch": 12.448614834673815, "grad_norm": 1.3100851774215698, "learning_rate": 1.8746980717549088e-05, "loss": 0.5058, "num_input_tokens_seen": 48530848, "step": 83580 }, { "epoch": 12.449359547214776, "grad_norm": 1.575721025466919, "learning_rate": 1.8743834632934858e-05, "loss": 0.5119, "num_input_tokens_seen": 48533632, "step": 83585 }, { "epoch": 12.450104259755735, "grad_norm": 2.5069456100463867, "learning_rate": 1.8740688654012172e-05, "loss": 0.6178, "num_input_tokens_seen": 48536608, "step": 83590 }, { "epoch": 12.450848972296694, "grad_norm": 1.5691266059875488, "learning_rate": 1.8737542780834205e-05, "loss": 0.5491, "num_input_tokens_seen": 48539584, "step": 83595 }, { "epoch": 12.451593684837652, "grad_norm": 1.0723156929016113, "learning_rate": 1.8734397013454075e-05, "loss": 0.5931, "num_input_tokens_seen": 48542688, "step": 83600 }, { "epoch": 12.452338397378611, "grad_norm": 1.5834845304489136, "learning_rate": 1.873125135192495e-05, "loss": 0.5744, "num_input_tokens_seen": 48545760, "step": 83605 }, { "epoch": 12.453083109919572, "grad_norm": 2.6277008056640625, "learning_rate": 1.8728105796299954e-05, "loss": 0.6264, "num_input_tokens_seen": 48548608, "step": 83610 }, { "epoch": 12.45382782246053, "grad_norm": 1.0073418617248535, "learning_rate": 1.8724960346632247e-05, "loss": 0.6798, "num_input_tokens_seen": 48551424, "step": 83615 }, { "epoch": 12.454572535001489, "grad_norm": 1.7346093654632568, "learning_rate": 1.8721815002974954e-05, "loss": 0.5565, "num_input_tokens_seen": 48554176, "step": 83620 }, { "epoch": 12.455317247542448, "grad_norm": 2.0003645420074463, "learning_rate": 1.8718669765381207e-05, "loss": 0.675, "num_input_tokens_seen": 48556672, "step": 83625 }, { "epoch": 12.456061960083408, "grad_norm": 1.0096490383148193, "learning_rate": 1.8715524633904157e-05, "loss": 0.4725, "num_input_tokens_seen": 48559520, "step": 83630 }, { "epoch": 12.456806672624367, "grad_norm": 0.8649212121963501, "learning_rate": 1.8712379608596926e-05, "loss": 0.6396, "num_input_tokens_seen": 48562592, "step": 83635 }, { "epoch": 12.457551385165326, "grad_norm": 2.00303316116333, "learning_rate": 1.8709234689512656e-05, "loss": 0.7825, "num_input_tokens_seen": 48565376, "step": 83640 }, { "epoch": 12.458296097706285, "grad_norm": 1.6508758068084717, "learning_rate": 1.8706089876704468e-05, "loss": 0.5412, "num_input_tokens_seen": 48568192, "step": 83645 }, { "epoch": 12.459040810247245, "grad_norm": 1.2096223831176758, "learning_rate": 1.8702945170225504e-05, "loss": 0.5592, "num_input_tokens_seen": 48571072, "step": 83650 }, { "epoch": 12.459785522788204, "grad_norm": 1.191487431526184, "learning_rate": 1.8699800570128868e-05, "loss": 0.5538, "num_input_tokens_seen": 48573888, "step": 83655 }, { "epoch": 12.460530235329163, "grad_norm": 1.3358505964279175, "learning_rate": 1.8696656076467705e-05, "loss": 0.4773, "num_input_tokens_seen": 48576768, "step": 83660 }, { "epoch": 12.461274947870121, "grad_norm": 2.0380282402038574, "learning_rate": 1.8693511689295138e-05, "loss": 0.583, "num_input_tokens_seen": 48579712, "step": 83665 }, { "epoch": 12.462019660411082, "grad_norm": 1.2720664739608765, "learning_rate": 1.8690367408664265e-05, "loss": 0.4581, "num_input_tokens_seen": 48583072, "step": 83670 }, { "epoch": 12.46276437295204, "grad_norm": 1.575880527496338, "learning_rate": 1.8687223234628237e-05, "loss": 0.4303, "num_input_tokens_seen": 48585728, "step": 83675 }, { "epoch": 12.463509085493, "grad_norm": 1.1516004800796509, "learning_rate": 1.868407916724015e-05, "loss": 0.6274, "num_input_tokens_seen": 48588256, "step": 83680 }, { "epoch": 12.464253798033958, "grad_norm": 0.9675431847572327, "learning_rate": 1.868093520655313e-05, "loss": 0.5381, "num_input_tokens_seen": 48591168, "step": 83685 }, { "epoch": 12.464998510574919, "grad_norm": 1.8759241104125977, "learning_rate": 1.8677791352620278e-05, "loss": 0.5347, "num_input_tokens_seen": 48594112, "step": 83690 }, { "epoch": 12.465743223115878, "grad_norm": 1.0360405445098877, "learning_rate": 1.8674647605494727e-05, "loss": 0.5566, "num_input_tokens_seen": 48597024, "step": 83695 }, { "epoch": 12.466487935656836, "grad_norm": 1.2812660932540894, "learning_rate": 1.8671503965229572e-05, "loss": 0.4655, "num_input_tokens_seen": 48600288, "step": 83700 }, { "epoch": 12.467232648197795, "grad_norm": 1.136616826057434, "learning_rate": 1.8668360431877918e-05, "loss": 0.5556, "num_input_tokens_seen": 48603648, "step": 83705 }, { "epoch": 12.467977360738756, "grad_norm": 1.0137306451797485, "learning_rate": 1.8665217005492892e-05, "loss": 0.5061, "num_input_tokens_seen": 48606528, "step": 83710 }, { "epoch": 12.468722073279714, "grad_norm": 1.509393572807312, "learning_rate": 1.8662073686127575e-05, "loss": 0.4607, "num_input_tokens_seen": 48609376, "step": 83715 }, { "epoch": 12.469466785820673, "grad_norm": 0.7952808141708374, "learning_rate": 1.865893047383509e-05, "loss": 0.6366, "num_input_tokens_seen": 48612320, "step": 83720 }, { "epoch": 12.470211498361632, "grad_norm": 2.776270866394043, "learning_rate": 1.865578736866852e-05, "loss": 0.7193, "num_input_tokens_seen": 48615072, "step": 83725 }, { "epoch": 12.470956210902592, "grad_norm": 1.5637125968933105, "learning_rate": 1.8652644370680986e-05, "loss": 0.6463, "num_input_tokens_seen": 48618304, "step": 83730 }, { "epoch": 12.471700923443551, "grad_norm": 2.1001603603363037, "learning_rate": 1.8649501479925562e-05, "loss": 0.5084, "num_input_tokens_seen": 48621152, "step": 83735 }, { "epoch": 12.47244563598451, "grad_norm": 2.278486490249634, "learning_rate": 1.8646358696455365e-05, "loss": 0.4453, "num_input_tokens_seen": 48624224, "step": 83740 }, { "epoch": 12.473190348525469, "grad_norm": 3.4434380531311035, "learning_rate": 1.8643216020323483e-05, "loss": 0.8526, "num_input_tokens_seen": 48627296, "step": 83745 }, { "epoch": 12.473935061066427, "grad_norm": 0.7627968192100525, "learning_rate": 1.8640073451583003e-05, "loss": 0.6604, "num_input_tokens_seen": 48630336, "step": 83750 }, { "epoch": 12.474679773607388, "grad_norm": 2.28192138671875, "learning_rate": 1.8636930990287015e-05, "loss": 0.5469, "num_input_tokens_seen": 48633120, "step": 83755 }, { "epoch": 12.475424486148347, "grad_norm": 2.4338924884796143, "learning_rate": 1.8633788636488605e-05, "loss": 0.5413, "num_input_tokens_seen": 48635872, "step": 83760 }, { "epoch": 12.476169198689306, "grad_norm": 2.2939858436584473, "learning_rate": 1.8630646390240876e-05, "loss": 0.6022, "num_input_tokens_seen": 48638720, "step": 83765 }, { "epoch": 12.476913911230264, "grad_norm": 0.6427319049835205, "learning_rate": 1.8627504251596895e-05, "loss": 0.5693, "num_input_tokens_seen": 48641600, "step": 83770 }, { "epoch": 12.477658623771225, "grad_norm": 1.646214485168457, "learning_rate": 1.862436222060976e-05, "loss": 0.6001, "num_input_tokens_seen": 48644480, "step": 83775 }, { "epoch": 12.478403336312184, "grad_norm": 0.9168964624404907, "learning_rate": 1.8621220297332544e-05, "loss": 0.4768, "num_input_tokens_seen": 48647328, "step": 83780 }, { "epoch": 12.479148048853142, "grad_norm": 1.454781413078308, "learning_rate": 1.8618078481818324e-05, "loss": 0.6413, "num_input_tokens_seen": 48650368, "step": 83785 }, { "epoch": 12.479892761394101, "grad_norm": 2.903369188308716, "learning_rate": 1.861493677412019e-05, "loss": 0.6991, "num_input_tokens_seen": 48653088, "step": 83790 }, { "epoch": 12.480637473935062, "grad_norm": 1.24839186668396, "learning_rate": 1.8611795174291198e-05, "loss": 0.5557, "num_input_tokens_seen": 48656096, "step": 83795 }, { "epoch": 12.48138218647602, "grad_norm": 1.3584212064743042, "learning_rate": 1.8608653682384442e-05, "loss": 0.6662, "num_input_tokens_seen": 48658976, "step": 83800 }, { "epoch": 12.48212689901698, "grad_norm": 3.252657651901245, "learning_rate": 1.8605512298452977e-05, "loss": 0.5954, "num_input_tokens_seen": 48661920, "step": 83805 }, { "epoch": 12.482871611557938, "grad_norm": 1.3930915594100952, "learning_rate": 1.8602371022549895e-05, "loss": 0.6381, "num_input_tokens_seen": 48664864, "step": 83810 }, { "epoch": 12.483616324098898, "grad_norm": 1.6614902019500732, "learning_rate": 1.8599229854728244e-05, "loss": 0.7027, "num_input_tokens_seen": 48667520, "step": 83815 }, { "epoch": 12.484361036639857, "grad_norm": 1.10092031955719, "learning_rate": 1.8596088795041106e-05, "loss": 0.7128, "num_input_tokens_seen": 48670336, "step": 83820 }, { "epoch": 12.485105749180816, "grad_norm": 1.0081474781036377, "learning_rate": 1.859294784354154e-05, "loss": 0.6586, "num_input_tokens_seen": 48673184, "step": 83825 }, { "epoch": 12.485850461721775, "grad_norm": 2.0801515579223633, "learning_rate": 1.8589807000282592e-05, "loss": 0.7231, "num_input_tokens_seen": 48676000, "step": 83830 }, { "epoch": 12.486595174262735, "grad_norm": 2.355756998062134, "learning_rate": 1.858666626531736e-05, "loss": 0.5469, "num_input_tokens_seen": 48678880, "step": 83835 }, { "epoch": 12.487339886803694, "grad_norm": 1.3781133890151978, "learning_rate": 1.8583525638698873e-05, "loss": 0.7378, "num_input_tokens_seen": 48681664, "step": 83840 }, { "epoch": 12.488084599344653, "grad_norm": 1.1750355958938599, "learning_rate": 1.85803851204802e-05, "loss": 0.6125, "num_input_tokens_seen": 48684320, "step": 83845 }, { "epoch": 12.488829311885612, "grad_norm": 1.3363884687423706, "learning_rate": 1.857724471071439e-05, "loss": 0.5831, "num_input_tokens_seen": 48687104, "step": 83850 }, { "epoch": 12.489574024426572, "grad_norm": 1.4538742303848267, "learning_rate": 1.8574104409454514e-05, "loss": 0.4581, "num_input_tokens_seen": 48690048, "step": 83855 }, { "epoch": 12.49031873696753, "grad_norm": 1.873447299003601, "learning_rate": 1.857096421675361e-05, "loss": 0.712, "num_input_tokens_seen": 48692992, "step": 83860 }, { "epoch": 12.49106344950849, "grad_norm": 1.7307640314102173, "learning_rate": 1.8567824132664724e-05, "loss": 0.5996, "num_input_tokens_seen": 48695968, "step": 83865 }, { "epoch": 12.491808162049448, "grad_norm": 1.4096146821975708, "learning_rate": 1.856468415724092e-05, "loss": 0.7332, "num_input_tokens_seen": 48699200, "step": 83870 }, { "epoch": 12.492552874590409, "grad_norm": 1.5152727365493774, "learning_rate": 1.8561544290535234e-05, "loss": 0.6609, "num_input_tokens_seen": 48702144, "step": 83875 }, { "epoch": 12.493297587131368, "grad_norm": 1.3557990789413452, "learning_rate": 1.8558404532600717e-05, "loss": 0.7224, "num_input_tokens_seen": 48705120, "step": 83880 }, { "epoch": 12.494042299672326, "grad_norm": 2.0238959789276123, "learning_rate": 1.8555264883490397e-05, "loss": 0.4909, "num_input_tokens_seen": 48707840, "step": 83885 }, { "epoch": 12.494787012213285, "grad_norm": 1.7209396362304688, "learning_rate": 1.8552125343257337e-05, "loss": 0.7917, "num_input_tokens_seen": 48710848, "step": 83890 }, { "epoch": 12.495531724754246, "grad_norm": 2.6348562240600586, "learning_rate": 1.8548985911954557e-05, "loss": 0.582, "num_input_tokens_seen": 48713792, "step": 83895 }, { "epoch": 12.496276437295204, "grad_norm": 1.4668991565704346, "learning_rate": 1.8545846589635115e-05, "loss": 0.7181, "num_input_tokens_seen": 48716896, "step": 83900 }, { "epoch": 12.497021149836163, "grad_norm": 0.8852407932281494, "learning_rate": 1.8542707376352033e-05, "loss": 0.4678, "num_input_tokens_seen": 48719552, "step": 83905 }, { "epoch": 12.497765862377122, "grad_norm": 2.2643773555755615, "learning_rate": 1.853956827215834e-05, "loss": 0.59, "num_input_tokens_seen": 48722144, "step": 83910 }, { "epoch": 12.498510574918082, "grad_norm": 1.009743094444275, "learning_rate": 1.8536429277107086e-05, "loss": 0.5396, "num_input_tokens_seen": 48725024, "step": 83915 }, { "epoch": 12.499255287459041, "grad_norm": 2.466179609298706, "learning_rate": 1.8533290391251278e-05, "loss": 0.5451, "num_input_tokens_seen": 48728192, "step": 83920 }, { "epoch": 12.5, "grad_norm": 0.975845456123352, "learning_rate": 1.8530151614643966e-05, "loss": 0.4631, "num_input_tokens_seen": 48731296, "step": 83925 }, { "epoch": 12.500744712540959, "grad_norm": 1.6448984146118164, "learning_rate": 1.8527012947338155e-05, "loss": 0.5453, "num_input_tokens_seen": 48734080, "step": 83930 }, { "epoch": 12.501489425081918, "grad_norm": 1.3234494924545288, "learning_rate": 1.852387438938689e-05, "loss": 0.4947, "num_input_tokens_seen": 48736736, "step": 83935 }, { "epoch": 12.502234137622878, "grad_norm": 1.3951681852340698, "learning_rate": 1.8520735940843187e-05, "loss": 0.5658, "num_input_tokens_seen": 48739584, "step": 83940 }, { "epoch": 12.502978850163837, "grad_norm": 1.4386258125305176, "learning_rate": 1.8517597601760062e-05, "loss": 0.4875, "num_input_tokens_seen": 48742720, "step": 83945 }, { "epoch": 12.503723562704796, "grad_norm": 1.1875065565109253, "learning_rate": 1.851445937219054e-05, "loss": 0.5635, "num_input_tokens_seen": 48745696, "step": 83950 }, { "epoch": 12.504468275245754, "grad_norm": 1.4160231351852417, "learning_rate": 1.8511321252187625e-05, "loss": 0.5458, "num_input_tokens_seen": 48748416, "step": 83955 }, { "epoch": 12.505212987786715, "grad_norm": 1.3112883567810059, "learning_rate": 1.8508183241804356e-05, "loss": 0.7232, "num_input_tokens_seen": 48751296, "step": 83960 }, { "epoch": 12.505957700327674, "grad_norm": 1.9691635370254517, "learning_rate": 1.850504534109372e-05, "loss": 0.4739, "num_input_tokens_seen": 48754336, "step": 83965 }, { "epoch": 12.506702412868632, "grad_norm": 1.216568946838379, "learning_rate": 1.8501907550108752e-05, "loss": 0.6323, "num_input_tokens_seen": 48757440, "step": 83970 }, { "epoch": 12.507447125409591, "grad_norm": 1.1724085807800293, "learning_rate": 1.8498769868902445e-05, "loss": 0.4984, "num_input_tokens_seen": 48760384, "step": 83975 }, { "epoch": 12.508191837950552, "grad_norm": 1.6496535539627075, "learning_rate": 1.849563229752782e-05, "loss": 0.7805, "num_input_tokens_seen": 48763296, "step": 83980 }, { "epoch": 12.50893655049151, "grad_norm": 4.177395343780518, "learning_rate": 1.849249483603788e-05, "loss": 0.7382, "num_input_tokens_seen": 48766208, "step": 83985 }, { "epoch": 12.50968126303247, "grad_norm": 1.4342609643936157, "learning_rate": 1.8489357484485616e-05, "loss": 0.4809, "num_input_tokens_seen": 48769280, "step": 83990 }, { "epoch": 12.510425975573428, "grad_norm": 1.8570462465286255, "learning_rate": 1.8486220242924042e-05, "loss": 0.5591, "num_input_tokens_seen": 48772128, "step": 83995 }, { "epoch": 12.511170688114388, "grad_norm": 1.9129406213760376, "learning_rate": 1.8483083111406154e-05, "loss": 0.5635, "num_input_tokens_seen": 48775008, "step": 84000 }, { "epoch": 12.511915400655347, "grad_norm": 1.04367196559906, "learning_rate": 1.8479946089984963e-05, "loss": 0.6155, "num_input_tokens_seen": 48777760, "step": 84005 }, { "epoch": 12.512660113196306, "grad_norm": 1.7339097261428833, "learning_rate": 1.8476809178713446e-05, "loss": 0.5727, "num_input_tokens_seen": 48780288, "step": 84010 }, { "epoch": 12.513404825737265, "grad_norm": 1.2763140201568604, "learning_rate": 1.8473672377644617e-05, "loss": 0.6593, "num_input_tokens_seen": 48783168, "step": 84015 }, { "epoch": 12.514149538278225, "grad_norm": 2.34401273727417, "learning_rate": 1.8470535686831446e-05, "loss": 0.7022, "num_input_tokens_seen": 48786816, "step": 84020 }, { "epoch": 12.514894250819184, "grad_norm": 1.1660149097442627, "learning_rate": 1.8467399106326954e-05, "loss": 0.5385, "num_input_tokens_seen": 48789696, "step": 84025 }, { "epoch": 12.515638963360143, "grad_norm": 2.7098724842071533, "learning_rate": 1.8464262636184117e-05, "loss": 0.7679, "num_input_tokens_seen": 48792576, "step": 84030 }, { "epoch": 12.516383675901102, "grad_norm": 1.760474443435669, "learning_rate": 1.8461126276455904e-05, "loss": 0.6513, "num_input_tokens_seen": 48795168, "step": 84035 }, { "epoch": 12.517128388442062, "grad_norm": 1.7046865224838257, "learning_rate": 1.8457990027195325e-05, "loss": 0.4869, "num_input_tokens_seen": 48798080, "step": 84040 }, { "epoch": 12.51787310098302, "grad_norm": 3.895146369934082, "learning_rate": 1.8454853888455352e-05, "loss": 0.8016, "num_input_tokens_seen": 48801120, "step": 84045 }, { "epoch": 12.51861781352398, "grad_norm": 1.5056811571121216, "learning_rate": 1.845171786028898e-05, "loss": 0.8832, "num_input_tokens_seen": 48804064, "step": 84050 }, { "epoch": 12.519362526064938, "grad_norm": 1.7371928691864014, "learning_rate": 1.8448581942749167e-05, "loss": 0.5816, "num_input_tokens_seen": 48806848, "step": 84055 }, { "epoch": 12.520107238605899, "grad_norm": 1.867022156715393, "learning_rate": 1.844544613588891e-05, "loss": 0.8165, "num_input_tokens_seen": 48809888, "step": 84060 }, { "epoch": 12.520851951146858, "grad_norm": 1.9811769723892212, "learning_rate": 1.8442310439761185e-05, "loss": 0.7815, "num_input_tokens_seen": 48812864, "step": 84065 }, { "epoch": 12.521596663687816, "grad_norm": 0.8848538994789124, "learning_rate": 1.8439174854418946e-05, "loss": 0.5457, "num_input_tokens_seen": 48815808, "step": 84070 }, { "epoch": 12.522341376228775, "grad_norm": 1.2126271724700928, "learning_rate": 1.843603937991519e-05, "loss": 0.6045, "num_input_tokens_seen": 48818592, "step": 84075 }, { "epoch": 12.523086088769734, "grad_norm": 2.542738914489746, "learning_rate": 1.8432904016302872e-05, "loss": 0.5099, "num_input_tokens_seen": 48821376, "step": 84080 }, { "epoch": 12.523830801310694, "grad_norm": 1.0943386554718018, "learning_rate": 1.8429768763634974e-05, "loss": 0.6688, "num_input_tokens_seen": 48824224, "step": 84085 }, { "epoch": 12.524575513851653, "grad_norm": 3.500757932662964, "learning_rate": 1.8426633621964443e-05, "loss": 0.726, "num_input_tokens_seen": 48827040, "step": 84090 }, { "epoch": 12.525320226392612, "grad_norm": 1.1913869380950928, "learning_rate": 1.8423498591344267e-05, "loss": 0.7293, "num_input_tokens_seen": 48829696, "step": 84095 }, { "epoch": 12.526064938933573, "grad_norm": 1.4277945756912231, "learning_rate": 1.8420363671827387e-05, "loss": 0.7657, "num_input_tokens_seen": 48832864, "step": 84100 }, { "epoch": 12.526809651474531, "grad_norm": 1.3126494884490967, "learning_rate": 1.8417228863466786e-05, "loss": 0.5085, "num_input_tokens_seen": 48835584, "step": 84105 }, { "epoch": 12.52755436401549, "grad_norm": 1.6869628429412842, "learning_rate": 1.841409416631541e-05, "loss": 0.7251, "num_input_tokens_seen": 48838432, "step": 84110 }, { "epoch": 12.528299076556449, "grad_norm": 2.2968735694885254, "learning_rate": 1.8410959580426222e-05, "loss": 0.6307, "num_input_tokens_seen": 48841664, "step": 84115 }, { "epoch": 12.529043789097408, "grad_norm": 0.8306938409805298, "learning_rate": 1.8407825105852175e-05, "loss": 0.56, "num_input_tokens_seen": 48844448, "step": 84120 }, { "epoch": 12.529788501638368, "grad_norm": 1.1387250423431396, "learning_rate": 1.8404690742646212e-05, "loss": 0.4993, "num_input_tokens_seen": 48847456, "step": 84125 }, { "epoch": 12.530533214179327, "grad_norm": 1.7008053064346313, "learning_rate": 1.840155649086131e-05, "loss": 0.7823, "num_input_tokens_seen": 48850464, "step": 84130 }, { "epoch": 12.531277926720286, "grad_norm": 2.823185682296753, "learning_rate": 1.8398422350550386e-05, "loss": 0.6521, "num_input_tokens_seen": 48853248, "step": 84135 }, { "epoch": 12.532022639261244, "grad_norm": 1.378844141960144, "learning_rate": 1.8395288321766424e-05, "loss": 0.5322, "num_input_tokens_seen": 48856032, "step": 84140 }, { "epoch": 12.532767351802205, "grad_norm": 1.5853701829910278, "learning_rate": 1.8392154404562354e-05, "loss": 0.5566, "num_input_tokens_seen": 48859072, "step": 84145 }, { "epoch": 12.533512064343164, "grad_norm": 1.0752023458480835, "learning_rate": 1.8389020598991113e-05, "loss": 0.6806, "num_input_tokens_seen": 48862176, "step": 84150 }, { "epoch": 12.534256776884122, "grad_norm": 1.8529741764068604, "learning_rate": 1.8385886905105653e-05, "loss": 0.5795, "num_input_tokens_seen": 48865184, "step": 84155 }, { "epoch": 12.535001489425081, "grad_norm": 1.88121497631073, "learning_rate": 1.8382753322958902e-05, "loss": 0.6594, "num_input_tokens_seen": 48868096, "step": 84160 }, { "epoch": 12.535746201966042, "grad_norm": 2.1565704345703125, "learning_rate": 1.837961985260382e-05, "loss": 0.7871, "num_input_tokens_seen": 48870784, "step": 84165 }, { "epoch": 12.536490914507, "grad_norm": 1.057510495185852, "learning_rate": 1.8376486494093327e-05, "loss": 0.5542, "num_input_tokens_seen": 48873408, "step": 84170 }, { "epoch": 12.53723562704796, "grad_norm": 0.8627826571464539, "learning_rate": 1.837335324748036e-05, "loss": 0.6045, "num_input_tokens_seen": 48876256, "step": 84175 }, { "epoch": 12.537980339588918, "grad_norm": 2.4996025562286377, "learning_rate": 1.8370220112817854e-05, "loss": 0.5608, "num_input_tokens_seen": 48878944, "step": 84180 }, { "epoch": 12.538725052129879, "grad_norm": 1.858368158340454, "learning_rate": 1.836708709015875e-05, "loss": 0.7007, "num_input_tokens_seen": 48882080, "step": 84185 }, { "epoch": 12.539469764670837, "grad_norm": 2.4412546157836914, "learning_rate": 1.836395417955597e-05, "loss": 0.6587, "num_input_tokens_seen": 48884896, "step": 84190 }, { "epoch": 12.540214477211796, "grad_norm": 1.673698902130127, "learning_rate": 1.836082138106242e-05, "loss": 0.7511, "num_input_tokens_seen": 48887840, "step": 84195 }, { "epoch": 12.540959189752755, "grad_norm": 1.168013334274292, "learning_rate": 1.8357688694731063e-05, "loss": 0.5763, "num_input_tokens_seen": 48890560, "step": 84200 }, { "epoch": 12.541703902293715, "grad_norm": 1.9195612668991089, "learning_rate": 1.8354556120614796e-05, "loss": 0.6238, "num_input_tokens_seen": 48893536, "step": 84205 }, { "epoch": 12.542448614834674, "grad_norm": 1.6628789901733398, "learning_rate": 1.8351423658766557e-05, "loss": 0.6608, "num_input_tokens_seen": 48896704, "step": 84210 }, { "epoch": 12.543193327375633, "grad_norm": 1.439420223236084, "learning_rate": 1.8348291309239248e-05, "loss": 0.4382, "num_input_tokens_seen": 48899616, "step": 84215 }, { "epoch": 12.543938039916592, "grad_norm": 1.6784614324569702, "learning_rate": 1.8345159072085803e-05, "loss": 0.4485, "num_input_tokens_seen": 48902496, "step": 84220 }, { "epoch": 12.544682752457552, "grad_norm": 1.4648410081863403, "learning_rate": 1.8342026947359137e-05, "loss": 0.5922, "num_input_tokens_seen": 48905472, "step": 84225 }, { "epoch": 12.545427464998511, "grad_norm": 3.7819225788116455, "learning_rate": 1.8338894935112144e-05, "loss": 0.5453, "num_input_tokens_seen": 48908704, "step": 84230 }, { "epoch": 12.54617217753947, "grad_norm": 0.5965850353240967, "learning_rate": 1.8335763035397765e-05, "loss": 0.457, "num_input_tokens_seen": 48911584, "step": 84235 }, { "epoch": 12.546916890080428, "grad_norm": 0.7689455151557922, "learning_rate": 1.833263124826889e-05, "loss": 0.5744, "num_input_tokens_seen": 48914144, "step": 84240 }, { "epoch": 12.547661602621389, "grad_norm": 1.981117844581604, "learning_rate": 1.832949957377844e-05, "loss": 0.6007, "num_input_tokens_seen": 48916832, "step": 84245 }, { "epoch": 12.548406315162348, "grad_norm": 1.6942596435546875, "learning_rate": 1.83263680119793e-05, "loss": 0.6657, "num_input_tokens_seen": 48919552, "step": 84250 }, { "epoch": 12.549151027703306, "grad_norm": 1.2807317972183228, "learning_rate": 1.8323236562924405e-05, "loss": 0.6981, "num_input_tokens_seen": 48922560, "step": 84255 }, { "epoch": 12.549895740244265, "grad_norm": 2.112243413925171, "learning_rate": 1.8320105226666628e-05, "loss": 0.4328, "num_input_tokens_seen": 48925312, "step": 84260 }, { "epoch": 12.550640452785224, "grad_norm": 0.8770684003829956, "learning_rate": 1.8316974003258898e-05, "loss": 0.5, "num_input_tokens_seen": 48927840, "step": 84265 }, { "epoch": 12.551385165326185, "grad_norm": 1.7367942333221436, "learning_rate": 1.8313842892754097e-05, "loss": 0.3665, "num_input_tokens_seen": 48930688, "step": 84270 }, { "epoch": 12.552129877867143, "grad_norm": 1.1202353239059448, "learning_rate": 1.8310711895205125e-05, "loss": 0.5775, "num_input_tokens_seen": 48933888, "step": 84275 }, { "epoch": 12.552874590408102, "grad_norm": 1.9582092761993408, "learning_rate": 1.8307581010664875e-05, "loss": 0.6087, "num_input_tokens_seen": 48936640, "step": 84280 }, { "epoch": 12.553619302949063, "grad_norm": 2.0614805221557617, "learning_rate": 1.8304450239186235e-05, "loss": 0.7588, "num_input_tokens_seen": 48939776, "step": 84285 }, { "epoch": 12.554364015490021, "grad_norm": 1.9531787633895874, "learning_rate": 1.8301319580822112e-05, "loss": 0.7717, "num_input_tokens_seen": 48942624, "step": 84290 }, { "epoch": 12.55510872803098, "grad_norm": 2.4781248569488525, "learning_rate": 1.829818903562538e-05, "loss": 0.4468, "num_input_tokens_seen": 48945440, "step": 84295 }, { "epoch": 12.555853440571939, "grad_norm": 4.433130264282227, "learning_rate": 1.8295058603648942e-05, "loss": 0.8395, "num_input_tokens_seen": 48948224, "step": 84300 }, { "epoch": 12.556598153112898, "grad_norm": 1.479236125946045, "learning_rate": 1.8291928284945668e-05, "loss": 0.8969, "num_input_tokens_seen": 48950944, "step": 84305 }, { "epoch": 12.557342865653858, "grad_norm": 1.1753838062286377, "learning_rate": 1.828879807956845e-05, "loss": 0.5593, "num_input_tokens_seen": 48953568, "step": 84310 }, { "epoch": 12.558087578194817, "grad_norm": 1.8690721988677979, "learning_rate": 1.828566798757017e-05, "loss": 0.5892, "num_input_tokens_seen": 48956256, "step": 84315 }, { "epoch": 12.558832290735776, "grad_norm": 1.3700029850006104, "learning_rate": 1.8282538009003696e-05, "loss": 0.4331, "num_input_tokens_seen": 48959072, "step": 84320 }, { "epoch": 12.559577003276734, "grad_norm": 0.9937037229537964, "learning_rate": 1.827940814392192e-05, "loss": 0.5325, "num_input_tokens_seen": 48961792, "step": 84325 }, { "epoch": 12.560321715817695, "grad_norm": 1.6153312921524048, "learning_rate": 1.827627839237771e-05, "loss": 0.6393, "num_input_tokens_seen": 48964768, "step": 84330 }, { "epoch": 12.561066428358654, "grad_norm": 0.8554942011833191, "learning_rate": 1.8273148754423953e-05, "loss": 0.6452, "num_input_tokens_seen": 48967872, "step": 84335 }, { "epoch": 12.561811140899612, "grad_norm": 1.4667376279830933, "learning_rate": 1.82700192301135e-05, "loss": 0.563, "num_input_tokens_seen": 48970816, "step": 84340 }, { "epoch": 12.562555853440571, "grad_norm": 1.540986180305481, "learning_rate": 1.826688981949924e-05, "loss": 0.8935, "num_input_tokens_seen": 48973760, "step": 84345 }, { "epoch": 12.563300565981532, "grad_norm": 1.9529097080230713, "learning_rate": 1.8263760522634033e-05, "loss": 0.5019, "num_input_tokens_seen": 48976576, "step": 84350 }, { "epoch": 12.56404527852249, "grad_norm": 1.2732514142990112, "learning_rate": 1.826063133957074e-05, "loss": 0.6733, "num_input_tokens_seen": 48979520, "step": 84355 }, { "epoch": 12.56478999106345, "grad_norm": 1.686697006225586, "learning_rate": 1.8257502270362235e-05, "loss": 0.4241, "num_input_tokens_seen": 48982208, "step": 84360 }, { "epoch": 12.565534703604408, "grad_norm": 1.37980318069458, "learning_rate": 1.8254373315061364e-05, "loss": 0.5972, "num_input_tokens_seen": 48985024, "step": 84365 }, { "epoch": 12.566279416145369, "grad_norm": 1.7648415565490723, "learning_rate": 1.8251244473721017e-05, "loss": 0.5724, "num_input_tokens_seen": 48987904, "step": 84370 }, { "epoch": 12.567024128686327, "grad_norm": 1.0902949571609497, "learning_rate": 1.8248115746394025e-05, "loss": 0.5127, "num_input_tokens_seen": 48990976, "step": 84375 }, { "epoch": 12.567768841227286, "grad_norm": 2.4015567302703857, "learning_rate": 1.8244987133133264e-05, "loss": 0.7072, "num_input_tokens_seen": 48994048, "step": 84380 }, { "epoch": 12.568513553768245, "grad_norm": 1.587532877922058, "learning_rate": 1.8241858633991578e-05, "loss": 0.7396, "num_input_tokens_seen": 48996896, "step": 84385 }, { "epoch": 12.569258266309205, "grad_norm": 0.6473968029022217, "learning_rate": 1.8238730249021812e-05, "loss": 0.5241, "num_input_tokens_seen": 48999552, "step": 84390 }, { "epoch": 12.570002978850164, "grad_norm": 0.8648810386657715, "learning_rate": 1.8235601978276838e-05, "loss": 0.5641, "num_input_tokens_seen": 49002400, "step": 84395 }, { "epoch": 12.570747691391123, "grad_norm": 1.3406533002853394, "learning_rate": 1.823247382180948e-05, "loss": 0.5155, "num_input_tokens_seen": 49005056, "step": 84400 }, { "epoch": 12.571492403932082, "grad_norm": 0.8050234317779541, "learning_rate": 1.8229345779672613e-05, "loss": 0.5016, "num_input_tokens_seen": 49007968, "step": 84405 }, { "epoch": 12.572237116473042, "grad_norm": 1.4270228147506714, "learning_rate": 1.8226217851919062e-05, "loss": 0.5628, "num_input_tokens_seen": 49010720, "step": 84410 }, { "epoch": 12.572981829014001, "grad_norm": 1.0092623233795166, "learning_rate": 1.8223090038601678e-05, "loss": 0.5536, "num_input_tokens_seen": 49013376, "step": 84415 }, { "epoch": 12.57372654155496, "grad_norm": 2.12316632270813, "learning_rate": 1.8219962339773292e-05, "loss": 0.7591, "num_input_tokens_seen": 49015904, "step": 84420 }, { "epoch": 12.574471254095918, "grad_norm": 2.983255624771118, "learning_rate": 1.8216834755486763e-05, "loss": 0.5697, "num_input_tokens_seen": 49018592, "step": 84425 }, { "epoch": 12.575215966636879, "grad_norm": 2.3995161056518555, "learning_rate": 1.821370728579491e-05, "loss": 0.6756, "num_input_tokens_seen": 49021376, "step": 84430 }, { "epoch": 12.575960679177838, "grad_norm": 1.8000988960266113, "learning_rate": 1.821057993075057e-05, "loss": 0.5275, "num_input_tokens_seen": 49024192, "step": 84435 }, { "epoch": 12.576705391718797, "grad_norm": 2.030552864074707, "learning_rate": 1.8207452690406594e-05, "loss": 0.7033, "num_input_tokens_seen": 49027296, "step": 84440 }, { "epoch": 12.577450104259755, "grad_norm": 1.3556112051010132, "learning_rate": 1.8204325564815796e-05, "loss": 0.7035, "num_input_tokens_seen": 49030112, "step": 84445 }, { "epoch": 12.578194816800714, "grad_norm": 1.2662625312805176, "learning_rate": 1.820119855403101e-05, "loss": 0.4733, "num_input_tokens_seen": 49032832, "step": 84450 }, { "epoch": 12.578939529341675, "grad_norm": 1.3806769847869873, "learning_rate": 1.819807165810506e-05, "loss": 0.831, "num_input_tokens_seen": 49036064, "step": 84455 }, { "epoch": 12.579684241882633, "grad_norm": 0.9704115390777588, "learning_rate": 1.819494487709078e-05, "loss": 0.586, "num_input_tokens_seen": 49038944, "step": 84460 }, { "epoch": 12.580428954423592, "grad_norm": 1.3161861896514893, "learning_rate": 1.8191818211040997e-05, "loss": 0.4411, "num_input_tokens_seen": 49041856, "step": 84465 }, { "epoch": 12.58117366696455, "grad_norm": 1.223639965057373, "learning_rate": 1.8188691660008513e-05, "loss": 0.5659, "num_input_tokens_seen": 49044832, "step": 84470 }, { "epoch": 12.581918379505511, "grad_norm": 0.9276485443115234, "learning_rate": 1.818556522404617e-05, "loss": 0.663, "num_input_tokens_seen": 49047872, "step": 84475 }, { "epoch": 12.58266309204647, "grad_norm": 1.1781587600708008, "learning_rate": 1.818243890320677e-05, "loss": 0.5934, "num_input_tokens_seen": 49050688, "step": 84480 }, { "epoch": 12.583407804587429, "grad_norm": 1.8413000106811523, "learning_rate": 1.8179312697543145e-05, "loss": 0.5767, "num_input_tokens_seen": 49053376, "step": 84485 }, { "epoch": 12.584152517128388, "grad_norm": 1.0453345775604248, "learning_rate": 1.8176186607108086e-05, "loss": 0.6904, "num_input_tokens_seen": 49056096, "step": 84490 }, { "epoch": 12.584897229669348, "grad_norm": 1.5328017473220825, "learning_rate": 1.817306063195443e-05, "loss": 0.4383, "num_input_tokens_seen": 49058944, "step": 84495 }, { "epoch": 12.585641942210307, "grad_norm": 1.09140145778656, "learning_rate": 1.8169934772134974e-05, "loss": 0.5823, "num_input_tokens_seen": 49062080, "step": 84500 }, { "epoch": 12.586386654751266, "grad_norm": 1.0337920188903809, "learning_rate": 1.8166809027702522e-05, "loss": 0.4978, "num_input_tokens_seen": 49064736, "step": 84505 }, { "epoch": 12.587131367292224, "grad_norm": 1.323034405708313, "learning_rate": 1.8163683398709898e-05, "loss": 0.506, "num_input_tokens_seen": 49067584, "step": 84510 }, { "epoch": 12.587876079833185, "grad_norm": 0.6268993616104126, "learning_rate": 1.8160557885209884e-05, "loss": 0.4745, "num_input_tokens_seen": 49070400, "step": 84515 }, { "epoch": 12.588620792374144, "grad_norm": 1.2113796472549438, "learning_rate": 1.81574324872553e-05, "loss": 0.6513, "num_input_tokens_seen": 49073312, "step": 84520 }, { "epoch": 12.589365504915103, "grad_norm": 0.8678757548332214, "learning_rate": 1.8154307204898933e-05, "loss": 0.616, "num_input_tokens_seen": 49076224, "step": 84525 }, { "epoch": 12.590110217456061, "grad_norm": 1.0833396911621094, "learning_rate": 1.8151182038193594e-05, "loss": 0.6349, "num_input_tokens_seen": 49079072, "step": 84530 }, { "epoch": 12.590854929997022, "grad_norm": 1.8482781648635864, "learning_rate": 1.814805698719207e-05, "loss": 0.8748, "num_input_tokens_seen": 49082016, "step": 84535 }, { "epoch": 12.59159964253798, "grad_norm": 0.8017575144767761, "learning_rate": 1.8144932051947166e-05, "loss": 0.4195, "num_input_tokens_seen": 49085216, "step": 84540 }, { "epoch": 12.59234435507894, "grad_norm": 1.1015074253082275, "learning_rate": 1.814180723251166e-05, "loss": 0.5247, "num_input_tokens_seen": 49087968, "step": 84545 }, { "epoch": 12.593089067619898, "grad_norm": 1.860395073890686, "learning_rate": 1.8138682528938354e-05, "loss": 0.6712, "num_input_tokens_seen": 49091232, "step": 84550 }, { "epoch": 12.593833780160859, "grad_norm": 2.0765798091888428, "learning_rate": 1.8135557941280035e-05, "loss": 0.5811, "num_input_tokens_seen": 49094368, "step": 84555 }, { "epoch": 12.594578492701817, "grad_norm": 1.126034140586853, "learning_rate": 1.813243346958948e-05, "loss": 0.6533, "num_input_tokens_seen": 49097184, "step": 84560 }, { "epoch": 12.595323205242776, "grad_norm": 0.9014512300491333, "learning_rate": 1.812930911391949e-05, "loss": 0.5293, "num_input_tokens_seen": 49100032, "step": 84565 }, { "epoch": 12.596067917783735, "grad_norm": 1.4886759519577026, "learning_rate": 1.8126184874322837e-05, "loss": 0.5729, "num_input_tokens_seen": 49102848, "step": 84570 }, { "epoch": 12.596812630324695, "grad_norm": 1.1274142265319824, "learning_rate": 1.8123060750852305e-05, "loss": 0.5491, "num_input_tokens_seen": 49105632, "step": 84575 }, { "epoch": 12.597557342865654, "grad_norm": 1.406434416770935, "learning_rate": 1.8119936743560667e-05, "loss": 0.4013, "num_input_tokens_seen": 49108352, "step": 84580 }, { "epoch": 12.598302055406613, "grad_norm": 1.64797842502594, "learning_rate": 1.8116812852500713e-05, "loss": 0.7279, "num_input_tokens_seen": 49111264, "step": 84585 }, { "epoch": 12.599046767947572, "grad_norm": 0.785957396030426, "learning_rate": 1.811368907772521e-05, "loss": 0.5382, "num_input_tokens_seen": 49114144, "step": 84590 }, { "epoch": 12.599791480488532, "grad_norm": 1.2829298973083496, "learning_rate": 1.8110565419286916e-05, "loss": 0.622, "num_input_tokens_seen": 49117120, "step": 84595 }, { "epoch": 12.600536193029491, "grad_norm": 0.8380914330482483, "learning_rate": 1.8107441877238634e-05, "loss": 0.5881, "num_input_tokens_seen": 49119904, "step": 84600 }, { "epoch": 12.60128090557045, "grad_norm": 2.092588186264038, "learning_rate": 1.8104318451633114e-05, "loss": 0.4974, "num_input_tokens_seen": 49123008, "step": 84605 }, { "epoch": 12.602025618111409, "grad_norm": 0.9164135456085205, "learning_rate": 1.810119514252312e-05, "loss": 0.5138, "num_input_tokens_seen": 49125920, "step": 84610 }, { "epoch": 12.602770330652369, "grad_norm": 1.1646746397018433, "learning_rate": 1.809807194996142e-05, "loss": 0.6652, "num_input_tokens_seen": 49128608, "step": 84615 }, { "epoch": 12.603515043193328, "grad_norm": 1.5850049257278442, "learning_rate": 1.809494887400079e-05, "loss": 0.5857, "num_input_tokens_seen": 49131456, "step": 84620 }, { "epoch": 12.604259755734287, "grad_norm": 1.4551057815551758, "learning_rate": 1.8091825914693966e-05, "loss": 0.6204, "num_input_tokens_seen": 49134144, "step": 84625 }, { "epoch": 12.605004468275245, "grad_norm": 1.653128743171692, "learning_rate": 1.8088703072093735e-05, "loss": 0.6583, "num_input_tokens_seen": 49136832, "step": 84630 }, { "epoch": 12.605749180816204, "grad_norm": 3.4564199447631836, "learning_rate": 1.808558034625284e-05, "loss": 0.8322, "num_input_tokens_seen": 49139488, "step": 84635 }, { "epoch": 12.606493893357165, "grad_norm": 1.3217819929122925, "learning_rate": 1.8082457737224034e-05, "loss": 0.615, "num_input_tokens_seen": 49142432, "step": 84640 }, { "epoch": 12.607238605898123, "grad_norm": 1.3598436117172241, "learning_rate": 1.8079335245060076e-05, "loss": 0.6309, "num_input_tokens_seen": 49145568, "step": 84645 }, { "epoch": 12.607983318439082, "grad_norm": 1.346673607826233, "learning_rate": 1.8076212869813706e-05, "loss": 0.4425, "num_input_tokens_seen": 49148416, "step": 84650 }, { "epoch": 12.608728030980041, "grad_norm": 1.046932578086853, "learning_rate": 1.8073090611537697e-05, "loss": 0.636, "num_input_tokens_seen": 49151200, "step": 84655 }, { "epoch": 12.609472743521001, "grad_norm": 1.2434762716293335, "learning_rate": 1.8069968470284768e-05, "loss": 0.6284, "num_input_tokens_seen": 49154048, "step": 84660 }, { "epoch": 12.61021745606196, "grad_norm": 1.2088594436645508, "learning_rate": 1.806684644610769e-05, "loss": 0.7651, "num_input_tokens_seen": 49156768, "step": 84665 }, { "epoch": 12.610962168602919, "grad_norm": 1.1328738927841187, "learning_rate": 1.8063724539059195e-05, "loss": 0.6616, "num_input_tokens_seen": 49159328, "step": 84670 }, { "epoch": 12.611706881143878, "grad_norm": 1.1066323518753052, "learning_rate": 1.806060274919202e-05, "loss": 0.6574, "num_input_tokens_seen": 49162368, "step": 84675 }, { "epoch": 12.612451593684838, "grad_norm": 2.0242161750793457, "learning_rate": 1.8057481076558906e-05, "loss": 0.688, "num_input_tokens_seen": 49165056, "step": 84680 }, { "epoch": 12.613196306225797, "grad_norm": 1.0420475006103516, "learning_rate": 1.8054359521212592e-05, "loss": 0.6693, "num_input_tokens_seen": 49167840, "step": 84685 }, { "epoch": 12.613941018766756, "grad_norm": 1.7436317205429077, "learning_rate": 1.805123808320582e-05, "loss": 0.4152, "num_input_tokens_seen": 49170720, "step": 84690 }, { "epoch": 12.614685731307715, "grad_norm": 1.773830771446228, "learning_rate": 1.804811676259131e-05, "loss": 0.6066, "num_input_tokens_seen": 49173600, "step": 84695 }, { "epoch": 12.615430443848675, "grad_norm": 1.7416329383850098, "learning_rate": 1.8044995559421813e-05, "loss": 0.6466, "num_input_tokens_seen": 49176512, "step": 84700 }, { "epoch": 12.616175156389634, "grad_norm": 1.238579511642456, "learning_rate": 1.804187447375004e-05, "loss": 0.5599, "num_input_tokens_seen": 49179360, "step": 84705 }, { "epoch": 12.616919868930593, "grad_norm": 1.777021884918213, "learning_rate": 1.803875350562873e-05, "loss": 0.4903, "num_input_tokens_seen": 49182528, "step": 84710 }, { "epoch": 12.617664581471551, "grad_norm": 1.1807230710983276, "learning_rate": 1.8035632655110607e-05, "loss": 0.6918, "num_input_tokens_seen": 49185312, "step": 84715 }, { "epoch": 12.618409294012512, "grad_norm": 1.8853799104690552, "learning_rate": 1.803251192224838e-05, "loss": 0.4925, "num_input_tokens_seen": 49188192, "step": 84720 }, { "epoch": 12.61915400655347, "grad_norm": 2.2818069458007812, "learning_rate": 1.8029391307094796e-05, "loss": 0.534, "num_input_tokens_seen": 49191232, "step": 84725 }, { "epoch": 12.61989871909443, "grad_norm": 1.0080070495605469, "learning_rate": 1.8026270809702547e-05, "loss": 0.5927, "num_input_tokens_seen": 49194112, "step": 84730 }, { "epoch": 12.620643431635388, "grad_norm": 1.835183024406433, "learning_rate": 1.8023150430124375e-05, "loss": 0.7146, "num_input_tokens_seen": 49196960, "step": 84735 }, { "epoch": 12.621388144176349, "grad_norm": 1.6389801502227783, "learning_rate": 1.802003016841298e-05, "loss": 0.5729, "num_input_tokens_seen": 49199808, "step": 84740 }, { "epoch": 12.622132856717307, "grad_norm": 2.0728724002838135, "learning_rate": 1.801691002462109e-05, "loss": 0.5978, "num_input_tokens_seen": 49202976, "step": 84745 }, { "epoch": 12.622877569258266, "grad_norm": 1.322805643081665, "learning_rate": 1.8013789998801407e-05, "loss": 0.643, "num_input_tokens_seen": 49205888, "step": 84750 }, { "epoch": 12.623622281799225, "grad_norm": 1.382535696029663, "learning_rate": 1.801067009100663e-05, "loss": 0.5365, "num_input_tokens_seen": 49208672, "step": 84755 }, { "epoch": 12.624366994340185, "grad_norm": 1.7046337127685547, "learning_rate": 1.800755030128949e-05, "loss": 0.656, "num_input_tokens_seen": 49211360, "step": 84760 }, { "epoch": 12.625111706881144, "grad_norm": 1.8991941213607788, "learning_rate": 1.800443062970267e-05, "loss": 0.6143, "num_input_tokens_seen": 49214144, "step": 84765 }, { "epoch": 12.625856419422103, "grad_norm": 1.3346469402313232, "learning_rate": 1.8001311076298895e-05, "loss": 0.3631, "num_input_tokens_seen": 49217216, "step": 84770 }, { "epoch": 12.626601131963062, "grad_norm": 1.3808598518371582, "learning_rate": 1.799819164113085e-05, "loss": 0.6537, "num_input_tokens_seen": 49219936, "step": 84775 }, { "epoch": 12.62734584450402, "grad_norm": 2.0016024112701416, "learning_rate": 1.799507232425125e-05, "loss": 0.7393, "num_input_tokens_seen": 49222912, "step": 84780 }, { "epoch": 12.628090557044981, "grad_norm": 2.6958117485046387, "learning_rate": 1.799195312571277e-05, "loss": 0.7018, "num_input_tokens_seen": 49225632, "step": 84785 }, { "epoch": 12.62883526958594, "grad_norm": 2.7162322998046875, "learning_rate": 1.7988834045568126e-05, "loss": 0.8123, "num_input_tokens_seen": 49228416, "step": 84790 }, { "epoch": 12.629579982126899, "grad_norm": 1.2359391450881958, "learning_rate": 1.7985715083870008e-05, "loss": 0.6267, "num_input_tokens_seen": 49231392, "step": 84795 }, { "epoch": 12.63032469466786, "grad_norm": 1.861465334892273, "learning_rate": 1.7982596240671095e-05, "loss": 0.5716, "num_input_tokens_seen": 49234080, "step": 84800 }, { "epoch": 12.631069407208818, "grad_norm": 1.2471568584442139, "learning_rate": 1.7979477516024096e-05, "loss": 0.6892, "num_input_tokens_seen": 49237024, "step": 84805 }, { "epoch": 12.631814119749777, "grad_norm": 1.3759232759475708, "learning_rate": 1.7976358909981686e-05, "loss": 0.5572, "num_input_tokens_seen": 49240000, "step": 84810 }, { "epoch": 12.632558832290735, "grad_norm": 0.9838666915893555, "learning_rate": 1.7973240422596557e-05, "loss": 0.4788, "num_input_tokens_seen": 49242752, "step": 84815 }, { "epoch": 12.633303544831694, "grad_norm": 1.3399571180343628, "learning_rate": 1.7970122053921378e-05, "loss": 0.5886, "num_input_tokens_seen": 49245632, "step": 84820 }, { "epoch": 12.634048257372655, "grad_norm": 1.5304855108261108, "learning_rate": 1.7967003804008855e-05, "loss": 0.603, "num_input_tokens_seen": 49248480, "step": 84825 }, { "epoch": 12.634792969913613, "grad_norm": 1.1132420301437378, "learning_rate": 1.7963885672911655e-05, "loss": 0.6231, "num_input_tokens_seen": 49251584, "step": 84830 }, { "epoch": 12.635537682454572, "grad_norm": 1.5830038785934448, "learning_rate": 1.7960767660682442e-05, "loss": 0.6835, "num_input_tokens_seen": 49254432, "step": 84835 }, { "epoch": 12.636282394995531, "grad_norm": 1.3971316814422607, "learning_rate": 1.7957649767373916e-05, "loss": 0.4863, "num_input_tokens_seen": 49257280, "step": 84840 }, { "epoch": 12.637027107536491, "grad_norm": 1.5799640417099, "learning_rate": 1.7954531993038737e-05, "loss": 0.7329, "num_input_tokens_seen": 49260544, "step": 84845 }, { "epoch": 12.63777182007745, "grad_norm": 1.2503080368041992, "learning_rate": 1.7951414337729584e-05, "loss": 0.4542, "num_input_tokens_seen": 49263232, "step": 84850 }, { "epoch": 12.638516532618409, "grad_norm": 1.75569486618042, "learning_rate": 1.794829680149911e-05, "loss": 0.5866, "num_input_tokens_seen": 49266464, "step": 84855 }, { "epoch": 12.639261245159368, "grad_norm": 1.6606197357177734, "learning_rate": 1.7945179384400002e-05, "loss": 0.5276, "num_input_tokens_seen": 49269344, "step": 84860 }, { "epoch": 12.640005957700328, "grad_norm": 0.8665639162063599, "learning_rate": 1.794206208648492e-05, "loss": 0.5306, "num_input_tokens_seen": 49272096, "step": 84865 }, { "epoch": 12.640750670241287, "grad_norm": 1.6981667280197144, "learning_rate": 1.7938944907806523e-05, "loss": 0.8147, "num_input_tokens_seen": 49274944, "step": 84870 }, { "epoch": 12.641495382782246, "grad_norm": 1.9528489112854004, "learning_rate": 1.7935827848417476e-05, "loss": 0.8246, "num_input_tokens_seen": 49277440, "step": 84875 }, { "epoch": 12.642240095323205, "grad_norm": 0.9044343829154968, "learning_rate": 1.7932710908370434e-05, "loss": 0.4615, "num_input_tokens_seen": 49280128, "step": 84880 }, { "epoch": 12.642984807864165, "grad_norm": 2.3905677795410156, "learning_rate": 1.7929594087718067e-05, "loss": 0.6049, "num_input_tokens_seen": 49283040, "step": 84885 }, { "epoch": 12.643729520405124, "grad_norm": 1.6184182167053223, "learning_rate": 1.7926477386513008e-05, "loss": 0.5057, "num_input_tokens_seen": 49285632, "step": 84890 }, { "epoch": 12.644474232946083, "grad_norm": 1.6467258930206299, "learning_rate": 1.7923360804807937e-05, "loss": 0.7487, "num_input_tokens_seen": 49288448, "step": 84895 }, { "epoch": 12.645218945487041, "grad_norm": 2.2792882919311523, "learning_rate": 1.7920244342655485e-05, "loss": 0.5083, "num_input_tokens_seen": 49291552, "step": 84900 }, { "epoch": 12.645963658028002, "grad_norm": 1.613287329673767, "learning_rate": 1.791712800010832e-05, "loss": 0.6506, "num_input_tokens_seen": 49294528, "step": 84905 }, { "epoch": 12.64670837056896, "grad_norm": 1.0946872234344482, "learning_rate": 1.7914011777219074e-05, "loss": 0.5968, "num_input_tokens_seen": 49297600, "step": 84910 }, { "epoch": 12.64745308310992, "grad_norm": 1.9704898595809937, "learning_rate": 1.7910895674040387e-05, "loss": 0.511, "num_input_tokens_seen": 49300512, "step": 84915 }, { "epoch": 12.648197795650878, "grad_norm": 1.6872446537017822, "learning_rate": 1.7907779690624923e-05, "loss": 0.7147, "num_input_tokens_seen": 49303168, "step": 84920 }, { "epoch": 12.648942508191839, "grad_norm": 2.189800977706909, "learning_rate": 1.7904663827025304e-05, "loss": 0.7787, "num_input_tokens_seen": 49305888, "step": 84925 }, { "epoch": 12.649687220732797, "grad_norm": 1.6742310523986816, "learning_rate": 1.790154808329419e-05, "loss": 0.7569, "num_input_tokens_seen": 49308640, "step": 84930 }, { "epoch": 12.650431933273756, "grad_norm": 0.958443284034729, "learning_rate": 1.78984324594842e-05, "loss": 0.6842, "num_input_tokens_seen": 49311264, "step": 84935 }, { "epoch": 12.651176645814715, "grad_norm": 1.7790207862854004, "learning_rate": 1.7895316955647977e-05, "loss": 0.4742, "num_input_tokens_seen": 49313952, "step": 84940 }, { "epoch": 12.651921358355676, "grad_norm": 1.4077050685882568, "learning_rate": 1.7892201571838147e-05, "loss": 0.4875, "num_input_tokens_seen": 49316928, "step": 84945 }, { "epoch": 12.652666070896634, "grad_norm": 1.4227899312973022, "learning_rate": 1.788908630810736e-05, "loss": 0.5635, "num_input_tokens_seen": 49319840, "step": 84950 }, { "epoch": 12.653410783437593, "grad_norm": 0.7977939248085022, "learning_rate": 1.7885971164508227e-05, "loss": 0.361, "num_input_tokens_seen": 49322976, "step": 84955 }, { "epoch": 12.654155495978552, "grad_norm": 1.1341642141342163, "learning_rate": 1.7882856141093372e-05, "loss": 0.5378, "num_input_tokens_seen": 49326336, "step": 84960 }, { "epoch": 12.65490020851951, "grad_norm": 1.3199105262756348, "learning_rate": 1.7879741237915444e-05, "loss": 0.5478, "num_input_tokens_seen": 49329344, "step": 84965 }, { "epoch": 12.655644921060471, "grad_norm": 1.5979670286178589, "learning_rate": 1.787662645502704e-05, "loss": 0.4186, "num_input_tokens_seen": 49332352, "step": 84970 }, { "epoch": 12.65638963360143, "grad_norm": 2.168724536895752, "learning_rate": 1.78735117924808e-05, "loss": 0.7187, "num_input_tokens_seen": 49335328, "step": 84975 }, { "epoch": 12.657134346142389, "grad_norm": 2.0156521797180176, "learning_rate": 1.7870397250329325e-05, "loss": 0.7338, "num_input_tokens_seen": 49338016, "step": 84980 }, { "epoch": 12.657879058683347, "grad_norm": 0.982389509677887, "learning_rate": 1.7867282828625253e-05, "loss": 0.4863, "num_input_tokens_seen": 49340864, "step": 84985 }, { "epoch": 12.658623771224308, "grad_norm": 1.5715888738632202, "learning_rate": 1.786416852742119e-05, "loss": 0.5432, "num_input_tokens_seen": 49343616, "step": 84990 }, { "epoch": 12.659368483765267, "grad_norm": 1.705829381942749, "learning_rate": 1.786105434676973e-05, "loss": 0.7684, "num_input_tokens_seen": 49346816, "step": 84995 }, { "epoch": 12.660113196306225, "grad_norm": 1.1800858974456787, "learning_rate": 1.785794028672352e-05, "loss": 0.7393, "num_input_tokens_seen": 49349856, "step": 85000 }, { "epoch": 12.660857908847184, "grad_norm": 1.6313525438308716, "learning_rate": 1.785482634733514e-05, "loss": 0.5253, "num_input_tokens_seen": 49352768, "step": 85005 }, { "epoch": 12.661602621388145, "grad_norm": 1.4090421199798584, "learning_rate": 1.785171252865721e-05, "loss": 0.58, "num_input_tokens_seen": 49355648, "step": 85010 }, { "epoch": 12.662347333929103, "grad_norm": 0.9852685928344727, "learning_rate": 1.7848598830742323e-05, "loss": 0.7071, "num_input_tokens_seen": 49358624, "step": 85015 }, { "epoch": 12.663092046470062, "grad_norm": 1.4808619022369385, "learning_rate": 1.78454852536431e-05, "loss": 0.5545, "num_input_tokens_seen": 49361600, "step": 85020 }, { "epoch": 12.663836759011021, "grad_norm": 1.2851163148880005, "learning_rate": 1.784237179741213e-05, "loss": 0.8038, "num_input_tokens_seen": 49364576, "step": 85025 }, { "epoch": 12.664581471551982, "grad_norm": 1.302262306213379, "learning_rate": 1.7839258462102015e-05, "loss": 0.462, "num_input_tokens_seen": 49367584, "step": 85030 }, { "epoch": 12.66532618409294, "grad_norm": 1.6092376708984375, "learning_rate": 1.783614524776535e-05, "loss": 0.5956, "num_input_tokens_seen": 49370592, "step": 85035 }, { "epoch": 12.666070896633899, "grad_norm": 1.1812301874160767, "learning_rate": 1.783303215445473e-05, "loss": 0.6082, "num_input_tokens_seen": 49373344, "step": 85040 }, { "epoch": 12.666815609174858, "grad_norm": 1.4754558801651, "learning_rate": 1.7829919182222752e-05, "loss": 0.6233, "num_input_tokens_seen": 49376416, "step": 85045 }, { "epoch": 12.667560321715818, "grad_norm": 1.3448594808578491, "learning_rate": 1.7826806331121987e-05, "loss": 0.4611, "num_input_tokens_seen": 49379456, "step": 85050 }, { "epoch": 12.668305034256777, "grad_norm": 1.7195762395858765, "learning_rate": 1.7823693601205054e-05, "loss": 0.5229, "num_input_tokens_seen": 49382496, "step": 85055 }, { "epoch": 12.669049746797736, "grad_norm": 1.3029268980026245, "learning_rate": 1.782058099252451e-05, "loss": 0.673, "num_input_tokens_seen": 49385248, "step": 85060 }, { "epoch": 12.669794459338695, "grad_norm": 1.0071226358413696, "learning_rate": 1.7817468505132966e-05, "loss": 0.5122, "num_input_tokens_seen": 49388448, "step": 85065 }, { "epoch": 12.670539171879655, "grad_norm": 2.09122896194458, "learning_rate": 1.7814356139082993e-05, "loss": 0.5598, "num_input_tokens_seen": 49391168, "step": 85070 }, { "epoch": 12.671283884420614, "grad_norm": 1.1087344884872437, "learning_rate": 1.781124389442716e-05, "loss": 0.7122, "num_input_tokens_seen": 49394016, "step": 85075 }, { "epoch": 12.672028596961573, "grad_norm": 1.5073961019515991, "learning_rate": 1.7808131771218065e-05, "loss": 0.4929, "num_input_tokens_seen": 49396704, "step": 85080 }, { "epoch": 12.672773309502531, "grad_norm": 1.087007999420166, "learning_rate": 1.7805019769508262e-05, "loss": 0.6213, "num_input_tokens_seen": 49399616, "step": 85085 }, { "epoch": 12.673518022043492, "grad_norm": 1.9801499843597412, "learning_rate": 1.7801907889350346e-05, "loss": 0.5288, "num_input_tokens_seen": 49402208, "step": 85090 }, { "epoch": 12.67426273458445, "grad_norm": 0.996764600276947, "learning_rate": 1.7798796130796875e-05, "loss": 0.5868, "num_input_tokens_seen": 49405216, "step": 85095 }, { "epoch": 12.67500744712541, "grad_norm": 2.3542966842651367, "learning_rate": 1.779568449390043e-05, "loss": 0.5281, "num_input_tokens_seen": 49408128, "step": 85100 }, { "epoch": 12.675752159666368, "grad_norm": 2.0588464736938477, "learning_rate": 1.7792572978713567e-05, "loss": 0.6271, "num_input_tokens_seen": 49410912, "step": 85105 }, { "epoch": 12.676496872207329, "grad_norm": 1.1807804107666016, "learning_rate": 1.778946158528887e-05, "loss": 0.6411, "num_input_tokens_seen": 49413568, "step": 85110 }, { "epoch": 12.677241584748288, "grad_norm": 1.3254340887069702, "learning_rate": 1.7786350313678885e-05, "loss": 0.5427, "num_input_tokens_seen": 49416352, "step": 85115 }, { "epoch": 12.677986297289246, "grad_norm": 1.026712417602539, "learning_rate": 1.7783239163936172e-05, "loss": 0.5055, "num_input_tokens_seen": 49419200, "step": 85120 }, { "epoch": 12.678731009830205, "grad_norm": 1.4007713794708252, "learning_rate": 1.7780128136113305e-05, "loss": 0.6631, "num_input_tokens_seen": 49422336, "step": 85125 }, { "epoch": 12.679475722371166, "grad_norm": 1.0413098335266113, "learning_rate": 1.777701723026283e-05, "loss": 0.5362, "num_input_tokens_seen": 49425056, "step": 85130 }, { "epoch": 12.680220434912124, "grad_norm": 1.230995774269104, "learning_rate": 1.7773906446437316e-05, "loss": 0.6452, "num_input_tokens_seen": 49428096, "step": 85135 }, { "epoch": 12.680965147453083, "grad_norm": 2.62211012840271, "learning_rate": 1.7770795784689302e-05, "loss": 0.5851, "num_input_tokens_seen": 49431040, "step": 85140 }, { "epoch": 12.681709859994042, "grad_norm": 1.5723838806152344, "learning_rate": 1.7767685245071353e-05, "loss": 0.7526, "num_input_tokens_seen": 49433888, "step": 85145 }, { "epoch": 12.682454572535, "grad_norm": 1.5720295906066895, "learning_rate": 1.776457482763601e-05, "loss": 0.63, "num_input_tokens_seen": 49436864, "step": 85150 }, { "epoch": 12.683199285075961, "grad_norm": 0.9730406403541565, "learning_rate": 1.776146453243581e-05, "loss": 0.5022, "num_input_tokens_seen": 49439968, "step": 85155 }, { "epoch": 12.68394399761692, "grad_norm": 1.8516970872879028, "learning_rate": 1.7758354359523328e-05, "loss": 0.5863, "num_input_tokens_seen": 49442944, "step": 85160 }, { "epoch": 12.684688710157879, "grad_norm": 2.010791063308716, "learning_rate": 1.775524430895107e-05, "loss": 0.6584, "num_input_tokens_seen": 49445888, "step": 85165 }, { "epoch": 12.685433422698837, "grad_norm": 2.1879234313964844, "learning_rate": 1.775213438077161e-05, "loss": 0.6268, "num_input_tokens_seen": 49449216, "step": 85170 }, { "epoch": 12.686178135239798, "grad_norm": 2.0962038040161133, "learning_rate": 1.7749024575037466e-05, "loss": 0.5525, "num_input_tokens_seen": 49452160, "step": 85175 }, { "epoch": 12.686922847780757, "grad_norm": 1.0189476013183594, "learning_rate": 1.774591489180119e-05, "loss": 0.5706, "num_input_tokens_seen": 49455008, "step": 85180 }, { "epoch": 12.687667560321715, "grad_norm": 1.4504365921020508, "learning_rate": 1.77428053311153e-05, "loss": 0.5734, "num_input_tokens_seen": 49457824, "step": 85185 }, { "epoch": 12.688412272862674, "grad_norm": 1.2959775924682617, "learning_rate": 1.7739695893032344e-05, "loss": 0.7039, "num_input_tokens_seen": 49460736, "step": 85190 }, { "epoch": 12.689156985403635, "grad_norm": 0.8441987633705139, "learning_rate": 1.7736586577604846e-05, "loss": 0.7451, "num_input_tokens_seen": 49463744, "step": 85195 }, { "epoch": 12.689901697944594, "grad_norm": 1.816026210784912, "learning_rate": 1.7733477384885333e-05, "loss": 0.5753, "num_input_tokens_seen": 49466528, "step": 85200 }, { "epoch": 12.690646410485552, "grad_norm": 1.1946632862091064, "learning_rate": 1.7730368314926336e-05, "loss": 0.649, "num_input_tokens_seen": 49469632, "step": 85205 }, { "epoch": 12.691391123026511, "grad_norm": 1.1638448238372803, "learning_rate": 1.7727259367780374e-05, "loss": 0.548, "num_input_tokens_seen": 49472608, "step": 85210 }, { "epoch": 12.692135835567472, "grad_norm": 2.5534324645996094, "learning_rate": 1.772415054349998e-05, "loss": 0.5988, "num_input_tokens_seen": 49475712, "step": 85215 }, { "epoch": 12.69288054810843, "grad_norm": 2.422983407974243, "learning_rate": 1.772104184213766e-05, "loss": 0.8241, "num_input_tokens_seen": 49478688, "step": 85220 }, { "epoch": 12.69362526064939, "grad_norm": 2.893153667449951, "learning_rate": 1.7717933263745947e-05, "loss": 0.7425, "num_input_tokens_seen": 49481632, "step": 85225 }, { "epoch": 12.694369973190348, "grad_norm": 1.04704749584198, "learning_rate": 1.7714824808377346e-05, "loss": 0.579, "num_input_tokens_seen": 49484672, "step": 85230 }, { "epoch": 12.695114685731308, "grad_norm": 1.793152928352356, "learning_rate": 1.7711716476084384e-05, "loss": 0.635, "num_input_tokens_seen": 49487680, "step": 85235 }, { "epoch": 12.695859398272267, "grad_norm": 2.066030502319336, "learning_rate": 1.770860826691956e-05, "loss": 0.5818, "num_input_tokens_seen": 49490752, "step": 85240 }, { "epoch": 12.696604110813226, "grad_norm": 1.5695151090621948, "learning_rate": 1.770550018093538e-05, "loss": 0.4273, "num_input_tokens_seen": 49493536, "step": 85245 }, { "epoch": 12.697348823354185, "grad_norm": 1.6678754091262817, "learning_rate": 1.770239221818437e-05, "loss": 0.6654, "num_input_tokens_seen": 49496640, "step": 85250 }, { "epoch": 12.698093535895145, "grad_norm": 1.6503061056137085, "learning_rate": 1.7699284378719017e-05, "loss": 0.4974, "num_input_tokens_seen": 49499616, "step": 85255 }, { "epoch": 12.698838248436104, "grad_norm": 1.7714803218841553, "learning_rate": 1.7696176662591844e-05, "loss": 0.7616, "num_input_tokens_seen": 49502400, "step": 85260 }, { "epoch": 12.699582960977063, "grad_norm": 2.5653140544891357, "learning_rate": 1.7693069069855343e-05, "loss": 0.7271, "num_input_tokens_seen": 49505120, "step": 85265 }, { "epoch": 12.700327673518021, "grad_norm": 1.4412715435028076, "learning_rate": 1.7689961600562014e-05, "loss": 0.6691, "num_input_tokens_seen": 49508032, "step": 85270 }, { "epoch": 12.701072386058982, "grad_norm": 0.9128320813179016, "learning_rate": 1.7686854254764355e-05, "loss": 0.6133, "num_input_tokens_seen": 49510944, "step": 85275 }, { "epoch": 12.70181709859994, "grad_norm": 1.537376046180725, "learning_rate": 1.768374703251485e-05, "loss": 0.5141, "num_input_tokens_seen": 49513632, "step": 85280 }, { "epoch": 12.7025618111409, "grad_norm": 0.9935268759727478, "learning_rate": 1.768063993386601e-05, "loss": 0.5567, "num_input_tokens_seen": 49516320, "step": 85285 }, { "epoch": 12.703306523681858, "grad_norm": 1.8702765703201294, "learning_rate": 1.7677532958870315e-05, "loss": 0.6249, "num_input_tokens_seen": 49519232, "step": 85290 }, { "epoch": 12.704051236222817, "grad_norm": 0.9287283420562744, "learning_rate": 1.7674426107580268e-05, "loss": 0.6041, "num_input_tokens_seen": 49521952, "step": 85295 }, { "epoch": 12.704795948763778, "grad_norm": 1.3700230121612549, "learning_rate": 1.767131938004834e-05, "loss": 0.7212, "num_input_tokens_seen": 49525088, "step": 85300 }, { "epoch": 12.705540661304736, "grad_norm": 1.1886696815490723, "learning_rate": 1.7668212776327025e-05, "loss": 0.6629, "num_input_tokens_seen": 49528032, "step": 85305 }, { "epoch": 12.706285373845695, "grad_norm": 0.9847473502159119, "learning_rate": 1.7665106296468793e-05, "loss": 0.5101, "num_input_tokens_seen": 49530880, "step": 85310 }, { "epoch": 12.707030086386656, "grad_norm": 0.9994857907295227, "learning_rate": 1.766199994052615e-05, "loss": 0.6395, "num_input_tokens_seen": 49533728, "step": 85315 }, { "epoch": 12.707774798927614, "grad_norm": 1.5858409404754639, "learning_rate": 1.7658893708551557e-05, "loss": 0.5868, "num_input_tokens_seen": 49536416, "step": 85320 }, { "epoch": 12.708519511468573, "grad_norm": 2.3124420642852783, "learning_rate": 1.7655787600597486e-05, "loss": 0.7703, "num_input_tokens_seen": 49539488, "step": 85325 }, { "epoch": 12.709264224009532, "grad_norm": 1.7278618812561035, "learning_rate": 1.7652681616716428e-05, "loss": 0.5077, "num_input_tokens_seen": 49542272, "step": 85330 }, { "epoch": 12.71000893655049, "grad_norm": 2.0778496265411377, "learning_rate": 1.764957575696084e-05, "loss": 0.5196, "num_input_tokens_seen": 49545024, "step": 85335 }, { "epoch": 12.710753649091451, "grad_norm": 1.2063050270080566, "learning_rate": 1.7646470021383204e-05, "loss": 0.6001, "num_input_tokens_seen": 49548160, "step": 85340 }, { "epoch": 12.71149836163241, "grad_norm": 1.333180546760559, "learning_rate": 1.7643364410035974e-05, "loss": 0.5904, "num_input_tokens_seen": 49551008, "step": 85345 }, { "epoch": 12.712243074173369, "grad_norm": 2.3053019046783447, "learning_rate": 1.7640258922971636e-05, "loss": 0.6207, "num_input_tokens_seen": 49553600, "step": 85350 }, { "epoch": 12.712987786714327, "grad_norm": 1.4306445121765137, "learning_rate": 1.763715356024264e-05, "loss": 0.6628, "num_input_tokens_seen": 49556832, "step": 85355 }, { "epoch": 12.713732499255288, "grad_norm": 1.4421883821487427, "learning_rate": 1.7634048321901447e-05, "loss": 0.7104, "num_input_tokens_seen": 49560320, "step": 85360 }, { "epoch": 12.714477211796247, "grad_norm": 0.8426980972290039, "learning_rate": 1.7630943208000526e-05, "loss": 0.5468, "num_input_tokens_seen": 49563264, "step": 85365 }, { "epoch": 12.715221924337206, "grad_norm": 1.2352683544158936, "learning_rate": 1.762783821859233e-05, "loss": 0.6567, "num_input_tokens_seen": 49566656, "step": 85370 }, { "epoch": 12.715966636878164, "grad_norm": 1.7243609428405762, "learning_rate": 1.762473335372932e-05, "loss": 0.5235, "num_input_tokens_seen": 49569312, "step": 85375 }, { "epoch": 12.716711349419125, "grad_norm": 2.207054853439331, "learning_rate": 1.7621628613463928e-05, "loss": 0.6792, "num_input_tokens_seen": 49572096, "step": 85380 }, { "epoch": 12.717456061960084, "grad_norm": 2.1038904190063477, "learning_rate": 1.7618523997848634e-05, "loss": 0.7033, "num_input_tokens_seen": 49574976, "step": 85385 }, { "epoch": 12.718200774501042, "grad_norm": 1.584535837173462, "learning_rate": 1.7615419506935866e-05, "loss": 0.7775, "num_input_tokens_seen": 49577632, "step": 85390 }, { "epoch": 12.718945487042001, "grad_norm": 1.581773042678833, "learning_rate": 1.761231514077809e-05, "loss": 0.6307, "num_input_tokens_seen": 49580576, "step": 85395 }, { "epoch": 12.719690199582962, "grad_norm": 1.7511310577392578, "learning_rate": 1.760921089942774e-05, "loss": 0.4557, "num_input_tokens_seen": 49583584, "step": 85400 }, { "epoch": 12.72043491212392, "grad_norm": 3.943552017211914, "learning_rate": 1.7606106782937256e-05, "loss": 0.593, "num_input_tokens_seen": 49586560, "step": 85405 }, { "epoch": 12.72117962466488, "grad_norm": 2.1674697399139404, "learning_rate": 1.760300279135909e-05, "loss": 0.4917, "num_input_tokens_seen": 49589056, "step": 85410 }, { "epoch": 12.721924337205838, "grad_norm": 2.7655975818634033, "learning_rate": 1.759989892474566e-05, "loss": 0.8053, "num_input_tokens_seen": 49592096, "step": 85415 }, { "epoch": 12.722669049746798, "grad_norm": 1.3367979526519775, "learning_rate": 1.7596795183149428e-05, "loss": 0.5956, "num_input_tokens_seen": 49595072, "step": 85420 }, { "epoch": 12.723413762287757, "grad_norm": 2.880643129348755, "learning_rate": 1.759369156662281e-05, "loss": 0.6843, "num_input_tokens_seen": 49597792, "step": 85425 }, { "epoch": 12.724158474828716, "grad_norm": 2.396437168121338, "learning_rate": 1.759058807521825e-05, "loss": 0.7295, "num_input_tokens_seen": 49600832, "step": 85430 }, { "epoch": 12.724903187369675, "grad_norm": 1.9551618099212646, "learning_rate": 1.7587484708988176e-05, "loss": 0.5353, "num_input_tokens_seen": 49603488, "step": 85435 }, { "epoch": 12.725647899910635, "grad_norm": 1.759950041770935, "learning_rate": 1.758438146798501e-05, "loss": 0.6435, "num_input_tokens_seen": 49606720, "step": 85440 }, { "epoch": 12.726392612451594, "grad_norm": 1.9007800817489624, "learning_rate": 1.758127835226119e-05, "loss": 0.7106, "num_input_tokens_seen": 49609664, "step": 85445 }, { "epoch": 12.727137324992553, "grad_norm": 1.217301368713379, "learning_rate": 1.757817536186912e-05, "loss": 0.7654, "num_input_tokens_seen": 49612480, "step": 85450 }, { "epoch": 12.727882037533512, "grad_norm": 1.1736738681793213, "learning_rate": 1.7575072496861243e-05, "loss": 0.5529, "num_input_tokens_seen": 49615488, "step": 85455 }, { "epoch": 12.728626750074472, "grad_norm": 1.457838773727417, "learning_rate": 1.757196975728996e-05, "loss": 0.6418, "num_input_tokens_seen": 49618176, "step": 85460 }, { "epoch": 12.72937146261543, "grad_norm": 1.7816340923309326, "learning_rate": 1.7568867143207708e-05, "loss": 0.5804, "num_input_tokens_seen": 49620864, "step": 85465 }, { "epoch": 12.73011617515639, "grad_norm": 2.2710158824920654, "learning_rate": 1.7565764654666888e-05, "loss": 0.5792, "num_input_tokens_seen": 49623648, "step": 85470 }, { "epoch": 12.730860887697348, "grad_norm": 1.1434992551803589, "learning_rate": 1.756266229171993e-05, "loss": 0.5899, "num_input_tokens_seen": 49626912, "step": 85475 }, { "epoch": 12.731605600238307, "grad_norm": 1.1245390176773071, "learning_rate": 1.7559560054419225e-05, "loss": 0.6266, "num_input_tokens_seen": 49629856, "step": 85480 }, { "epoch": 12.732350312779268, "grad_norm": 1.7418732643127441, "learning_rate": 1.7556457942817184e-05, "loss": 0.6368, "num_input_tokens_seen": 49632768, "step": 85485 }, { "epoch": 12.733095025320226, "grad_norm": 1.03827965259552, "learning_rate": 1.7553355956966227e-05, "loss": 0.4567, "num_input_tokens_seen": 49635680, "step": 85490 }, { "epoch": 12.733839737861185, "grad_norm": 1.1507306098937988, "learning_rate": 1.7550254096918748e-05, "loss": 0.4909, "num_input_tokens_seen": 49638336, "step": 85495 }, { "epoch": 12.734584450402146, "grad_norm": 1.3863190412521362, "learning_rate": 1.754715236272716e-05, "loss": 0.6581, "num_input_tokens_seen": 49641216, "step": 85500 }, { "epoch": 12.735329162943104, "grad_norm": 1.488198161125183, "learning_rate": 1.7544050754443857e-05, "loss": 0.8057, "num_input_tokens_seen": 49644544, "step": 85505 }, { "epoch": 12.736073875484063, "grad_norm": 1.7692512273788452, "learning_rate": 1.7540949272121244e-05, "loss": 0.6183, "num_input_tokens_seen": 49647488, "step": 85510 }, { "epoch": 12.736818588025022, "grad_norm": 1.5507545471191406, "learning_rate": 1.753784791581171e-05, "loss": 0.5298, "num_input_tokens_seen": 49650112, "step": 85515 }, { "epoch": 12.73756330056598, "grad_norm": 1.3749401569366455, "learning_rate": 1.753474668556764e-05, "loss": 0.4832, "num_input_tokens_seen": 49652800, "step": 85520 }, { "epoch": 12.738308013106941, "grad_norm": 1.333747386932373, "learning_rate": 1.7531645581441447e-05, "loss": 0.7742, "num_input_tokens_seen": 49655680, "step": 85525 }, { "epoch": 12.7390527256479, "grad_norm": 1.8334378004074097, "learning_rate": 1.7528544603485507e-05, "loss": 0.6891, "num_input_tokens_seen": 49658496, "step": 85530 }, { "epoch": 12.739797438188859, "grad_norm": 2.7928316593170166, "learning_rate": 1.7525443751752218e-05, "loss": 0.89, "num_input_tokens_seen": 49661472, "step": 85535 }, { "epoch": 12.740542150729818, "grad_norm": 1.8089298009872437, "learning_rate": 1.7522343026293953e-05, "loss": 0.4952, "num_input_tokens_seen": 49664448, "step": 85540 }, { "epoch": 12.741286863270778, "grad_norm": 1.4223464727401733, "learning_rate": 1.751924242716311e-05, "loss": 0.6365, "num_input_tokens_seen": 49667424, "step": 85545 }, { "epoch": 12.742031575811737, "grad_norm": 1.9336493015289307, "learning_rate": 1.751614195441205e-05, "loss": 0.5393, "num_input_tokens_seen": 49670400, "step": 85550 }, { "epoch": 12.742776288352696, "grad_norm": 1.9673399925231934, "learning_rate": 1.7513041608093185e-05, "loss": 0.5994, "num_input_tokens_seen": 49673024, "step": 85555 }, { "epoch": 12.743521000893654, "grad_norm": 2.3019020557403564, "learning_rate": 1.7509941388258865e-05, "loss": 0.7131, "num_input_tokens_seen": 49676320, "step": 85560 }, { "epoch": 12.744265713434615, "grad_norm": 1.4576339721679688, "learning_rate": 1.750684129496147e-05, "loss": 0.6102, "num_input_tokens_seen": 49679360, "step": 85565 }, { "epoch": 12.745010425975574, "grad_norm": 1.2354367971420288, "learning_rate": 1.7503741328253377e-05, "loss": 0.599, "num_input_tokens_seen": 49682176, "step": 85570 }, { "epoch": 12.745755138516532, "grad_norm": 1.5873548984527588, "learning_rate": 1.7500641488186946e-05, "loss": 0.5295, "num_input_tokens_seen": 49684928, "step": 85575 }, { "epoch": 12.746499851057491, "grad_norm": 1.3104921579360962, "learning_rate": 1.7497541774814568e-05, "loss": 0.4628, "num_input_tokens_seen": 49687808, "step": 85580 }, { "epoch": 12.747244563598452, "grad_norm": 1.0252187252044678, "learning_rate": 1.7494442188188588e-05, "loss": 0.3981, "num_input_tokens_seen": 49690752, "step": 85585 }, { "epoch": 12.74798927613941, "grad_norm": 1.8221267461776733, "learning_rate": 1.7491342728361383e-05, "loss": 0.7404, "num_input_tokens_seen": 49693696, "step": 85590 }, { "epoch": 12.74873398868037, "grad_norm": 1.4844913482666016, "learning_rate": 1.7488243395385317e-05, "loss": 0.6622, "num_input_tokens_seen": 49697024, "step": 85595 }, { "epoch": 12.749478701221328, "grad_norm": 1.7228578329086304, "learning_rate": 1.7485144189312735e-05, "loss": 0.6565, "num_input_tokens_seen": 49699712, "step": 85600 }, { "epoch": 12.750223413762289, "grad_norm": 1.2486047744750977, "learning_rate": 1.7482045110196014e-05, "loss": 0.5186, "num_input_tokens_seen": 49702464, "step": 85605 }, { "epoch": 12.750968126303247, "grad_norm": 3.285142421722412, "learning_rate": 1.7478946158087484e-05, "loss": 0.77, "num_input_tokens_seen": 49705184, "step": 85610 }, { "epoch": 12.751712838844206, "grad_norm": 1.480409026145935, "learning_rate": 1.747584733303953e-05, "loss": 0.5695, "num_input_tokens_seen": 49708192, "step": 85615 }, { "epoch": 12.752457551385165, "grad_norm": 1.4913020133972168, "learning_rate": 1.7472748635104475e-05, "loss": 0.5062, "num_input_tokens_seen": 49710912, "step": 85620 }, { "epoch": 12.753202263926125, "grad_norm": 1.8645601272583008, "learning_rate": 1.7469650064334693e-05, "loss": 0.6681, "num_input_tokens_seen": 49713536, "step": 85625 }, { "epoch": 12.753946976467084, "grad_norm": 1.1522340774536133, "learning_rate": 1.746655162078251e-05, "loss": 0.4721, "num_input_tokens_seen": 49716320, "step": 85630 }, { "epoch": 12.754691689008043, "grad_norm": 1.2224336862564087, "learning_rate": 1.7463453304500292e-05, "loss": 0.5616, "num_input_tokens_seen": 49719424, "step": 85635 }, { "epoch": 12.755436401549002, "grad_norm": 2.2935822010040283, "learning_rate": 1.7460355115540366e-05, "loss": 0.652, "num_input_tokens_seen": 49722240, "step": 85640 }, { "epoch": 12.756181114089962, "grad_norm": 2.019148349761963, "learning_rate": 1.7457257053955073e-05, "loss": 0.758, "num_input_tokens_seen": 49724864, "step": 85645 }, { "epoch": 12.756925826630921, "grad_norm": 1.604650616645813, "learning_rate": 1.7454159119796758e-05, "loss": 0.5214, "num_input_tokens_seen": 49727776, "step": 85650 }, { "epoch": 12.75767053917188, "grad_norm": 1.3028210401535034, "learning_rate": 1.745106131311775e-05, "loss": 0.6289, "num_input_tokens_seen": 49730752, "step": 85655 }, { "epoch": 12.758415251712838, "grad_norm": 1.3053946495056152, "learning_rate": 1.74479636339704e-05, "loss": 0.5707, "num_input_tokens_seen": 49733504, "step": 85660 }, { "epoch": 12.759159964253797, "grad_norm": 1.0214213132858276, "learning_rate": 1.7444866082407026e-05, "loss": 0.582, "num_input_tokens_seen": 49736480, "step": 85665 }, { "epoch": 12.759904676794758, "grad_norm": 1.4680031538009644, "learning_rate": 1.744176865847996e-05, "loss": 0.5954, "num_input_tokens_seen": 49739520, "step": 85670 }, { "epoch": 12.760649389335716, "grad_norm": 0.9694309234619141, "learning_rate": 1.743867136224153e-05, "loss": 0.5964, "num_input_tokens_seen": 49742272, "step": 85675 }, { "epoch": 12.761394101876675, "grad_norm": 1.4136006832122803, "learning_rate": 1.743557419374406e-05, "loss": 0.5199, "num_input_tokens_seen": 49745120, "step": 85680 }, { "epoch": 12.762138814417634, "grad_norm": 2.828706741333008, "learning_rate": 1.7432477153039877e-05, "loss": 0.6898, "num_input_tokens_seen": 49748096, "step": 85685 }, { "epoch": 12.762883526958595, "grad_norm": 1.957072138786316, "learning_rate": 1.7429380240181296e-05, "loss": 0.738, "num_input_tokens_seen": 49750880, "step": 85690 }, { "epoch": 12.763628239499553, "grad_norm": 1.2336664199829102, "learning_rate": 1.7426283455220652e-05, "loss": 0.6245, "num_input_tokens_seen": 49753824, "step": 85695 }, { "epoch": 12.764372952040512, "grad_norm": 1.0376790761947632, "learning_rate": 1.7423186798210244e-05, "loss": 0.4725, "num_input_tokens_seen": 49756640, "step": 85700 }, { "epoch": 12.76511766458147, "grad_norm": 1.0031919479370117, "learning_rate": 1.74200902692024e-05, "loss": 0.4995, "num_input_tokens_seen": 49759648, "step": 85705 }, { "epoch": 12.765862377122431, "grad_norm": 0.9110342860221863, "learning_rate": 1.7416993868249422e-05, "loss": 0.4453, "num_input_tokens_seen": 49762592, "step": 85710 }, { "epoch": 12.76660708966339, "grad_norm": 1.32224440574646, "learning_rate": 1.7413897595403627e-05, "loss": 0.6167, "num_input_tokens_seen": 49765600, "step": 85715 }, { "epoch": 12.767351802204349, "grad_norm": 1.3249462842941284, "learning_rate": 1.741080145071733e-05, "loss": 0.6185, "num_input_tokens_seen": 49768352, "step": 85720 }, { "epoch": 12.768096514745308, "grad_norm": 1.3975369930267334, "learning_rate": 1.740770543424281e-05, "loss": 0.4529, "num_input_tokens_seen": 49771520, "step": 85725 }, { "epoch": 12.768841227286268, "grad_norm": 0.9154037833213806, "learning_rate": 1.7404609546032407e-05, "loss": 0.3868, "num_input_tokens_seen": 49774336, "step": 85730 }, { "epoch": 12.769585939827227, "grad_norm": 3.066896915435791, "learning_rate": 1.7401513786138402e-05, "loss": 0.6578, "num_input_tokens_seen": 49777152, "step": 85735 }, { "epoch": 12.770330652368186, "grad_norm": 1.1123348474502563, "learning_rate": 1.73984181546131e-05, "loss": 0.5379, "num_input_tokens_seen": 49779840, "step": 85740 }, { "epoch": 12.771075364909144, "grad_norm": 1.6111253499984741, "learning_rate": 1.7395322651508788e-05, "loss": 0.458, "num_input_tokens_seen": 49782592, "step": 85745 }, { "epoch": 12.771820077450105, "grad_norm": 1.1295464038848877, "learning_rate": 1.739222727687778e-05, "loss": 0.6445, "num_input_tokens_seen": 49785760, "step": 85750 }, { "epoch": 12.772564789991064, "grad_norm": 1.4998506307601929, "learning_rate": 1.7389132030772365e-05, "loss": 0.4091, "num_input_tokens_seen": 49788640, "step": 85755 }, { "epoch": 12.773309502532022, "grad_norm": 1.2697975635528564, "learning_rate": 1.7386036913244812e-05, "loss": 0.5908, "num_input_tokens_seen": 49791392, "step": 85760 }, { "epoch": 12.774054215072981, "grad_norm": 2.2658464908599854, "learning_rate": 1.7382941924347443e-05, "loss": 0.7142, "num_input_tokens_seen": 49794208, "step": 85765 }, { "epoch": 12.774798927613942, "grad_norm": 1.4318127632141113, "learning_rate": 1.737984706413252e-05, "loss": 0.5623, "num_input_tokens_seen": 49797056, "step": 85770 }, { "epoch": 12.7755436401549, "grad_norm": 1.386723518371582, "learning_rate": 1.737675233265234e-05, "loss": 0.8121, "num_input_tokens_seen": 49799776, "step": 85775 }, { "epoch": 12.77628835269586, "grad_norm": 1.33384108543396, "learning_rate": 1.7373657729959176e-05, "loss": 0.6063, "num_input_tokens_seen": 49802688, "step": 85780 }, { "epoch": 12.777033065236818, "grad_norm": 0.6870362162590027, "learning_rate": 1.7370563256105322e-05, "loss": 0.6172, "num_input_tokens_seen": 49805696, "step": 85785 }, { "epoch": 12.777777777777779, "grad_norm": 2.136997699737549, "learning_rate": 1.7367468911143036e-05, "loss": 0.6095, "num_input_tokens_seen": 49808704, "step": 85790 }, { "epoch": 12.778522490318737, "grad_norm": 1.6147305965423584, "learning_rate": 1.7364374695124613e-05, "loss": 0.6093, "num_input_tokens_seen": 49811712, "step": 85795 }, { "epoch": 12.779267202859696, "grad_norm": 0.5756670236587524, "learning_rate": 1.7361280608102325e-05, "loss": 0.516, "num_input_tokens_seen": 49814688, "step": 85800 }, { "epoch": 12.780011915400655, "grad_norm": 2.7914247512817383, "learning_rate": 1.7358186650128427e-05, "loss": 0.655, "num_input_tokens_seen": 49817472, "step": 85805 }, { "epoch": 12.780756627941614, "grad_norm": 0.8961474299430847, "learning_rate": 1.7355092821255208e-05, "loss": 0.4799, "num_input_tokens_seen": 49820384, "step": 85810 }, { "epoch": 12.781501340482574, "grad_norm": 2.945312738418579, "learning_rate": 1.7351999121534913e-05, "loss": 0.6593, "num_input_tokens_seen": 49823456, "step": 85815 }, { "epoch": 12.782246053023533, "grad_norm": 2.2939035892486572, "learning_rate": 1.734890555101983e-05, "loss": 0.4049, "num_input_tokens_seen": 49826272, "step": 85820 }, { "epoch": 12.782990765564492, "grad_norm": 1.3135532140731812, "learning_rate": 1.7345812109762204e-05, "loss": 0.6285, "num_input_tokens_seen": 49829216, "step": 85825 }, { "epoch": 12.783735478105452, "grad_norm": 1.0690085887908936, "learning_rate": 1.734271879781431e-05, "loss": 0.4402, "num_input_tokens_seen": 49832128, "step": 85830 }, { "epoch": 12.784480190646411, "grad_norm": 2.8392293453216553, "learning_rate": 1.7339625615228396e-05, "loss": 0.7487, "num_input_tokens_seen": 49835136, "step": 85835 }, { "epoch": 12.78522490318737, "grad_norm": 0.8395838737487793, "learning_rate": 1.733653256205673e-05, "loss": 0.5374, "num_input_tokens_seen": 49837984, "step": 85840 }, { "epoch": 12.785969615728328, "grad_norm": 3.622182607650757, "learning_rate": 1.733343963835155e-05, "loss": 0.3553, "num_input_tokens_seen": 49840896, "step": 85845 }, { "epoch": 12.786714328269287, "grad_norm": 3.3859825134277344, "learning_rate": 1.733034684416511e-05, "loss": 0.5805, "num_input_tokens_seen": 49843872, "step": 85850 }, { "epoch": 12.787459040810248, "grad_norm": 0.9453681111335754, "learning_rate": 1.7327254179549674e-05, "loss": 0.5209, "num_input_tokens_seen": 49846944, "step": 85855 }, { "epoch": 12.788203753351207, "grad_norm": 1.6270034313201904, "learning_rate": 1.7324161644557472e-05, "loss": 0.6257, "num_input_tokens_seen": 49849760, "step": 85860 }, { "epoch": 12.788948465892165, "grad_norm": 1.1343128681182861, "learning_rate": 1.7321069239240767e-05, "loss": 0.6469, "num_input_tokens_seen": 49852800, "step": 85865 }, { "epoch": 12.789693178433124, "grad_norm": 1.8529103994369507, "learning_rate": 1.7317976963651783e-05, "loss": 0.6012, "num_input_tokens_seen": 49855712, "step": 85870 }, { "epoch": 12.790437890974085, "grad_norm": 1.0812337398529053, "learning_rate": 1.731488481784278e-05, "loss": 0.5172, "num_input_tokens_seen": 49858560, "step": 85875 }, { "epoch": 12.791182603515043, "grad_norm": 2.3018434047698975, "learning_rate": 1.731179280186599e-05, "loss": 0.5969, "num_input_tokens_seen": 49861344, "step": 85880 }, { "epoch": 12.791927316056002, "grad_norm": 1.6328109502792358, "learning_rate": 1.730870091577363e-05, "loss": 0.6423, "num_input_tokens_seen": 49864032, "step": 85885 }, { "epoch": 12.79267202859696, "grad_norm": 1.2696073055267334, "learning_rate": 1.7305609159617965e-05, "loss": 0.6079, "num_input_tokens_seen": 49866912, "step": 85890 }, { "epoch": 12.793416741137921, "grad_norm": 0.8148767352104187, "learning_rate": 1.730251753345121e-05, "loss": 0.3817, "num_input_tokens_seen": 49869504, "step": 85895 }, { "epoch": 12.79416145367888, "grad_norm": 2.3376219272613525, "learning_rate": 1.72994260373256e-05, "loss": 0.5386, "num_input_tokens_seen": 49872160, "step": 85900 }, { "epoch": 12.794906166219839, "grad_norm": 1.1434022188186646, "learning_rate": 1.729633467129335e-05, "loss": 0.7667, "num_input_tokens_seen": 49875264, "step": 85905 }, { "epoch": 12.795650878760798, "grad_norm": 2.452549695968628, "learning_rate": 1.7293243435406705e-05, "loss": 0.776, "num_input_tokens_seen": 49877920, "step": 85910 }, { "epoch": 12.796395591301758, "grad_norm": 1.842482089996338, "learning_rate": 1.7290152329717873e-05, "loss": 0.6645, "num_input_tokens_seen": 49880672, "step": 85915 }, { "epoch": 12.797140303842717, "grad_norm": 1.5285874605178833, "learning_rate": 1.7287061354279088e-05, "loss": 0.6036, "num_input_tokens_seen": 49883424, "step": 85920 }, { "epoch": 12.797885016383676, "grad_norm": 1.5496796369552612, "learning_rate": 1.7283970509142567e-05, "loss": 0.7545, "num_input_tokens_seen": 49886368, "step": 85925 }, { "epoch": 12.798629728924634, "grad_norm": 1.3085720539093018, "learning_rate": 1.728087979436051e-05, "loss": 0.5424, "num_input_tokens_seen": 49889280, "step": 85930 }, { "epoch": 12.799374441465595, "grad_norm": 2.1239845752716064, "learning_rate": 1.7277789209985155e-05, "loss": 0.6749, "num_input_tokens_seen": 49892352, "step": 85935 }, { "epoch": 12.800119154006554, "grad_norm": 1.559453010559082, "learning_rate": 1.727469875606869e-05, "loss": 0.3915, "num_input_tokens_seen": 49895040, "step": 85940 }, { "epoch": 12.800863866547513, "grad_norm": 2.4932758808135986, "learning_rate": 1.727160843266335e-05, "loss": 0.5447, "num_input_tokens_seen": 49897600, "step": 85945 }, { "epoch": 12.801608579088471, "grad_norm": 1.5479519367218018, "learning_rate": 1.7268518239821318e-05, "loss": 0.8181, "num_input_tokens_seen": 49900832, "step": 85950 }, { "epoch": 12.802353291629432, "grad_norm": 1.3789548873901367, "learning_rate": 1.7265428177594822e-05, "loss": 0.7829, "num_input_tokens_seen": 49903840, "step": 85955 }, { "epoch": 12.80309800417039, "grad_norm": 1.5954370498657227, "learning_rate": 1.7262338246036057e-05, "loss": 0.778, "num_input_tokens_seen": 49906848, "step": 85960 }, { "epoch": 12.80384271671135, "grad_norm": 2.3624141216278076, "learning_rate": 1.7259248445197217e-05, "loss": 0.7105, "num_input_tokens_seen": 49909536, "step": 85965 }, { "epoch": 12.804587429252308, "grad_norm": 2.194075584411621, "learning_rate": 1.7256158775130517e-05, "loss": 0.6284, "num_input_tokens_seen": 49912384, "step": 85970 }, { "epoch": 12.805332141793269, "grad_norm": 1.3260940313339233, "learning_rate": 1.725306923588813e-05, "loss": 0.5406, "num_input_tokens_seen": 49915616, "step": 85975 }, { "epoch": 12.806076854334227, "grad_norm": 1.075378656387329, "learning_rate": 1.7249979827522274e-05, "loss": 0.5336, "num_input_tokens_seen": 49918624, "step": 85980 }, { "epoch": 12.806821566875186, "grad_norm": 1.1227176189422607, "learning_rate": 1.7246890550085122e-05, "loss": 0.4958, "num_input_tokens_seen": 49921984, "step": 85985 }, { "epoch": 12.807566279416145, "grad_norm": 1.1762489080429077, "learning_rate": 1.7243801403628887e-05, "loss": 0.5669, "num_input_tokens_seen": 49924576, "step": 85990 }, { "epoch": 12.808310991957104, "grad_norm": 1.2001088857650757, "learning_rate": 1.724071238820574e-05, "loss": 0.5763, "num_input_tokens_seen": 49927296, "step": 85995 }, { "epoch": 12.809055704498064, "grad_norm": 1.6297783851623535, "learning_rate": 1.723762350386787e-05, "loss": 0.629, "num_input_tokens_seen": 49930304, "step": 86000 }, { "epoch": 12.809800417039023, "grad_norm": 1.0914087295532227, "learning_rate": 1.7234534750667468e-05, "loss": 0.5992, "num_input_tokens_seen": 49933376, "step": 86005 }, { "epoch": 12.810545129579982, "grad_norm": 0.8311572670936584, "learning_rate": 1.7231446128656693e-05, "loss": 0.4887, "num_input_tokens_seen": 49936000, "step": 86010 }, { "epoch": 12.811289842120942, "grad_norm": 0.7747229337692261, "learning_rate": 1.7228357637887755e-05, "loss": 0.5396, "num_input_tokens_seen": 49938976, "step": 86015 }, { "epoch": 12.812034554661901, "grad_norm": 1.7654197216033936, "learning_rate": 1.7225269278412802e-05, "loss": 0.4257, "num_input_tokens_seen": 49941888, "step": 86020 }, { "epoch": 12.81277926720286, "grad_norm": 1.2960591316223145, "learning_rate": 1.7222181050284037e-05, "loss": 0.5957, "num_input_tokens_seen": 49944544, "step": 86025 }, { "epoch": 12.813523979743819, "grad_norm": 1.517537236213684, "learning_rate": 1.7219092953553612e-05, "loss": 0.4364, "num_input_tokens_seen": 49947200, "step": 86030 }, { "epoch": 12.814268692284777, "grad_norm": 1.2244418859481812, "learning_rate": 1.7216004988273706e-05, "loss": 0.556, "num_input_tokens_seen": 49950112, "step": 86035 }, { "epoch": 12.815013404825738, "grad_norm": 0.8963667750358582, "learning_rate": 1.7212917154496488e-05, "loss": 0.4639, "num_input_tokens_seen": 49952832, "step": 86040 }, { "epoch": 12.815758117366697, "grad_norm": 2.0716745853424072, "learning_rate": 1.7209829452274108e-05, "loss": 0.5567, "num_input_tokens_seen": 49955712, "step": 86045 }, { "epoch": 12.816502829907655, "grad_norm": 1.9507067203521729, "learning_rate": 1.720674188165875e-05, "loss": 0.7796, "num_input_tokens_seen": 49958720, "step": 86050 }, { "epoch": 12.817247542448614, "grad_norm": 1.38694429397583, "learning_rate": 1.720365444270256e-05, "loss": 0.5322, "num_input_tokens_seen": 49961760, "step": 86055 }, { "epoch": 12.817992254989575, "grad_norm": 1.0918903350830078, "learning_rate": 1.720056713545771e-05, "loss": 0.4975, "num_input_tokens_seen": 49964576, "step": 86060 }, { "epoch": 12.818736967530533, "grad_norm": 1.9822361469268799, "learning_rate": 1.7197479959976353e-05, "loss": 0.7501, "num_input_tokens_seen": 49967584, "step": 86065 }, { "epoch": 12.819481680071492, "grad_norm": 1.2487937211990356, "learning_rate": 1.719439291631064e-05, "loss": 0.8505, "num_input_tokens_seen": 49970624, "step": 86070 }, { "epoch": 12.820226392612451, "grad_norm": 1.7949684858322144, "learning_rate": 1.7191306004512723e-05, "loss": 0.6589, "num_input_tokens_seen": 49973312, "step": 86075 }, { "epoch": 12.820971105153411, "grad_norm": 3.145195722579956, "learning_rate": 1.7188219224634762e-05, "loss": 0.5073, "num_input_tokens_seen": 49976448, "step": 86080 }, { "epoch": 12.82171581769437, "grad_norm": 1.2719781398773193, "learning_rate": 1.7185132576728898e-05, "loss": 0.5622, "num_input_tokens_seen": 49979488, "step": 86085 }, { "epoch": 12.822460530235329, "grad_norm": 1.4047468900680542, "learning_rate": 1.718204606084726e-05, "loss": 0.5611, "num_input_tokens_seen": 49982240, "step": 86090 }, { "epoch": 12.823205242776288, "grad_norm": 1.1768125295639038, "learning_rate": 1.717895967704202e-05, "loss": 0.5072, "num_input_tokens_seen": 49985088, "step": 86095 }, { "epoch": 12.823949955317248, "grad_norm": 1.6785849332809448, "learning_rate": 1.7175873425365308e-05, "loss": 0.5807, "num_input_tokens_seen": 49988192, "step": 86100 }, { "epoch": 12.824694667858207, "grad_norm": 1.3332698345184326, "learning_rate": 1.7172787305869266e-05, "loss": 0.6616, "num_input_tokens_seen": 49991392, "step": 86105 }, { "epoch": 12.825439380399166, "grad_norm": 1.6742031574249268, "learning_rate": 1.7169701318606014e-05, "loss": 0.7463, "num_input_tokens_seen": 49994496, "step": 86110 }, { "epoch": 12.826184092940125, "grad_norm": 1.293636679649353, "learning_rate": 1.7166615463627712e-05, "loss": 0.7061, "num_input_tokens_seen": 49997408, "step": 86115 }, { "epoch": 12.826928805481085, "grad_norm": 2.0506293773651123, "learning_rate": 1.716352974098648e-05, "loss": 0.5718, "num_input_tokens_seen": 50000288, "step": 86120 }, { "epoch": 12.827673518022044, "grad_norm": 1.3299381732940674, "learning_rate": 1.716044415073444e-05, "loss": 0.7058, "num_input_tokens_seen": 50003264, "step": 86125 }, { "epoch": 12.828418230563003, "grad_norm": 0.9281049370765686, "learning_rate": 1.7157358692923737e-05, "loss": 0.4317, "num_input_tokens_seen": 50005920, "step": 86130 }, { "epoch": 12.829162943103961, "grad_norm": 1.0250608921051025, "learning_rate": 1.7154273367606484e-05, "loss": 0.475, "num_input_tokens_seen": 50008800, "step": 86135 }, { "epoch": 12.829907655644922, "grad_norm": 1.0652705430984497, "learning_rate": 1.715118817483481e-05, "loss": 0.4051, "num_input_tokens_seen": 50012352, "step": 86140 }, { "epoch": 12.83065236818588, "grad_norm": 1.5253307819366455, "learning_rate": 1.7148103114660825e-05, "loss": 0.6001, "num_input_tokens_seen": 50015424, "step": 86145 }, { "epoch": 12.83139708072684, "grad_norm": 1.2023708820343018, "learning_rate": 1.7145018187136668e-05, "loss": 0.529, "num_input_tokens_seen": 50018176, "step": 86150 }, { "epoch": 12.832141793267798, "grad_norm": 2.94973087310791, "learning_rate": 1.7141933392314436e-05, "loss": 0.6823, "num_input_tokens_seen": 50021024, "step": 86155 }, { "epoch": 12.832886505808759, "grad_norm": 3.27888560295105, "learning_rate": 1.7138848730246264e-05, "loss": 0.6767, "num_input_tokens_seen": 50023968, "step": 86160 }, { "epoch": 12.833631218349717, "grad_norm": 1.3521244525909424, "learning_rate": 1.7135764200984253e-05, "loss": 0.4356, "num_input_tokens_seen": 50027104, "step": 86165 }, { "epoch": 12.834375930890676, "grad_norm": 1.9403280019760132, "learning_rate": 1.7132679804580505e-05, "loss": 0.5623, "num_input_tokens_seen": 50030016, "step": 86170 }, { "epoch": 12.835120643431635, "grad_norm": 3.1225228309631348, "learning_rate": 1.7129595541087146e-05, "loss": 0.5457, "num_input_tokens_seen": 50032800, "step": 86175 }, { "epoch": 12.835865355972594, "grad_norm": 2.1368255615234375, "learning_rate": 1.7126511410556256e-05, "loss": 0.5647, "num_input_tokens_seen": 50035872, "step": 86180 }, { "epoch": 12.836610068513554, "grad_norm": 1.3594313859939575, "learning_rate": 1.7123427413039967e-05, "loss": 0.5213, "num_input_tokens_seen": 50038592, "step": 86185 }, { "epoch": 12.837354781054513, "grad_norm": 0.9382748603820801, "learning_rate": 1.712034354859036e-05, "loss": 0.6089, "num_input_tokens_seen": 50041472, "step": 86190 }, { "epoch": 12.838099493595472, "grad_norm": 1.8845242261886597, "learning_rate": 1.711725981725954e-05, "loss": 0.626, "num_input_tokens_seen": 50044512, "step": 86195 }, { "epoch": 12.83884420613643, "grad_norm": 1.823808193206787, "learning_rate": 1.7114176219099607e-05, "loss": 0.7213, "num_input_tokens_seen": 50047136, "step": 86200 }, { "epoch": 12.839588918677391, "grad_norm": 2.3972132205963135, "learning_rate": 1.711109275416265e-05, "loss": 0.7349, "num_input_tokens_seen": 50050272, "step": 86205 }, { "epoch": 12.84033363121835, "grad_norm": 3.366081953048706, "learning_rate": 1.7108009422500767e-05, "loss": 0.7735, "num_input_tokens_seen": 50053088, "step": 86210 }, { "epoch": 12.841078343759309, "grad_norm": 0.8301929831504822, "learning_rate": 1.7104926224166033e-05, "loss": 0.5637, "num_input_tokens_seen": 50056000, "step": 86215 }, { "epoch": 12.841823056300267, "grad_norm": 1.5853910446166992, "learning_rate": 1.7101843159210556e-05, "loss": 0.5286, "num_input_tokens_seen": 50058880, "step": 86220 }, { "epoch": 12.842567768841228, "grad_norm": 1.929227590560913, "learning_rate": 1.709876022768641e-05, "loss": 0.7055, "num_input_tokens_seen": 50061920, "step": 86225 }, { "epoch": 12.843312481382187, "grad_norm": 4.590453624725342, "learning_rate": 1.7095677429645682e-05, "loss": 0.5963, "num_input_tokens_seen": 50065120, "step": 86230 }, { "epoch": 12.844057193923145, "grad_norm": 0.7884760499000549, "learning_rate": 1.709259476514044e-05, "loss": 0.3819, "num_input_tokens_seen": 50067936, "step": 86235 }, { "epoch": 12.844801906464104, "grad_norm": 2.315802812576294, "learning_rate": 1.7089512234222783e-05, "loss": 0.6538, "num_input_tokens_seen": 50070784, "step": 86240 }, { "epoch": 12.845546619005065, "grad_norm": 1.6116777658462524, "learning_rate": 1.7086429836944777e-05, "loss": 0.5773, "num_input_tokens_seen": 50073504, "step": 86245 }, { "epoch": 12.846291331546023, "grad_norm": 1.2242109775543213, "learning_rate": 1.7083347573358484e-05, "loss": 0.4602, "num_input_tokens_seen": 50076608, "step": 86250 }, { "epoch": 12.847036044086982, "grad_norm": 1.0822089910507202, "learning_rate": 1.7080265443516e-05, "loss": 0.5988, "num_input_tokens_seen": 50079648, "step": 86255 }, { "epoch": 12.847780756627941, "grad_norm": 3.1273014545440674, "learning_rate": 1.7077183447469376e-05, "loss": 0.5271, "num_input_tokens_seen": 50082336, "step": 86260 }, { "epoch": 12.848525469168901, "grad_norm": 1.7948267459869385, "learning_rate": 1.7074101585270692e-05, "loss": 0.4637, "num_input_tokens_seen": 50085248, "step": 86265 }, { "epoch": 12.84927018170986, "grad_norm": 1.5857712030410767, "learning_rate": 1.7071019856971993e-05, "loss": 0.5532, "num_input_tokens_seen": 50087968, "step": 86270 }, { "epoch": 12.850014894250819, "grad_norm": 1.4953162670135498, "learning_rate": 1.7067938262625364e-05, "loss": 0.592, "num_input_tokens_seen": 50090656, "step": 86275 }, { "epoch": 12.850759606791778, "grad_norm": 2.1768481731414795, "learning_rate": 1.7064856802282865e-05, "loss": 0.5764, "num_input_tokens_seen": 50093728, "step": 86280 }, { "epoch": 12.851504319332738, "grad_norm": 0.9439085125923157, "learning_rate": 1.706177547599653e-05, "loss": 0.5542, "num_input_tokens_seen": 50096384, "step": 86285 }, { "epoch": 12.852249031873697, "grad_norm": 3.9910011291503906, "learning_rate": 1.7058694283818437e-05, "loss": 0.779, "num_input_tokens_seen": 50099040, "step": 86290 }, { "epoch": 12.852993744414656, "grad_norm": 0.9539193511009216, "learning_rate": 1.705561322580063e-05, "loss": 0.5022, "num_input_tokens_seen": 50102208, "step": 86295 }, { "epoch": 12.853738456955615, "grad_norm": 0.5666090250015259, "learning_rate": 1.7052532301995168e-05, "loss": 0.4917, "num_input_tokens_seen": 50104960, "step": 86300 }, { "epoch": 12.854483169496575, "grad_norm": 0.5719560384750366, "learning_rate": 1.7049451512454085e-05, "loss": 0.458, "num_input_tokens_seen": 50108160, "step": 86305 }, { "epoch": 12.855227882037534, "grad_norm": 2.1818997859954834, "learning_rate": 1.704637085722945e-05, "loss": 0.674, "num_input_tokens_seen": 50111072, "step": 86310 }, { "epoch": 12.855972594578493, "grad_norm": 1.7918773889541626, "learning_rate": 1.7043290336373286e-05, "loss": 0.6341, "num_input_tokens_seen": 50113728, "step": 86315 }, { "epoch": 12.856717307119451, "grad_norm": 1.0851943492889404, "learning_rate": 1.7040209949937653e-05, "loss": 0.5287, "num_input_tokens_seen": 50116544, "step": 86320 }, { "epoch": 12.857462019660412, "grad_norm": 1.446354627609253, "learning_rate": 1.7037129697974585e-05, "loss": 0.4687, "num_input_tokens_seen": 50119328, "step": 86325 }, { "epoch": 12.85820673220137, "grad_norm": 1.1493103504180908, "learning_rate": 1.703404958053611e-05, "loss": 0.5825, "num_input_tokens_seen": 50122016, "step": 86330 }, { "epoch": 12.85895144474233, "grad_norm": 1.4904112815856934, "learning_rate": 1.703096959767428e-05, "loss": 0.5723, "num_input_tokens_seen": 50124800, "step": 86335 }, { "epoch": 12.859696157283288, "grad_norm": 1.6270192861557007, "learning_rate": 1.7027889749441108e-05, "loss": 0.3439, "num_input_tokens_seen": 50127680, "step": 86340 }, { "epoch": 12.860440869824249, "grad_norm": 1.00432550907135, "learning_rate": 1.702481003588864e-05, "loss": 0.5237, "num_input_tokens_seen": 50131040, "step": 86345 }, { "epoch": 12.861185582365207, "grad_norm": 1.0498883724212646, "learning_rate": 1.7021730457068898e-05, "loss": 0.7453, "num_input_tokens_seen": 50134112, "step": 86350 }, { "epoch": 12.861930294906166, "grad_norm": 2.281965494155884, "learning_rate": 1.701865101303392e-05, "loss": 0.5921, "num_input_tokens_seen": 50137216, "step": 86355 }, { "epoch": 12.862675007447125, "grad_norm": 2.709581136703491, "learning_rate": 1.701557170383572e-05, "loss": 0.627, "num_input_tokens_seen": 50139872, "step": 86360 }, { "epoch": 12.863419719988084, "grad_norm": 3.171994924545288, "learning_rate": 1.7012492529526315e-05, "loss": 0.6811, "num_input_tokens_seen": 50142592, "step": 86365 }, { "epoch": 12.864164432529044, "grad_norm": 1.2205619812011719, "learning_rate": 1.700941349015774e-05, "loss": 0.5555, "num_input_tokens_seen": 50145504, "step": 86370 }, { "epoch": 12.864909145070003, "grad_norm": 0.6262035369873047, "learning_rate": 1.7006334585781986e-05, "loss": 0.5546, "num_input_tokens_seen": 50148384, "step": 86375 }, { "epoch": 12.865653857610962, "grad_norm": 1.8037341833114624, "learning_rate": 1.7003255816451098e-05, "loss": 0.6347, "num_input_tokens_seen": 50151072, "step": 86380 }, { "epoch": 12.86639857015192, "grad_norm": 2.109647750854492, "learning_rate": 1.7000177182217066e-05, "loss": 0.6355, "num_input_tokens_seen": 50154304, "step": 86385 }, { "epoch": 12.867143282692881, "grad_norm": 1.5671353340148926, "learning_rate": 1.6997098683131918e-05, "loss": 0.5354, "num_input_tokens_seen": 50157376, "step": 86390 }, { "epoch": 12.86788799523384, "grad_norm": 2.5596749782562256, "learning_rate": 1.699402031924765e-05, "loss": 0.5983, "num_input_tokens_seen": 50160128, "step": 86395 }, { "epoch": 12.868632707774799, "grad_norm": 0.8825986981391907, "learning_rate": 1.699094209061628e-05, "loss": 0.6806, "num_input_tokens_seen": 50162944, "step": 86400 }, { "epoch": 12.869377420315757, "grad_norm": 2.8057992458343506, "learning_rate": 1.69878639972898e-05, "loss": 0.884, "num_input_tokens_seen": 50165824, "step": 86405 }, { "epoch": 12.870122132856718, "grad_norm": 1.0801371335983276, "learning_rate": 1.6984786039320207e-05, "loss": 0.6882, "num_input_tokens_seen": 50168640, "step": 86410 }, { "epoch": 12.870866845397677, "grad_norm": 1.4237914085388184, "learning_rate": 1.6981708216759515e-05, "loss": 0.4933, "num_input_tokens_seen": 50171392, "step": 86415 }, { "epoch": 12.871611557938635, "grad_norm": 0.5951772332191467, "learning_rate": 1.6978630529659706e-05, "loss": 0.761, "num_input_tokens_seen": 50174400, "step": 86420 }, { "epoch": 12.872356270479594, "grad_norm": 1.013533353805542, "learning_rate": 1.697555297807279e-05, "loss": 0.7208, "num_input_tokens_seen": 50177056, "step": 86425 }, { "epoch": 12.873100983020555, "grad_norm": 1.4414446353912354, "learning_rate": 1.6972475562050744e-05, "loss": 0.4454, "num_input_tokens_seen": 50179872, "step": 86430 }, { "epoch": 12.873845695561513, "grad_norm": 1.6139062643051147, "learning_rate": 1.6969398281645572e-05, "loss": 0.6552, "num_input_tokens_seen": 50182464, "step": 86435 }, { "epoch": 12.874590408102472, "grad_norm": 1.7799667119979858, "learning_rate": 1.696632113690924e-05, "loss": 0.5261, "num_input_tokens_seen": 50185280, "step": 86440 }, { "epoch": 12.875335120643431, "grad_norm": 1.3440167903900146, "learning_rate": 1.6963244127893763e-05, "loss": 0.568, "num_input_tokens_seen": 50188224, "step": 86445 }, { "epoch": 12.876079833184392, "grad_norm": 2.619842529296875, "learning_rate": 1.6960167254651105e-05, "loss": 0.5792, "num_input_tokens_seen": 50191104, "step": 86450 }, { "epoch": 12.87682454572535, "grad_norm": 3.33105731010437, "learning_rate": 1.6957090517233242e-05, "loss": 0.5094, "num_input_tokens_seen": 50193856, "step": 86455 }, { "epoch": 12.877569258266309, "grad_norm": 1.3417236804962158, "learning_rate": 1.6954013915692167e-05, "loss": 0.628, "num_input_tokens_seen": 50196704, "step": 86460 }, { "epoch": 12.878313970807268, "grad_norm": 1.217987060546875, "learning_rate": 1.695093745007985e-05, "loss": 0.487, "num_input_tokens_seen": 50199584, "step": 86465 }, { "epoch": 12.879058683348228, "grad_norm": 1.1637307405471802, "learning_rate": 1.6947861120448262e-05, "loss": 0.5335, "num_input_tokens_seen": 50202464, "step": 86470 }, { "epoch": 12.879803395889187, "grad_norm": 1.0909485816955566, "learning_rate": 1.694478492684937e-05, "loss": 0.4549, "num_input_tokens_seen": 50205184, "step": 86475 }, { "epoch": 12.880548108430146, "grad_norm": 1.1125247478485107, "learning_rate": 1.694170886933516e-05, "loss": 0.6407, "num_input_tokens_seen": 50208000, "step": 86480 }, { "epoch": 12.881292820971105, "grad_norm": 2.562770366668701, "learning_rate": 1.693863294795759e-05, "loss": 0.5684, "num_input_tokens_seen": 50210752, "step": 86485 }, { "epoch": 12.882037533512065, "grad_norm": 2.022091865539551, "learning_rate": 1.6935557162768612e-05, "loss": 0.5741, "num_input_tokens_seen": 50213792, "step": 86490 }, { "epoch": 12.882782246053024, "grad_norm": 1.5638753175735474, "learning_rate": 1.693248151382021e-05, "loss": 0.6879, "num_input_tokens_seen": 50216640, "step": 86495 }, { "epoch": 12.883526958593983, "grad_norm": 1.941405177116394, "learning_rate": 1.6929406001164325e-05, "loss": 0.788, "num_input_tokens_seen": 50219520, "step": 86500 }, { "epoch": 12.884271671134941, "grad_norm": 2.5347177982330322, "learning_rate": 1.6926330624852932e-05, "loss": 0.7394, "num_input_tokens_seen": 50222272, "step": 86505 }, { "epoch": 12.8850163836759, "grad_norm": 1.5972598791122437, "learning_rate": 1.6923255384937963e-05, "loss": 0.7484, "num_input_tokens_seen": 50225152, "step": 86510 }, { "epoch": 12.88576109621686, "grad_norm": 1.1744498014450073, "learning_rate": 1.69201802814714e-05, "loss": 0.5104, "num_input_tokens_seen": 50228160, "step": 86515 }, { "epoch": 12.88650580875782, "grad_norm": 1.7680073976516724, "learning_rate": 1.691710531450517e-05, "loss": 0.7502, "num_input_tokens_seen": 50231168, "step": 86520 }, { "epoch": 12.887250521298778, "grad_norm": 2.3628296852111816, "learning_rate": 1.6914030484091235e-05, "loss": 0.5465, "num_input_tokens_seen": 50233760, "step": 86525 }, { "epoch": 12.887995233839739, "grad_norm": 1.5785168409347534, "learning_rate": 1.6910955790281538e-05, "loss": 0.5867, "num_input_tokens_seen": 50236544, "step": 86530 }, { "epoch": 12.888739946380698, "grad_norm": 1.9089316129684448, "learning_rate": 1.690788123312802e-05, "loss": 0.9073, "num_input_tokens_seen": 50239328, "step": 86535 }, { "epoch": 12.889484658921656, "grad_norm": 1.690702199935913, "learning_rate": 1.6904806812682628e-05, "loss": 0.5606, "num_input_tokens_seen": 50242208, "step": 86540 }, { "epoch": 12.890229371462615, "grad_norm": 1.298917293548584, "learning_rate": 1.6901732528997282e-05, "loss": 0.733, "num_input_tokens_seen": 50245248, "step": 86545 }, { "epoch": 12.890974084003574, "grad_norm": 1.1552627086639404, "learning_rate": 1.689865838212395e-05, "loss": 0.5853, "num_input_tokens_seen": 50248480, "step": 86550 }, { "epoch": 12.891718796544534, "grad_norm": 0.9360138177871704, "learning_rate": 1.6895584372114544e-05, "loss": 0.448, "num_input_tokens_seen": 50251072, "step": 86555 }, { "epoch": 12.892463509085493, "grad_norm": 0.8880189657211304, "learning_rate": 1.689251049902101e-05, "loss": 0.5932, "num_input_tokens_seen": 50254208, "step": 86560 }, { "epoch": 12.893208221626452, "grad_norm": 1.7940512895584106, "learning_rate": 1.6889436762895267e-05, "loss": 0.5962, "num_input_tokens_seen": 50257280, "step": 86565 }, { "epoch": 12.89395293416741, "grad_norm": 1.1524254083633423, "learning_rate": 1.6886363163789243e-05, "loss": 0.4512, "num_input_tokens_seen": 50260064, "step": 86570 }, { "epoch": 12.894697646708371, "grad_norm": 1.0823839902877808, "learning_rate": 1.6883289701754872e-05, "loss": 0.6456, "num_input_tokens_seen": 50262816, "step": 86575 }, { "epoch": 12.89544235924933, "grad_norm": 1.1250383853912354, "learning_rate": 1.6880216376844066e-05, "loss": 0.5702, "num_input_tokens_seen": 50265984, "step": 86580 }, { "epoch": 12.896187071790289, "grad_norm": 2.9279227256774902, "learning_rate": 1.6877143189108758e-05, "loss": 0.5502, "num_input_tokens_seen": 50268832, "step": 86585 }, { "epoch": 12.896931784331247, "grad_norm": 1.4626893997192383, "learning_rate": 1.6874070138600855e-05, "loss": 0.5872, "num_input_tokens_seen": 50271648, "step": 86590 }, { "epoch": 12.897676496872208, "grad_norm": 0.9266319274902344, "learning_rate": 1.6870997225372286e-05, "loss": 0.4776, "num_input_tokens_seen": 50274560, "step": 86595 }, { "epoch": 12.898421209413167, "grad_norm": 1.3489301204681396, "learning_rate": 1.686792444947494e-05, "loss": 0.529, "num_input_tokens_seen": 50277408, "step": 86600 }, { "epoch": 12.899165921954125, "grad_norm": 3.1394786834716797, "learning_rate": 1.6864851810960763e-05, "loss": 0.5592, "num_input_tokens_seen": 50280320, "step": 86605 }, { "epoch": 12.899910634495084, "grad_norm": 1.4285292625427246, "learning_rate": 1.6861779309881648e-05, "loss": 0.4432, "num_input_tokens_seen": 50282816, "step": 86610 }, { "epoch": 12.900655347036045, "grad_norm": 1.1854337453842163, "learning_rate": 1.6858706946289486e-05, "loss": 0.5975, "num_input_tokens_seen": 50285504, "step": 86615 }, { "epoch": 12.901400059577004, "grad_norm": 1.776840329170227, "learning_rate": 1.6855634720236206e-05, "loss": 0.624, "num_input_tokens_seen": 50288096, "step": 86620 }, { "epoch": 12.902144772117962, "grad_norm": 2.5665581226348877, "learning_rate": 1.6852562631773694e-05, "loss": 0.6233, "num_input_tokens_seen": 50290912, "step": 86625 }, { "epoch": 12.902889484658921, "grad_norm": 1.3211814165115356, "learning_rate": 1.684949068095386e-05, "loss": 0.5214, "num_input_tokens_seen": 50293696, "step": 86630 }, { "epoch": 12.903634197199882, "grad_norm": 2.14912486076355, "learning_rate": 1.684641886782859e-05, "loss": 0.5994, "num_input_tokens_seen": 50296576, "step": 86635 }, { "epoch": 12.90437890974084, "grad_norm": 1.9786465167999268, "learning_rate": 1.6843347192449793e-05, "loss": 0.6419, "num_input_tokens_seen": 50299424, "step": 86640 }, { "epoch": 12.905123622281799, "grad_norm": 2.1260013580322266, "learning_rate": 1.6840275654869358e-05, "loss": 0.5644, "num_input_tokens_seen": 50302208, "step": 86645 }, { "epoch": 12.905868334822758, "grad_norm": 1.6814454793930054, "learning_rate": 1.6837204255139164e-05, "loss": 0.6972, "num_input_tokens_seen": 50305248, "step": 86650 }, { "epoch": 12.906613047363718, "grad_norm": 2.916945695877075, "learning_rate": 1.6834132993311115e-05, "loss": 0.7848, "num_input_tokens_seen": 50308096, "step": 86655 }, { "epoch": 12.907357759904677, "grad_norm": 1.5774152278900146, "learning_rate": 1.6831061869437086e-05, "loss": 0.5326, "num_input_tokens_seen": 50310816, "step": 86660 }, { "epoch": 12.908102472445636, "grad_norm": 1.2503607273101807, "learning_rate": 1.6827990883568966e-05, "loss": 0.4906, "num_input_tokens_seen": 50313568, "step": 86665 }, { "epoch": 12.908847184986595, "grad_norm": 1.4104586839675903, "learning_rate": 1.6824920035758628e-05, "loss": 0.4878, "num_input_tokens_seen": 50316736, "step": 86670 }, { "epoch": 12.909591897527555, "grad_norm": 2.393815755844116, "learning_rate": 1.6821849326057963e-05, "loss": 0.6059, "num_input_tokens_seen": 50319744, "step": 86675 }, { "epoch": 12.910336610068514, "grad_norm": 1.2046446800231934, "learning_rate": 1.681877875451884e-05, "loss": 0.6819, "num_input_tokens_seen": 50322368, "step": 86680 }, { "epoch": 12.911081322609473, "grad_norm": 1.8567246198654175, "learning_rate": 1.681570832119314e-05, "loss": 0.572, "num_input_tokens_seen": 50325280, "step": 86685 }, { "epoch": 12.911826035150431, "grad_norm": 1.9502546787261963, "learning_rate": 1.6812638026132728e-05, "loss": 0.6633, "num_input_tokens_seen": 50328224, "step": 86690 }, { "epoch": 12.91257074769139, "grad_norm": 1.660944938659668, "learning_rate": 1.680956786938947e-05, "loss": 0.5362, "num_input_tokens_seen": 50331232, "step": 86695 }, { "epoch": 12.91331546023235, "grad_norm": 1.053709626197815, "learning_rate": 1.6806497851015246e-05, "loss": 0.5932, "num_input_tokens_seen": 50334144, "step": 86700 }, { "epoch": 12.91406017277331, "grad_norm": 1.6985036134719849, "learning_rate": 1.68034279710619e-05, "loss": 0.7244, "num_input_tokens_seen": 50336896, "step": 86705 }, { "epoch": 12.914804885314268, "grad_norm": 1.8676481246948242, "learning_rate": 1.6800358229581326e-05, "loss": 0.8608, "num_input_tokens_seen": 50339968, "step": 86710 }, { "epoch": 12.915549597855229, "grad_norm": 1.320124864578247, "learning_rate": 1.6797288626625345e-05, "loss": 0.6286, "num_input_tokens_seen": 50342912, "step": 86715 }, { "epoch": 12.916294310396188, "grad_norm": 1.5929840803146362, "learning_rate": 1.6794219162245855e-05, "loss": 0.5254, "num_input_tokens_seen": 50345728, "step": 86720 }, { "epoch": 12.917039022937146, "grad_norm": 1.9174091815948486, "learning_rate": 1.679114983649469e-05, "loss": 0.787, "num_input_tokens_seen": 50348928, "step": 86725 }, { "epoch": 12.917783735478105, "grad_norm": 1.262915849685669, "learning_rate": 1.6788080649423696e-05, "loss": 0.503, "num_input_tokens_seen": 50351744, "step": 86730 }, { "epoch": 12.918528448019064, "grad_norm": 1.5885812044143677, "learning_rate": 1.678501160108474e-05, "loss": 0.6482, "num_input_tokens_seen": 50354528, "step": 86735 }, { "epoch": 12.919273160560024, "grad_norm": 1.2362830638885498, "learning_rate": 1.6781942691529656e-05, "loss": 0.5905, "num_input_tokens_seen": 50357472, "step": 86740 }, { "epoch": 12.920017873100983, "grad_norm": 1.3472115993499756, "learning_rate": 1.6778873920810305e-05, "loss": 0.5826, "num_input_tokens_seen": 50360512, "step": 86745 }, { "epoch": 12.920762585641942, "grad_norm": 1.1909775733947754, "learning_rate": 1.6775805288978517e-05, "loss": 0.528, "num_input_tokens_seen": 50363008, "step": 86750 }, { "epoch": 12.9215072981829, "grad_norm": 2.3332061767578125, "learning_rate": 1.6772736796086146e-05, "loss": 0.4664, "num_input_tokens_seen": 50365728, "step": 86755 }, { "epoch": 12.922252010723861, "grad_norm": 1.363315224647522, "learning_rate": 1.676966844218502e-05, "loss": 0.6468, "num_input_tokens_seen": 50368640, "step": 86760 }, { "epoch": 12.92299672326482, "grad_norm": 1.186761736869812, "learning_rate": 1.676660022732699e-05, "loss": 0.3985, "num_input_tokens_seen": 50371904, "step": 86765 }, { "epoch": 12.923741435805779, "grad_norm": 1.5809729099273682, "learning_rate": 1.6763532151563878e-05, "loss": 0.7379, "num_input_tokens_seen": 50374784, "step": 86770 }, { "epoch": 12.924486148346737, "grad_norm": 1.8333288431167603, "learning_rate": 1.676046421494751e-05, "loss": 0.7527, "num_input_tokens_seen": 50377600, "step": 86775 }, { "epoch": 12.925230860887698, "grad_norm": 1.2700035572052002, "learning_rate": 1.6757396417529735e-05, "loss": 0.6209, "num_input_tokens_seen": 50380736, "step": 86780 }, { "epoch": 12.925975573428657, "grad_norm": 2.8696610927581787, "learning_rate": 1.675432875936236e-05, "loss": 0.4792, "num_input_tokens_seen": 50383968, "step": 86785 }, { "epoch": 12.926720285969616, "grad_norm": 1.3967138528823853, "learning_rate": 1.6751261240497228e-05, "loss": 0.5319, "num_input_tokens_seen": 50386720, "step": 86790 }, { "epoch": 12.927464998510574, "grad_norm": 1.1391205787658691, "learning_rate": 1.6748193860986152e-05, "loss": 0.7793, "num_input_tokens_seen": 50389760, "step": 86795 }, { "epoch": 12.928209711051535, "grad_norm": 0.9682108163833618, "learning_rate": 1.674512662088096e-05, "loss": 0.5557, "num_input_tokens_seen": 50392416, "step": 86800 }, { "epoch": 12.928954423592494, "grad_norm": 2.023341178894043, "learning_rate": 1.674205952023346e-05, "loss": 0.609, "num_input_tokens_seen": 50395104, "step": 86805 }, { "epoch": 12.929699136133452, "grad_norm": 1.9798686504364014, "learning_rate": 1.6738992559095462e-05, "loss": 0.7411, "num_input_tokens_seen": 50397888, "step": 86810 }, { "epoch": 12.930443848674411, "grad_norm": 1.3561692237854004, "learning_rate": 1.67359257375188e-05, "loss": 0.6656, "num_input_tokens_seen": 50400608, "step": 86815 }, { "epoch": 12.931188561215372, "grad_norm": 1.0430489778518677, "learning_rate": 1.673285905555526e-05, "loss": 0.6033, "num_input_tokens_seen": 50403680, "step": 86820 }, { "epoch": 12.93193327375633, "grad_norm": 2.5700109004974365, "learning_rate": 1.6729792513256682e-05, "loss": 0.5026, "num_input_tokens_seen": 50406400, "step": 86825 }, { "epoch": 12.93267798629729, "grad_norm": 1.1331815719604492, "learning_rate": 1.672672611067484e-05, "loss": 0.676, "num_input_tokens_seen": 50409472, "step": 86830 }, { "epoch": 12.933422698838248, "grad_norm": 1.7465554475784302, "learning_rate": 1.672365984786156e-05, "loss": 0.5349, "num_input_tokens_seen": 50412320, "step": 86835 }, { "epoch": 12.934167411379208, "grad_norm": 4.584172248840332, "learning_rate": 1.6720593724868626e-05, "loss": 0.6222, "num_input_tokens_seen": 50415072, "step": 86840 }, { "epoch": 12.934912123920167, "grad_norm": 1.8376744985580444, "learning_rate": 1.6717527741747857e-05, "loss": 0.7314, "num_input_tokens_seen": 50417888, "step": 86845 }, { "epoch": 12.935656836461126, "grad_norm": 2.1108953952789307, "learning_rate": 1.6714461898551037e-05, "loss": 0.6219, "num_input_tokens_seen": 50420832, "step": 86850 }, { "epoch": 12.936401549002085, "grad_norm": 1.3233096599578857, "learning_rate": 1.6711396195329955e-05, "loss": 0.5519, "num_input_tokens_seen": 50424064, "step": 86855 }, { "epoch": 12.937146261543045, "grad_norm": 1.3877521753311157, "learning_rate": 1.670833063213642e-05, "loss": 0.4963, "num_input_tokens_seen": 50426944, "step": 86860 }, { "epoch": 12.937890974084004, "grad_norm": 1.7072983980178833, "learning_rate": 1.6705265209022204e-05, "loss": 0.5182, "num_input_tokens_seen": 50429696, "step": 86865 }, { "epoch": 12.938635686624963, "grad_norm": 2.4944136142730713, "learning_rate": 1.6702199926039107e-05, "loss": 0.6328, "num_input_tokens_seen": 50432320, "step": 86870 }, { "epoch": 12.939380399165922, "grad_norm": 1.1228984594345093, "learning_rate": 1.66991347832389e-05, "loss": 0.6977, "num_input_tokens_seen": 50435072, "step": 86875 }, { "epoch": 12.94012511170688, "grad_norm": 1.4020283222198486, "learning_rate": 1.669606978067338e-05, "loss": 0.6206, "num_input_tokens_seen": 50437984, "step": 86880 }, { "epoch": 12.94086982424784, "grad_norm": 1.3603272438049316, "learning_rate": 1.669300491839433e-05, "loss": 0.4873, "num_input_tokens_seen": 50440800, "step": 86885 }, { "epoch": 12.9416145367888, "grad_norm": 2.985520601272583, "learning_rate": 1.6689940196453507e-05, "loss": 0.6256, "num_input_tokens_seen": 50443808, "step": 86890 }, { "epoch": 12.942359249329758, "grad_norm": 1.7253220081329346, "learning_rate": 1.66868756149027e-05, "loss": 0.4886, "num_input_tokens_seen": 50447264, "step": 86895 }, { "epoch": 12.943103961870717, "grad_norm": 2.538053035736084, "learning_rate": 1.668381117379368e-05, "loss": 0.613, "num_input_tokens_seen": 50450144, "step": 86900 }, { "epoch": 12.943848674411678, "grad_norm": 1.345786452293396, "learning_rate": 1.6680746873178225e-05, "loss": 0.5708, "num_input_tokens_seen": 50452800, "step": 86905 }, { "epoch": 12.944593386952636, "grad_norm": 2.5331475734710693, "learning_rate": 1.6677682713108082e-05, "loss": 0.7592, "num_input_tokens_seen": 50455744, "step": 86910 }, { "epoch": 12.945338099493595, "grad_norm": 1.1495195627212524, "learning_rate": 1.6674618693635047e-05, "loss": 0.511, "num_input_tokens_seen": 50458624, "step": 86915 }, { "epoch": 12.946082812034554, "grad_norm": 3.232647657394409, "learning_rate": 1.6671554814810857e-05, "loss": 0.6095, "num_input_tokens_seen": 50461536, "step": 86920 }, { "epoch": 12.946827524575514, "grad_norm": 2.132397413253784, "learning_rate": 1.6668491076687294e-05, "loss": 0.7421, "num_input_tokens_seen": 50464640, "step": 86925 }, { "epoch": 12.947572237116473, "grad_norm": 1.955967903137207, "learning_rate": 1.66654274793161e-05, "loss": 0.5364, "num_input_tokens_seen": 50467552, "step": 86930 }, { "epoch": 12.948316949657432, "grad_norm": 1.5678128004074097, "learning_rate": 1.6662364022749035e-05, "loss": 0.7317, "num_input_tokens_seen": 50470656, "step": 86935 }, { "epoch": 12.94906166219839, "grad_norm": 0.9452396631240845, "learning_rate": 1.6659300707037864e-05, "loss": 0.6533, "num_input_tokens_seen": 50473536, "step": 86940 }, { "epoch": 12.949806374739351, "grad_norm": 0.9293254017829895, "learning_rate": 1.665623753223432e-05, "loss": 0.6928, "num_input_tokens_seen": 50476480, "step": 86945 }, { "epoch": 12.95055108728031, "grad_norm": 1.5179193019866943, "learning_rate": 1.6653174498390172e-05, "loss": 0.4843, "num_input_tokens_seen": 50479392, "step": 86950 }, { "epoch": 12.951295799821269, "grad_norm": 2.628257989883423, "learning_rate": 1.665011160555715e-05, "loss": 0.7989, "num_input_tokens_seen": 50482432, "step": 86955 }, { "epoch": 12.952040512362228, "grad_norm": 1.4745484590530396, "learning_rate": 1.6647048853787018e-05, "loss": 0.6385, "num_input_tokens_seen": 50485184, "step": 86960 }, { "epoch": 12.952785224903188, "grad_norm": 2.036940813064575, "learning_rate": 1.6643986243131497e-05, "loss": 0.6751, "num_input_tokens_seen": 50488256, "step": 86965 }, { "epoch": 12.953529937444147, "grad_norm": 1.642959713935852, "learning_rate": 1.664092377364233e-05, "loss": 0.4302, "num_input_tokens_seen": 50491136, "step": 86970 }, { "epoch": 12.954274649985106, "grad_norm": 1.8509676456451416, "learning_rate": 1.663786144537127e-05, "loss": 0.6901, "num_input_tokens_seen": 50494048, "step": 86975 }, { "epoch": 12.955019362526064, "grad_norm": 1.4496506452560425, "learning_rate": 1.6634799258370036e-05, "loss": 0.6281, "num_input_tokens_seen": 50496864, "step": 86980 }, { "epoch": 12.955764075067025, "grad_norm": 0.8643988966941833, "learning_rate": 1.6631737212690373e-05, "loss": 0.6033, "num_input_tokens_seen": 50500096, "step": 86985 }, { "epoch": 12.956508787607984, "grad_norm": 1.1538053750991821, "learning_rate": 1.6628675308384e-05, "loss": 0.5855, "num_input_tokens_seen": 50503136, "step": 86990 }, { "epoch": 12.957253500148942, "grad_norm": 3.134948492050171, "learning_rate": 1.6625613545502653e-05, "loss": 0.6216, "num_input_tokens_seen": 50506080, "step": 86995 }, { "epoch": 12.957998212689901, "grad_norm": 1.2787971496582031, "learning_rate": 1.6622551924098046e-05, "loss": 0.577, "num_input_tokens_seen": 50508800, "step": 87000 }, { "epoch": 12.958742925230862, "grad_norm": 1.2185550928115845, "learning_rate": 1.6619490444221918e-05, "loss": 0.6529, "num_input_tokens_seen": 50511616, "step": 87005 }, { "epoch": 12.95948763777182, "grad_norm": 1.044909119606018, "learning_rate": 1.6616429105925978e-05, "loss": 0.5686, "num_input_tokens_seen": 50514656, "step": 87010 }, { "epoch": 12.96023235031278, "grad_norm": 2.569334030151367, "learning_rate": 1.6613367909261946e-05, "loss": 0.6911, "num_input_tokens_seen": 50517568, "step": 87015 }, { "epoch": 12.960977062853738, "grad_norm": 1.457614779472351, "learning_rate": 1.6610306854281542e-05, "loss": 0.3843, "num_input_tokens_seen": 50520352, "step": 87020 }, { "epoch": 12.961721775394697, "grad_norm": 1.5425727367401123, "learning_rate": 1.6607245941036476e-05, "loss": 0.5244, "num_input_tokens_seen": 50523296, "step": 87025 }, { "epoch": 12.962466487935657, "grad_norm": 1.362721562385559, "learning_rate": 1.660418516957846e-05, "loss": 0.3612, "num_input_tokens_seen": 50526144, "step": 87030 }, { "epoch": 12.963211200476616, "grad_norm": 2.4780325889587402, "learning_rate": 1.66011245399592e-05, "loss": 0.5538, "num_input_tokens_seen": 50529120, "step": 87035 }, { "epoch": 12.963955913017575, "grad_norm": 1.621732234954834, "learning_rate": 1.6598064052230407e-05, "loss": 0.6936, "num_input_tokens_seen": 50532064, "step": 87040 }, { "epoch": 12.964700625558535, "grad_norm": 1.1364349126815796, "learning_rate": 1.659500370644378e-05, "loss": 0.6433, "num_input_tokens_seen": 50534816, "step": 87045 }, { "epoch": 12.965445338099494, "grad_norm": 1.843269944190979, "learning_rate": 1.6591943502651025e-05, "loss": 0.8361, "num_input_tokens_seen": 50537568, "step": 87050 }, { "epoch": 12.966190050640453, "grad_norm": 1.9286799430847168, "learning_rate": 1.6588883440903847e-05, "loss": 0.6647, "num_input_tokens_seen": 50540512, "step": 87055 }, { "epoch": 12.966934763181412, "grad_norm": 0.8807395100593567, "learning_rate": 1.6585823521253924e-05, "loss": 0.4348, "num_input_tokens_seen": 50543360, "step": 87060 }, { "epoch": 12.96767947572237, "grad_norm": 1.1030415296554565, "learning_rate": 1.6582763743752965e-05, "loss": 0.5663, "num_input_tokens_seen": 50546368, "step": 87065 }, { "epoch": 12.96842418826333, "grad_norm": 1.101765513420105, "learning_rate": 1.6579704108452653e-05, "loss": 0.461, "num_input_tokens_seen": 50549344, "step": 87070 }, { "epoch": 12.96916890080429, "grad_norm": 0.7468312978744507, "learning_rate": 1.6576644615404686e-05, "loss": 0.5783, "num_input_tokens_seen": 50552192, "step": 87075 }, { "epoch": 12.969913613345248, "grad_norm": 1.3685388565063477, "learning_rate": 1.657358526466074e-05, "loss": 0.5863, "num_input_tokens_seen": 50554848, "step": 87080 }, { "epoch": 12.970658325886207, "grad_norm": 1.2222201824188232, "learning_rate": 1.6570526056272516e-05, "loss": 0.6596, "num_input_tokens_seen": 50558080, "step": 87085 }, { "epoch": 12.971403038427168, "grad_norm": 1.297724962234497, "learning_rate": 1.656746699029169e-05, "loss": 0.5834, "num_input_tokens_seen": 50561280, "step": 87090 }, { "epoch": 12.972147750968126, "grad_norm": 1.5448263883590698, "learning_rate": 1.6564408066769932e-05, "loss": 0.6013, "num_input_tokens_seen": 50564064, "step": 87095 }, { "epoch": 12.972892463509085, "grad_norm": 1.696690320968628, "learning_rate": 1.6561349285758932e-05, "loss": 0.4874, "num_input_tokens_seen": 50566912, "step": 87100 }, { "epoch": 12.973637176050044, "grad_norm": 1.5574085712432861, "learning_rate": 1.6558290647310347e-05, "loss": 0.5693, "num_input_tokens_seen": 50570048, "step": 87105 }, { "epoch": 12.974381888591004, "grad_norm": 1.8653539419174194, "learning_rate": 1.6555232151475872e-05, "loss": 0.5819, "num_input_tokens_seen": 50572832, "step": 87110 }, { "epoch": 12.975126601131963, "grad_norm": 1.0692498683929443, "learning_rate": 1.6552173798307157e-05, "loss": 0.4397, "num_input_tokens_seen": 50575616, "step": 87115 }, { "epoch": 12.975871313672922, "grad_norm": 2.1394948959350586, "learning_rate": 1.654911558785589e-05, "loss": 0.5778, "num_input_tokens_seen": 50578560, "step": 87120 }, { "epoch": 12.97661602621388, "grad_norm": 1.1061419248580933, "learning_rate": 1.654605752017372e-05, "loss": 0.6111, "num_input_tokens_seen": 50581216, "step": 87125 }, { "epoch": 12.977360738754841, "grad_norm": 0.714816153049469, "learning_rate": 1.6542999595312324e-05, "loss": 0.2985, "num_input_tokens_seen": 50583968, "step": 87130 }, { "epoch": 12.9781054512958, "grad_norm": 1.3448479175567627, "learning_rate": 1.6539941813323353e-05, "loss": 0.6018, "num_input_tokens_seen": 50586752, "step": 87135 }, { "epoch": 12.978850163836759, "grad_norm": 1.1893715858459473, "learning_rate": 1.653688417425846e-05, "loss": 0.6197, "num_input_tokens_seen": 50589696, "step": 87140 }, { "epoch": 12.979594876377718, "grad_norm": 2.5529608726501465, "learning_rate": 1.653382667816931e-05, "loss": 0.5383, "num_input_tokens_seen": 50592544, "step": 87145 }, { "epoch": 12.980339588918678, "grad_norm": 1.5092220306396484, "learning_rate": 1.653076932510755e-05, "loss": 0.6369, "num_input_tokens_seen": 50595680, "step": 87150 }, { "epoch": 12.981084301459637, "grad_norm": 1.6977612972259521, "learning_rate": 1.652771211512484e-05, "loss": 0.5345, "num_input_tokens_seen": 50598688, "step": 87155 }, { "epoch": 12.981829014000596, "grad_norm": 1.2071824073791504, "learning_rate": 1.652465504827282e-05, "loss": 0.6722, "num_input_tokens_seen": 50601504, "step": 87160 }, { "epoch": 12.982573726541554, "grad_norm": 1.9399347305297852, "learning_rate": 1.6521598124603143e-05, "loss": 0.4379, "num_input_tokens_seen": 50604640, "step": 87165 }, { "epoch": 12.983318439082515, "grad_norm": 1.6152757406234741, "learning_rate": 1.651854134416745e-05, "loss": 0.5608, "num_input_tokens_seen": 50607680, "step": 87170 }, { "epoch": 12.984063151623474, "grad_norm": 0.7786646485328674, "learning_rate": 1.651548470701737e-05, "loss": 0.6587, "num_input_tokens_seen": 50611488, "step": 87175 }, { "epoch": 12.984807864164432, "grad_norm": 1.3907740116119385, "learning_rate": 1.6512428213204564e-05, "loss": 0.5908, "num_input_tokens_seen": 50614496, "step": 87180 }, { "epoch": 12.985552576705391, "grad_norm": 1.623976707458496, "learning_rate": 1.6509371862780644e-05, "loss": 0.5554, "num_input_tokens_seen": 50617312, "step": 87185 }, { "epoch": 12.986297289246352, "grad_norm": 1.9921674728393555, "learning_rate": 1.650631565579727e-05, "loss": 0.6036, "num_input_tokens_seen": 50620256, "step": 87190 }, { "epoch": 12.98704200178731, "grad_norm": 2.9035634994506836, "learning_rate": 1.6503259592306053e-05, "loss": 0.6743, "num_input_tokens_seen": 50623072, "step": 87195 }, { "epoch": 12.98778671432827, "grad_norm": 1.5619291067123413, "learning_rate": 1.650020367235864e-05, "loss": 0.4978, "num_input_tokens_seen": 50625568, "step": 87200 }, { "epoch": 12.988531426869228, "grad_norm": 2.5674033164978027, "learning_rate": 1.649714789600663e-05, "loss": 0.6379, "num_input_tokens_seen": 50628800, "step": 87205 }, { "epoch": 12.989276139410187, "grad_norm": 2.40973162651062, "learning_rate": 1.649409226330168e-05, "loss": 0.6113, "num_input_tokens_seen": 50631488, "step": 87210 }, { "epoch": 12.990020851951147, "grad_norm": 1.3999212980270386, "learning_rate": 1.6491036774295393e-05, "loss": 0.5235, "num_input_tokens_seen": 50634176, "step": 87215 }, { "epoch": 12.990765564492106, "grad_norm": 1.511436104774475, "learning_rate": 1.6487981429039383e-05, "loss": 0.6914, "num_input_tokens_seen": 50637088, "step": 87220 }, { "epoch": 12.991510277033065, "grad_norm": 1.7129251956939697, "learning_rate": 1.648492622758528e-05, "loss": 0.6445, "num_input_tokens_seen": 50639968, "step": 87225 }, { "epoch": 12.992254989574025, "grad_norm": 1.3551478385925293, "learning_rate": 1.6481871169984696e-05, "loss": 0.6002, "num_input_tokens_seen": 50642976, "step": 87230 }, { "epoch": 12.992999702114984, "grad_norm": 1.9391331672668457, "learning_rate": 1.647881625628924e-05, "loss": 0.5742, "num_input_tokens_seen": 50646144, "step": 87235 }, { "epoch": 12.993744414655943, "grad_norm": 1.7749933004379272, "learning_rate": 1.6475761486550516e-05, "loss": 0.6709, "num_input_tokens_seen": 50649280, "step": 87240 }, { "epoch": 12.994489127196902, "grad_norm": 1.1046286821365356, "learning_rate": 1.6472706860820152e-05, "loss": 0.5181, "num_input_tokens_seen": 50651808, "step": 87245 }, { "epoch": 12.99523383973786, "grad_norm": 1.0667500495910645, "learning_rate": 1.6469652379149736e-05, "loss": 0.4694, "num_input_tokens_seen": 50654912, "step": 87250 }, { "epoch": 12.995978552278821, "grad_norm": 1.3727575540542603, "learning_rate": 1.6466598041590866e-05, "loss": 0.5317, "num_input_tokens_seen": 50657600, "step": 87255 }, { "epoch": 12.99672326481978, "grad_norm": 1.4080357551574707, "learning_rate": 1.646354384819515e-05, "loss": 0.5674, "num_input_tokens_seen": 50660512, "step": 87260 }, { "epoch": 12.997467977360738, "grad_norm": 0.8818610906600952, "learning_rate": 1.6460489799014188e-05, "loss": 0.5822, "num_input_tokens_seen": 50663392, "step": 87265 }, { "epoch": 12.998212689901697, "grad_norm": 1.7672888040542603, "learning_rate": 1.6457435894099575e-05, "loss": 0.4717, "num_input_tokens_seen": 50666592, "step": 87270 }, { "epoch": 12.998957402442658, "grad_norm": 2.2022817134857178, "learning_rate": 1.645438213350289e-05, "loss": 0.5977, "num_input_tokens_seen": 50669664, "step": 87275 }, { "epoch": 12.999702114983616, "grad_norm": 1.6124354600906372, "learning_rate": 1.645132851727574e-05, "loss": 0.4153, "num_input_tokens_seen": 50672352, "step": 87280 }, { "epoch": 13.0, "eval_loss": 0.6574311256408691, "eval_runtime": 46.9869, "eval_samples_per_second": 63.507, "eval_steps_per_second": 15.877, "num_input_tokens_seen": 50673016, "step": 87282 }, { "epoch": 13.000446827524575, "grad_norm": 2.357461929321289, "learning_rate": 1.6448275045469702e-05, "loss": 0.6136, "num_input_tokens_seen": 50675000, "step": 87285 }, { "epoch": 13.001191540065534, "grad_norm": 1.6185221672058105, "learning_rate": 1.6445221718136376e-05, "loss": 0.4303, "num_input_tokens_seen": 50678072, "step": 87290 }, { "epoch": 13.001936252606495, "grad_norm": 0.8354911804199219, "learning_rate": 1.644216853532733e-05, "loss": 0.5798, "num_input_tokens_seen": 50680952, "step": 87295 }, { "epoch": 13.002680965147453, "grad_norm": 4.103393077850342, "learning_rate": 1.6439115497094137e-05, "loss": 0.8378, "num_input_tokens_seen": 50683928, "step": 87300 }, { "epoch": 13.003425677688412, "grad_norm": 1.8314956426620483, "learning_rate": 1.64360626034884e-05, "loss": 0.5948, "num_input_tokens_seen": 50687096, "step": 87305 }, { "epoch": 13.00417039022937, "grad_norm": 1.7321661710739136, "learning_rate": 1.6433009854561672e-05, "loss": 0.5996, "num_input_tokens_seen": 50689848, "step": 87310 }, { "epoch": 13.004915102770331, "grad_norm": 3.1754472255706787, "learning_rate": 1.6429957250365547e-05, "loss": 0.6071, "num_input_tokens_seen": 50692632, "step": 87315 }, { "epoch": 13.00565981531129, "grad_norm": 1.9921213388442993, "learning_rate": 1.6426904790951575e-05, "loss": 0.6954, "num_input_tokens_seen": 50695576, "step": 87320 }, { "epoch": 13.006404527852249, "grad_norm": 1.6451514959335327, "learning_rate": 1.642385247637134e-05, "loss": 0.6127, "num_input_tokens_seen": 50698872, "step": 87325 }, { "epoch": 13.007149240393208, "grad_norm": 0.8850265145301819, "learning_rate": 1.6420800306676397e-05, "loss": 0.6074, "num_input_tokens_seen": 50701752, "step": 87330 }, { "epoch": 13.007893952934168, "grad_norm": 1.6068615913391113, "learning_rate": 1.641774828191831e-05, "loss": 0.6621, "num_input_tokens_seen": 50704632, "step": 87335 }, { "epoch": 13.008638665475127, "grad_norm": 1.6938835382461548, "learning_rate": 1.641469640214865e-05, "loss": 0.6421, "num_input_tokens_seen": 50707448, "step": 87340 }, { "epoch": 13.009383378016086, "grad_norm": 1.843339443206787, "learning_rate": 1.6411644667418958e-05, "loss": 0.9012, "num_input_tokens_seen": 50710424, "step": 87345 }, { "epoch": 13.010128090557044, "grad_norm": 1.167895793914795, "learning_rate": 1.6408593077780808e-05, "loss": 0.6228, "num_input_tokens_seen": 50713400, "step": 87350 }, { "epoch": 13.010872803098005, "grad_norm": 0.9877637624740601, "learning_rate": 1.6405541633285748e-05, "loss": 0.6241, "num_input_tokens_seen": 50716568, "step": 87355 }, { "epoch": 13.011617515638964, "grad_norm": 1.6310157775878906, "learning_rate": 1.6402490333985325e-05, "loss": 0.6746, "num_input_tokens_seen": 50719352, "step": 87360 }, { "epoch": 13.012362228179922, "grad_norm": 1.0252926349639893, "learning_rate": 1.6399439179931087e-05, "loss": 0.5209, "num_input_tokens_seen": 50722136, "step": 87365 }, { "epoch": 13.013106940720881, "grad_norm": 2.076780080795288, "learning_rate": 1.6396388171174586e-05, "loss": 0.7906, "num_input_tokens_seen": 50725176, "step": 87370 }, { "epoch": 13.013851653261842, "grad_norm": 1.0821539163589478, "learning_rate": 1.6393337307767364e-05, "loss": 0.5014, "num_input_tokens_seen": 50727992, "step": 87375 }, { "epoch": 13.0145963658028, "grad_norm": 1.7822493314743042, "learning_rate": 1.6390286589760957e-05, "loss": 0.5051, "num_input_tokens_seen": 50730648, "step": 87380 }, { "epoch": 13.01534107834376, "grad_norm": 1.7579079866409302, "learning_rate": 1.6387236017206908e-05, "loss": 0.6064, "num_input_tokens_seen": 50733592, "step": 87385 }, { "epoch": 13.016085790884718, "grad_norm": 0.9087205529212952, "learning_rate": 1.6384185590156752e-05, "loss": 0.5753, "num_input_tokens_seen": 50736536, "step": 87390 }, { "epoch": 13.016830503425677, "grad_norm": 0.9132151007652283, "learning_rate": 1.6381135308662032e-05, "loss": 0.5679, "num_input_tokens_seen": 50739512, "step": 87395 }, { "epoch": 13.017575215966637, "grad_norm": 2.4109866619110107, "learning_rate": 1.6378085172774258e-05, "loss": 0.6485, "num_input_tokens_seen": 50742168, "step": 87400 }, { "epoch": 13.018319928507596, "grad_norm": 1.7059335708618164, "learning_rate": 1.6375035182544983e-05, "loss": 0.5549, "num_input_tokens_seen": 50745048, "step": 87405 }, { "epoch": 13.019064641048555, "grad_norm": 1.9012641906738281, "learning_rate": 1.637198533802572e-05, "loss": 0.7037, "num_input_tokens_seen": 50747992, "step": 87410 }, { "epoch": 13.019809353589514, "grad_norm": 1.7340397834777832, "learning_rate": 1.636893563926799e-05, "loss": 0.5086, "num_input_tokens_seen": 50750744, "step": 87415 }, { "epoch": 13.020554066130474, "grad_norm": 1.2253764867782593, "learning_rate": 1.6365886086323327e-05, "loss": 0.5582, "num_input_tokens_seen": 50753624, "step": 87420 }, { "epoch": 13.021298778671433, "grad_norm": 1.7101794481277466, "learning_rate": 1.636283667924324e-05, "loss": 0.6462, "num_input_tokens_seen": 50756184, "step": 87425 }, { "epoch": 13.022043491212392, "grad_norm": 2.282111644744873, "learning_rate": 1.6359787418079254e-05, "loss": 0.5768, "num_input_tokens_seen": 50759288, "step": 87430 }, { "epoch": 13.02278820375335, "grad_norm": 1.9585520029067993, "learning_rate": 1.6356738302882864e-05, "loss": 0.6257, "num_input_tokens_seen": 50762456, "step": 87435 }, { "epoch": 13.023532916294311, "grad_norm": 2.803213357925415, "learning_rate": 1.6353689333705606e-05, "loss": 0.6863, "num_input_tokens_seen": 50765368, "step": 87440 }, { "epoch": 13.02427762883527, "grad_norm": 1.257453203201294, "learning_rate": 1.6350640510598974e-05, "loss": 0.5415, "num_input_tokens_seen": 50768280, "step": 87445 }, { "epoch": 13.025022341376228, "grad_norm": 0.8704382181167603, "learning_rate": 1.634759183361449e-05, "loss": 0.6387, "num_input_tokens_seen": 50771000, "step": 87450 }, { "epoch": 13.025767053917187, "grad_norm": 1.5638844966888428, "learning_rate": 1.6344543302803643e-05, "loss": 0.6455, "num_input_tokens_seen": 50773688, "step": 87455 }, { "epoch": 13.026511766458148, "grad_norm": 1.2332754135131836, "learning_rate": 1.6341494918217938e-05, "loss": 0.5312, "num_input_tokens_seen": 50776632, "step": 87460 }, { "epoch": 13.027256478999107, "grad_norm": 1.3372806310653687, "learning_rate": 1.633844667990888e-05, "loss": 0.5246, "num_input_tokens_seen": 50779384, "step": 87465 }, { "epoch": 13.028001191540065, "grad_norm": 2.5982112884521484, "learning_rate": 1.633539858792795e-05, "loss": 0.6343, "num_input_tokens_seen": 50782328, "step": 87470 }, { "epoch": 13.028745904081024, "grad_norm": 0.998587965965271, "learning_rate": 1.6332350642326673e-05, "loss": 0.5776, "num_input_tokens_seen": 50784984, "step": 87475 }, { "epoch": 13.029490616621985, "grad_norm": 1.1561979055404663, "learning_rate": 1.6329302843156503e-05, "loss": 0.505, "num_input_tokens_seen": 50787928, "step": 87480 }, { "epoch": 13.030235329162943, "grad_norm": 2.016726493835449, "learning_rate": 1.6326255190468965e-05, "loss": 0.5735, "num_input_tokens_seen": 50790680, "step": 87485 }, { "epoch": 13.030980041703902, "grad_norm": 1.379069209098816, "learning_rate": 1.632320768431553e-05, "loss": 0.4572, "num_input_tokens_seen": 50793752, "step": 87490 }, { "epoch": 13.03172475424486, "grad_norm": 4.900578498840332, "learning_rate": 1.6320160324747672e-05, "loss": 0.7737, "num_input_tokens_seen": 50796856, "step": 87495 }, { "epoch": 13.032469466785821, "grad_norm": 2.7176315784454346, "learning_rate": 1.631711311181689e-05, "loss": 0.7169, "num_input_tokens_seen": 50799512, "step": 87500 }, { "epoch": 13.03321417932678, "grad_norm": 1.3948066234588623, "learning_rate": 1.631406604557465e-05, "loss": 0.6274, "num_input_tokens_seen": 50802616, "step": 87505 }, { "epoch": 13.033958891867739, "grad_norm": 1.14376699924469, "learning_rate": 1.6311019126072447e-05, "loss": 0.5414, "num_input_tokens_seen": 50805816, "step": 87510 }, { "epoch": 13.034703604408698, "grad_norm": 1.9333181381225586, "learning_rate": 1.630797235336173e-05, "loss": 0.5386, "num_input_tokens_seen": 50808472, "step": 87515 }, { "epoch": 13.035448316949658, "grad_norm": 4.602596759796143, "learning_rate": 1.6304925727493998e-05, "loss": 0.7123, "num_input_tokens_seen": 50811448, "step": 87520 }, { "epoch": 13.036193029490617, "grad_norm": 0.7539780139923096, "learning_rate": 1.6301879248520707e-05, "loss": 0.4118, "num_input_tokens_seen": 50814520, "step": 87525 }, { "epoch": 13.036937742031576, "grad_norm": 3.9372570514678955, "learning_rate": 1.629883291649333e-05, "loss": 0.7244, "num_input_tokens_seen": 50817272, "step": 87530 }, { "epoch": 13.037682454572534, "grad_norm": 2.0309946537017822, "learning_rate": 1.6295786731463324e-05, "loss": 0.7011, "num_input_tokens_seen": 50820248, "step": 87535 }, { "epoch": 13.038427167113495, "grad_norm": 2.215242862701416, "learning_rate": 1.6292740693482144e-05, "loss": 0.7413, "num_input_tokens_seen": 50823288, "step": 87540 }, { "epoch": 13.039171879654454, "grad_norm": 3.0732507705688477, "learning_rate": 1.6289694802601273e-05, "loss": 0.7824, "num_input_tokens_seen": 50826200, "step": 87545 }, { "epoch": 13.039916592195413, "grad_norm": 1.3688462972640991, "learning_rate": 1.628664905887215e-05, "loss": 0.6383, "num_input_tokens_seen": 50829272, "step": 87550 }, { "epoch": 13.040661304736371, "grad_norm": 1.0204527378082275, "learning_rate": 1.6283603462346235e-05, "loss": 0.5263, "num_input_tokens_seen": 50831864, "step": 87555 }, { "epoch": 13.041406017277332, "grad_norm": 1.5663983821868896, "learning_rate": 1.628055801307498e-05, "loss": 0.5946, "num_input_tokens_seen": 50834904, "step": 87560 }, { "epoch": 13.04215072981829, "grad_norm": 1.9976097345352173, "learning_rate": 1.6277512711109842e-05, "loss": 0.503, "num_input_tokens_seen": 50837720, "step": 87565 }, { "epoch": 13.04289544235925, "grad_norm": 1.2366828918457031, "learning_rate": 1.627446755650226e-05, "loss": 0.5715, "num_input_tokens_seen": 50840568, "step": 87570 }, { "epoch": 13.043640154900208, "grad_norm": 1.7694472074508667, "learning_rate": 1.627142254930367e-05, "loss": 0.5883, "num_input_tokens_seen": 50843640, "step": 87575 }, { "epoch": 13.044384867441167, "grad_norm": 1.0689892768859863, "learning_rate": 1.6268377689565533e-05, "loss": 0.5984, "num_input_tokens_seen": 50846712, "step": 87580 }, { "epoch": 13.045129579982127, "grad_norm": 0.9252195358276367, "learning_rate": 1.6265332977339282e-05, "loss": 0.4915, "num_input_tokens_seen": 50849752, "step": 87585 }, { "epoch": 13.045874292523086, "grad_norm": 1.5064417123794556, "learning_rate": 1.6262288412676345e-05, "loss": 0.6818, "num_input_tokens_seen": 50852664, "step": 87590 }, { "epoch": 13.046619005064045, "grad_norm": 0.8561689853668213, "learning_rate": 1.625924399562817e-05, "loss": 0.4574, "num_input_tokens_seen": 50855352, "step": 87595 }, { "epoch": 13.047363717605004, "grad_norm": 3.0729758739471436, "learning_rate": 1.625619972624619e-05, "loss": 0.629, "num_input_tokens_seen": 50858168, "step": 87600 }, { "epoch": 13.048108430145964, "grad_norm": 2.057619333267212, "learning_rate": 1.6253155604581817e-05, "loss": 0.7388, "num_input_tokens_seen": 50861304, "step": 87605 }, { "epoch": 13.048853142686923, "grad_norm": 1.4916497468948364, "learning_rate": 1.6250111630686498e-05, "loss": 0.5061, "num_input_tokens_seen": 50864152, "step": 87610 }, { "epoch": 13.049597855227882, "grad_norm": 1.2481938600540161, "learning_rate": 1.6247067804611652e-05, "loss": 0.542, "num_input_tokens_seen": 50867224, "step": 87615 }, { "epoch": 13.05034256776884, "grad_norm": 2.330216407775879, "learning_rate": 1.6244024126408695e-05, "loss": 0.5925, "num_input_tokens_seen": 50870200, "step": 87620 }, { "epoch": 13.051087280309801, "grad_norm": 0.5871431231498718, "learning_rate": 1.6240980596129053e-05, "loss": 0.6596, "num_input_tokens_seen": 50873112, "step": 87625 }, { "epoch": 13.05183199285076, "grad_norm": 0.8932385444641113, "learning_rate": 1.6237937213824134e-05, "loss": 0.507, "num_input_tokens_seen": 50876088, "step": 87630 }, { "epoch": 13.052576705391719, "grad_norm": 1.5513510704040527, "learning_rate": 1.623489397954537e-05, "loss": 0.6673, "num_input_tokens_seen": 50879160, "step": 87635 }, { "epoch": 13.053321417932677, "grad_norm": 1.2741212844848633, "learning_rate": 1.623185089334415e-05, "loss": 0.6285, "num_input_tokens_seen": 50882232, "step": 87640 }, { "epoch": 13.054066130473638, "grad_norm": 0.7537400126457214, "learning_rate": 1.6228807955271915e-05, "loss": 0.3915, "num_input_tokens_seen": 50885848, "step": 87645 }, { "epoch": 13.054810843014597, "grad_norm": 1.7223927974700928, "learning_rate": 1.6225765165380046e-05, "loss": 0.5543, "num_input_tokens_seen": 50888856, "step": 87650 }, { "epoch": 13.055555555555555, "grad_norm": 0.8292120695114136, "learning_rate": 1.6222722523719963e-05, "loss": 0.4532, "num_input_tokens_seen": 50891640, "step": 87655 }, { "epoch": 13.056300268096514, "grad_norm": 1.6357489824295044, "learning_rate": 1.6219680030343063e-05, "loss": 0.551, "num_input_tokens_seen": 50894296, "step": 87660 }, { "epoch": 13.057044980637475, "grad_norm": 1.5455511808395386, "learning_rate": 1.6216637685300735e-05, "loss": 0.5804, "num_input_tokens_seen": 50897208, "step": 87665 }, { "epoch": 13.057789693178433, "grad_norm": 1.0645835399627686, "learning_rate": 1.6213595488644393e-05, "loss": 0.4778, "num_input_tokens_seen": 50900088, "step": 87670 }, { "epoch": 13.058534405719392, "grad_norm": 1.1843441724777222, "learning_rate": 1.6210553440425415e-05, "loss": 0.5487, "num_input_tokens_seen": 50903096, "step": 87675 }, { "epoch": 13.059279118260351, "grad_norm": 1.7848342657089233, "learning_rate": 1.6207511540695215e-05, "loss": 0.5727, "num_input_tokens_seen": 50906072, "step": 87680 }, { "epoch": 13.060023830801311, "grad_norm": 2.1007418632507324, "learning_rate": 1.6204469789505165e-05, "loss": 0.8364, "num_input_tokens_seen": 50908888, "step": 87685 }, { "epoch": 13.06076854334227, "grad_norm": 2.9291117191314697, "learning_rate": 1.620142818690667e-05, "loss": 0.4942, "num_input_tokens_seen": 50911640, "step": 87690 }, { "epoch": 13.061513255883229, "grad_norm": 1.6785615682601929, "learning_rate": 1.61983867329511e-05, "loss": 0.6197, "num_input_tokens_seen": 50914552, "step": 87695 }, { "epoch": 13.062257968424188, "grad_norm": 1.3337165117263794, "learning_rate": 1.6195345427689826e-05, "loss": 0.803, "num_input_tokens_seen": 50917400, "step": 87700 }, { "epoch": 13.063002680965148, "grad_norm": 1.4065940380096436, "learning_rate": 1.6192304271174256e-05, "loss": 0.5568, "num_input_tokens_seen": 50920216, "step": 87705 }, { "epoch": 13.063747393506107, "grad_norm": 2.706937789916992, "learning_rate": 1.618926326345574e-05, "loss": 0.5737, "num_input_tokens_seen": 50923256, "step": 87710 }, { "epoch": 13.064492106047066, "grad_norm": 1.4172037839889526, "learning_rate": 1.618622240458568e-05, "loss": 0.5135, "num_input_tokens_seen": 50926040, "step": 87715 }, { "epoch": 13.065236818588025, "grad_norm": 1.3439359664916992, "learning_rate": 1.618318169461543e-05, "loss": 0.4318, "num_input_tokens_seen": 50928952, "step": 87720 }, { "epoch": 13.065981531128985, "grad_norm": 1.450623631477356, "learning_rate": 1.6180141133596367e-05, "loss": 0.82, "num_input_tokens_seen": 50931960, "step": 87725 }, { "epoch": 13.066726243669944, "grad_norm": 1.8897947072982788, "learning_rate": 1.6177100721579847e-05, "loss": 0.6305, "num_input_tokens_seen": 50934776, "step": 87730 }, { "epoch": 13.067470956210903, "grad_norm": 1.4669722318649292, "learning_rate": 1.617406045861725e-05, "loss": 0.6101, "num_input_tokens_seen": 50937592, "step": 87735 }, { "epoch": 13.068215668751861, "grad_norm": 1.6507453918457031, "learning_rate": 1.6171020344759936e-05, "loss": 0.546, "num_input_tokens_seen": 50940504, "step": 87740 }, { "epoch": 13.06896038129282, "grad_norm": 1.6271569728851318, "learning_rate": 1.616798038005925e-05, "loss": 0.6343, "num_input_tokens_seen": 50943192, "step": 87745 }, { "epoch": 13.06970509383378, "grad_norm": 0.8884627223014832, "learning_rate": 1.6164940564566566e-05, "loss": 0.6942, "num_input_tokens_seen": 50946136, "step": 87750 }, { "epoch": 13.07044980637474, "grad_norm": 3.2282049655914307, "learning_rate": 1.6161900898333225e-05, "loss": 0.7425, "num_input_tokens_seen": 50949304, "step": 87755 }, { "epoch": 13.071194518915698, "grad_norm": 1.0237774848937988, "learning_rate": 1.6158861381410593e-05, "loss": 0.4655, "num_input_tokens_seen": 50952152, "step": 87760 }, { "epoch": 13.071939231456657, "grad_norm": 0.8794053196907043, "learning_rate": 1.6155822013850004e-05, "loss": 0.5305, "num_input_tokens_seen": 50955288, "step": 87765 }, { "epoch": 13.072683943997617, "grad_norm": 0.9205459952354431, "learning_rate": 1.615278279570282e-05, "loss": 0.5147, "num_input_tokens_seen": 50957816, "step": 87770 }, { "epoch": 13.073428656538576, "grad_norm": 2.2885141372680664, "learning_rate": 1.614974372702038e-05, "loss": 0.5632, "num_input_tokens_seen": 50960504, "step": 87775 }, { "epoch": 13.074173369079535, "grad_norm": 2.081171989440918, "learning_rate": 1.6146704807854014e-05, "loss": 0.4368, "num_input_tokens_seen": 50963256, "step": 87780 }, { "epoch": 13.074918081620494, "grad_norm": 1.3646646738052368, "learning_rate": 1.6143666038255084e-05, "loss": 0.5486, "num_input_tokens_seen": 50966296, "step": 87785 }, { "epoch": 13.075662794161454, "grad_norm": 1.7956488132476807, "learning_rate": 1.614062741827491e-05, "loss": 0.7291, "num_input_tokens_seen": 50969144, "step": 87790 }, { "epoch": 13.076407506702413, "grad_norm": 2.8594586849212646, "learning_rate": 1.6137588947964838e-05, "loss": 0.561, "num_input_tokens_seen": 50971864, "step": 87795 }, { "epoch": 13.077152219243372, "grad_norm": 1.3357871770858765, "learning_rate": 1.613455062737618e-05, "loss": 0.525, "num_input_tokens_seen": 50974776, "step": 87800 }, { "epoch": 13.07789693178433, "grad_norm": 2.6483020782470703, "learning_rate": 1.613151245656029e-05, "loss": 0.5552, "num_input_tokens_seen": 50977432, "step": 87805 }, { "epoch": 13.078641644325291, "grad_norm": 2.0356802940368652, "learning_rate": 1.612847443556847e-05, "loss": 0.556, "num_input_tokens_seen": 50980568, "step": 87810 }, { "epoch": 13.07938635686625, "grad_norm": 1.2769306898117065, "learning_rate": 1.6125436564452075e-05, "loss": 0.747, "num_input_tokens_seen": 50983672, "step": 87815 }, { "epoch": 13.080131069407209, "grad_norm": 1.9738869667053223, "learning_rate": 1.6122398843262405e-05, "loss": 0.543, "num_input_tokens_seen": 50986648, "step": 87820 }, { "epoch": 13.080875781948167, "grad_norm": 3.739506483078003, "learning_rate": 1.6119361272050777e-05, "loss": 0.8784, "num_input_tokens_seen": 50989816, "step": 87825 }, { "epoch": 13.081620494489128, "grad_norm": 1.744245171546936, "learning_rate": 1.6116323850868526e-05, "loss": 0.4975, "num_input_tokens_seen": 50992696, "step": 87830 }, { "epoch": 13.082365207030087, "grad_norm": 1.1337854862213135, "learning_rate": 1.611328657976694e-05, "loss": 0.474, "num_input_tokens_seen": 50995608, "step": 87835 }, { "epoch": 13.083109919571045, "grad_norm": 1.2540781497955322, "learning_rate": 1.6110249458797355e-05, "loss": 0.5443, "num_input_tokens_seen": 50998392, "step": 87840 }, { "epoch": 13.083854632112004, "grad_norm": 1.3514282703399658, "learning_rate": 1.610721248801106e-05, "loss": 0.7073, "num_input_tokens_seen": 51001336, "step": 87845 }, { "epoch": 13.084599344652965, "grad_norm": 1.464489459991455, "learning_rate": 1.610417566745938e-05, "loss": 0.6463, "num_input_tokens_seen": 51004024, "step": 87850 }, { "epoch": 13.085344057193923, "grad_norm": 1.0674175024032593, "learning_rate": 1.6101138997193615e-05, "loss": 0.6245, "num_input_tokens_seen": 51007320, "step": 87855 }, { "epoch": 13.086088769734882, "grad_norm": 3.266359806060791, "learning_rate": 1.6098102477265057e-05, "loss": 0.6987, "num_input_tokens_seen": 51010456, "step": 87860 }, { "epoch": 13.086833482275841, "grad_norm": 1.503062129020691, "learning_rate": 1.6095066107725015e-05, "loss": 0.6201, "num_input_tokens_seen": 51013432, "step": 87865 }, { "epoch": 13.087578194816802, "grad_norm": 1.958038330078125, "learning_rate": 1.609202988862477e-05, "loss": 0.7574, "num_input_tokens_seen": 51016280, "step": 87870 }, { "epoch": 13.08832290735776, "grad_norm": 3.5091817378997803, "learning_rate": 1.6088993820015634e-05, "loss": 0.7157, "num_input_tokens_seen": 51019448, "step": 87875 }, { "epoch": 13.089067619898719, "grad_norm": 0.7723231315612793, "learning_rate": 1.608595790194889e-05, "loss": 0.729, "num_input_tokens_seen": 51022360, "step": 87880 }, { "epoch": 13.089812332439678, "grad_norm": 3.4816582202911377, "learning_rate": 1.6082922134475823e-05, "loss": 0.5655, "num_input_tokens_seen": 51025080, "step": 87885 }, { "epoch": 13.090557044980638, "grad_norm": 1.9686098098754883, "learning_rate": 1.6079886517647723e-05, "loss": 0.7409, "num_input_tokens_seen": 51027832, "step": 87890 }, { "epoch": 13.091301757521597, "grad_norm": 1.3209800720214844, "learning_rate": 1.6076851051515884e-05, "loss": 0.5222, "num_input_tokens_seen": 51030808, "step": 87895 }, { "epoch": 13.092046470062556, "grad_norm": 1.3241928815841675, "learning_rate": 1.6073815736131577e-05, "loss": 0.5109, "num_input_tokens_seen": 51034104, "step": 87900 }, { "epoch": 13.092791182603515, "grad_norm": 1.553646206855774, "learning_rate": 1.6070780571546066e-05, "loss": 0.4858, "num_input_tokens_seen": 51037048, "step": 87905 }, { "epoch": 13.093535895144473, "grad_norm": 1.076370120048523, "learning_rate": 1.6067745557810656e-05, "loss": 0.4804, "num_input_tokens_seen": 51040184, "step": 87910 }, { "epoch": 13.094280607685434, "grad_norm": 2.0596065521240234, "learning_rate": 1.60647106949766e-05, "loss": 0.5605, "num_input_tokens_seen": 51042808, "step": 87915 }, { "epoch": 13.095025320226393, "grad_norm": 1.3716530799865723, "learning_rate": 1.6061675983095177e-05, "loss": 0.4789, "num_input_tokens_seen": 51045432, "step": 87920 }, { "epoch": 13.095770032767351, "grad_norm": 1.8498013019561768, "learning_rate": 1.605864142221765e-05, "loss": 0.5883, "num_input_tokens_seen": 51048536, "step": 87925 }, { "epoch": 13.09651474530831, "grad_norm": 1.5014489889144897, "learning_rate": 1.60556070123953e-05, "loss": 0.719, "num_input_tokens_seen": 51051448, "step": 87930 }, { "epoch": 13.09725945784927, "grad_norm": 1.6823515892028809, "learning_rate": 1.6052572753679372e-05, "loss": 0.4543, "num_input_tokens_seen": 51054328, "step": 87935 }, { "epoch": 13.09800417039023, "grad_norm": 3.2473013401031494, "learning_rate": 1.604953864612113e-05, "loss": 0.6805, "num_input_tokens_seen": 51057080, "step": 87940 }, { "epoch": 13.098748882931188, "grad_norm": 1.0479894876480103, "learning_rate": 1.6046504689771842e-05, "loss": 0.5539, "num_input_tokens_seen": 51060120, "step": 87945 }, { "epoch": 13.099493595472147, "grad_norm": 0.5037450790405273, "learning_rate": 1.6043470884682753e-05, "loss": 0.5019, "num_input_tokens_seen": 51063160, "step": 87950 }, { "epoch": 13.100238308013108, "grad_norm": 1.0883405208587646, "learning_rate": 1.6040437230905126e-05, "loss": 0.6648, "num_input_tokens_seen": 51066136, "step": 87955 }, { "epoch": 13.100983020554066, "grad_norm": 1.1786847114562988, "learning_rate": 1.6037403728490193e-05, "loss": 0.594, "num_input_tokens_seen": 51068888, "step": 87960 }, { "epoch": 13.101727733095025, "grad_norm": 1.554009199142456, "learning_rate": 1.6034370377489227e-05, "loss": 0.5881, "num_input_tokens_seen": 51072056, "step": 87965 }, { "epoch": 13.102472445635984, "grad_norm": 2.3845605850219727, "learning_rate": 1.6031337177953455e-05, "loss": 0.7087, "num_input_tokens_seen": 51075032, "step": 87970 }, { "epoch": 13.103217158176944, "grad_norm": 1.8736587762832642, "learning_rate": 1.602830412993413e-05, "loss": 0.5774, "num_input_tokens_seen": 51077944, "step": 87975 }, { "epoch": 13.103961870717903, "grad_norm": 0.9471107125282288, "learning_rate": 1.6025271233482492e-05, "loss": 0.4099, "num_input_tokens_seen": 51080632, "step": 87980 }, { "epoch": 13.104706583258862, "grad_norm": 2.0574731826782227, "learning_rate": 1.6022238488649764e-05, "loss": 0.5585, "num_input_tokens_seen": 51083352, "step": 87985 }, { "epoch": 13.10545129579982, "grad_norm": 0.9169963002204895, "learning_rate": 1.6019205895487204e-05, "loss": 0.5962, "num_input_tokens_seen": 51086360, "step": 87990 }, { "epoch": 13.106196008340781, "grad_norm": 2.407670736312866, "learning_rate": 1.6016173454046018e-05, "loss": 0.7195, "num_input_tokens_seen": 51089144, "step": 87995 }, { "epoch": 13.10694072088174, "grad_norm": 1.4610600471496582, "learning_rate": 1.6013141164377467e-05, "loss": 0.7437, "num_input_tokens_seen": 51092088, "step": 88000 }, { "epoch": 13.107685433422699, "grad_norm": 1.5490204095840454, "learning_rate": 1.6010109026532747e-05, "loss": 0.5372, "num_input_tokens_seen": 51094968, "step": 88005 }, { "epoch": 13.108430145963657, "grad_norm": 1.3905025720596313, "learning_rate": 1.600707704056311e-05, "loss": 0.5232, "num_input_tokens_seen": 51098136, "step": 88010 }, { "epoch": 13.109174858504618, "grad_norm": 1.1450201272964478, "learning_rate": 1.6004045206519763e-05, "loss": 0.6779, "num_input_tokens_seen": 51101048, "step": 88015 }, { "epoch": 13.109919571045577, "grad_norm": 1.0940629243850708, "learning_rate": 1.6001013524453928e-05, "loss": 0.6381, "num_input_tokens_seen": 51103736, "step": 88020 }, { "epoch": 13.110664283586535, "grad_norm": 1.9942361116409302, "learning_rate": 1.599798199441683e-05, "loss": 0.7406, "num_input_tokens_seen": 51106840, "step": 88025 }, { "epoch": 13.111408996127494, "grad_norm": 1.9111897945404053, "learning_rate": 1.5994950616459664e-05, "loss": 0.7022, "num_input_tokens_seen": 51109592, "step": 88030 }, { "epoch": 13.112153708668455, "grad_norm": 1.8326585292816162, "learning_rate": 1.5991919390633662e-05, "loss": 0.6719, "num_input_tokens_seen": 51112440, "step": 88035 }, { "epoch": 13.112898421209414, "grad_norm": 1.2877273559570312, "learning_rate": 1.5988888316990018e-05, "loss": 0.5654, "num_input_tokens_seen": 51115128, "step": 88040 }, { "epoch": 13.113643133750372, "grad_norm": 1.3261810541152954, "learning_rate": 1.5985857395579963e-05, "loss": 0.5717, "num_input_tokens_seen": 51117720, "step": 88045 }, { "epoch": 13.114387846291331, "grad_norm": 1.9020602703094482, "learning_rate": 1.5982826626454678e-05, "loss": 0.6234, "num_input_tokens_seen": 51120792, "step": 88050 }, { "epoch": 13.115132558832292, "grad_norm": 2.448702335357666, "learning_rate": 1.5979796009665376e-05, "loss": 0.5428, "num_input_tokens_seen": 51123576, "step": 88055 }, { "epoch": 13.11587727137325, "grad_norm": 1.2197116613388062, "learning_rate": 1.5976765545263254e-05, "loss": 0.6942, "num_input_tokens_seen": 51126648, "step": 88060 }, { "epoch": 13.116621983914209, "grad_norm": 1.6123402118682861, "learning_rate": 1.5973735233299496e-05, "loss": 0.5948, "num_input_tokens_seen": 51129432, "step": 88065 }, { "epoch": 13.117366696455168, "grad_norm": 1.425391435623169, "learning_rate": 1.5970705073825315e-05, "loss": 0.7058, "num_input_tokens_seen": 51132344, "step": 88070 }, { "epoch": 13.118111408996128, "grad_norm": 1.2015774250030518, "learning_rate": 1.596767506689189e-05, "loss": 0.5821, "num_input_tokens_seen": 51135480, "step": 88075 }, { "epoch": 13.118856121537087, "grad_norm": 1.06791090965271, "learning_rate": 1.5964645212550422e-05, "loss": 0.4989, "num_input_tokens_seen": 51138424, "step": 88080 }, { "epoch": 13.119600834078046, "grad_norm": 2.003735065460205, "learning_rate": 1.5961615510852083e-05, "loss": 0.6617, "num_input_tokens_seen": 51141272, "step": 88085 }, { "epoch": 13.120345546619005, "grad_norm": 1.238456130027771, "learning_rate": 1.5958585961848072e-05, "loss": 0.5174, "num_input_tokens_seen": 51144248, "step": 88090 }, { "epoch": 13.121090259159963, "grad_norm": 2.4486420154571533, "learning_rate": 1.5955556565589564e-05, "loss": 0.5027, "num_input_tokens_seen": 51147256, "step": 88095 }, { "epoch": 13.121834971700924, "grad_norm": 1.4552441835403442, "learning_rate": 1.5952527322127718e-05, "loss": 0.705, "num_input_tokens_seen": 51150392, "step": 88100 }, { "epoch": 13.122579684241883, "grad_norm": 2.124899387359619, "learning_rate": 1.5949498231513744e-05, "loss": 0.5834, "num_input_tokens_seen": 51153304, "step": 88105 }, { "epoch": 13.123324396782841, "grad_norm": 1.814893364906311, "learning_rate": 1.5946469293798788e-05, "loss": 0.6364, "num_input_tokens_seen": 51156312, "step": 88110 }, { "epoch": 13.1240691093238, "grad_norm": 1.0643812417984009, "learning_rate": 1.5943440509034038e-05, "loss": 0.596, "num_input_tokens_seen": 51159224, "step": 88115 }, { "epoch": 13.12481382186476, "grad_norm": 1.6255390644073486, "learning_rate": 1.5940411877270655e-05, "loss": 0.4604, "num_input_tokens_seen": 51162232, "step": 88120 }, { "epoch": 13.12555853440572, "grad_norm": 0.979245662689209, "learning_rate": 1.5937383398559808e-05, "loss": 0.6395, "num_input_tokens_seen": 51164984, "step": 88125 }, { "epoch": 13.126303246946678, "grad_norm": 1.4638277292251587, "learning_rate": 1.593435507295265e-05, "loss": 0.6033, "num_input_tokens_seen": 51167672, "step": 88130 }, { "epoch": 13.127047959487637, "grad_norm": 1.93770432472229, "learning_rate": 1.5931326900500353e-05, "loss": 0.6367, "num_input_tokens_seen": 51170392, "step": 88135 }, { "epoch": 13.127792672028598, "grad_norm": 1.5844131708145142, "learning_rate": 1.5928298881254077e-05, "loss": 0.6044, "num_input_tokens_seen": 51173048, "step": 88140 }, { "epoch": 13.128537384569556, "grad_norm": 1.3323802947998047, "learning_rate": 1.5925271015264962e-05, "loss": 0.4345, "num_input_tokens_seen": 51175960, "step": 88145 }, { "epoch": 13.129282097110515, "grad_norm": 1.2889782190322876, "learning_rate": 1.5922243302584176e-05, "loss": 0.646, "num_input_tokens_seen": 51179032, "step": 88150 }, { "epoch": 13.130026809651474, "grad_norm": 1.744986891746521, "learning_rate": 1.5919215743262862e-05, "loss": 0.5819, "num_input_tokens_seen": 51182008, "step": 88155 }, { "epoch": 13.130771522192434, "grad_norm": 1.3535934686660767, "learning_rate": 1.591618833735217e-05, "loss": 0.6566, "num_input_tokens_seen": 51184952, "step": 88160 }, { "epoch": 13.131516234733393, "grad_norm": 1.524744987487793, "learning_rate": 1.5913161084903237e-05, "loss": 0.6845, "num_input_tokens_seen": 51187960, "step": 88165 }, { "epoch": 13.132260947274352, "grad_norm": 1.1715753078460693, "learning_rate": 1.591013398596722e-05, "loss": 0.598, "num_input_tokens_seen": 51190808, "step": 88170 }, { "epoch": 13.13300565981531, "grad_norm": 1.5727699995040894, "learning_rate": 1.5907107040595255e-05, "loss": 0.5713, "num_input_tokens_seen": 51193656, "step": 88175 }, { "epoch": 13.133750372356271, "grad_norm": 1.112330675125122, "learning_rate": 1.590408024883846e-05, "loss": 0.5083, "num_input_tokens_seen": 51196568, "step": 88180 }, { "epoch": 13.13449508489723, "grad_norm": 1.1259636878967285, "learning_rate": 1.5901053610747995e-05, "loss": 0.4364, "num_input_tokens_seen": 51199160, "step": 88185 }, { "epoch": 13.135239797438189, "grad_norm": 1.6490168571472168, "learning_rate": 1.5898027126374974e-05, "loss": 0.5408, "num_input_tokens_seen": 51201944, "step": 88190 }, { "epoch": 13.135984509979147, "grad_norm": 1.0672677755355835, "learning_rate": 1.5895000795770547e-05, "loss": 0.6321, "num_input_tokens_seen": 51204728, "step": 88195 }, { "epoch": 13.136729222520108, "grad_norm": 3.823535203933716, "learning_rate": 1.589197461898581e-05, "loss": 0.6117, "num_input_tokens_seen": 51207640, "step": 88200 }, { "epoch": 13.137473935061067, "grad_norm": 1.6013730764389038, "learning_rate": 1.588894859607192e-05, "loss": 0.5648, "num_input_tokens_seen": 51210360, "step": 88205 }, { "epoch": 13.138218647602026, "grad_norm": 1.4884287118911743, "learning_rate": 1.5885922727079977e-05, "loss": 0.6515, "num_input_tokens_seen": 51212920, "step": 88210 }, { "epoch": 13.138963360142984, "grad_norm": 1.3911875486373901, "learning_rate": 1.5882897012061104e-05, "loss": 0.6285, "num_input_tokens_seen": 51215736, "step": 88215 }, { "epoch": 13.139708072683945, "grad_norm": 1.3805862665176392, "learning_rate": 1.5879871451066424e-05, "loss": 0.6399, "num_input_tokens_seen": 51218872, "step": 88220 }, { "epoch": 13.140452785224904, "grad_norm": 0.9748815298080444, "learning_rate": 1.5876846044147048e-05, "loss": 0.6567, "num_input_tokens_seen": 51221848, "step": 88225 }, { "epoch": 13.141197497765862, "grad_norm": 0.8134578466415405, "learning_rate": 1.5873820791354085e-05, "loss": 0.481, "num_input_tokens_seen": 51224632, "step": 88230 }, { "epoch": 13.141942210306821, "grad_norm": 1.2272756099700928, "learning_rate": 1.5870795692738635e-05, "loss": 0.596, "num_input_tokens_seen": 51227608, "step": 88235 }, { "epoch": 13.142686922847782, "grad_norm": 1.8579926490783691, "learning_rate": 1.5867770748351822e-05, "loss": 0.5689, "num_input_tokens_seen": 51230232, "step": 88240 }, { "epoch": 13.14343163538874, "grad_norm": 1.0001672506332397, "learning_rate": 1.586474595824474e-05, "loss": 0.7019, "num_input_tokens_seen": 51233656, "step": 88245 }, { "epoch": 13.1441763479297, "grad_norm": 1.5800610780715942, "learning_rate": 1.5861721322468487e-05, "loss": 0.7149, "num_input_tokens_seen": 51236696, "step": 88250 }, { "epoch": 13.144921060470658, "grad_norm": 1.6259357929229736, "learning_rate": 1.5858696841074166e-05, "loss": 0.5174, "num_input_tokens_seen": 51239768, "step": 88255 }, { "epoch": 13.145665773011617, "grad_norm": 0.5573622584342957, "learning_rate": 1.5855672514112876e-05, "loss": 0.6897, "num_input_tokens_seen": 51242584, "step": 88260 }, { "epoch": 13.146410485552577, "grad_norm": 2.382969856262207, "learning_rate": 1.5852648341635705e-05, "loss": 0.6505, "num_input_tokens_seen": 51245688, "step": 88265 }, { "epoch": 13.147155198093536, "grad_norm": 1.0576722621917725, "learning_rate": 1.5849624323693736e-05, "loss": 0.5356, "num_input_tokens_seen": 51248696, "step": 88270 }, { "epoch": 13.147899910634495, "grad_norm": 1.5040355920791626, "learning_rate": 1.5846600460338068e-05, "loss": 0.5779, "num_input_tokens_seen": 51251512, "step": 88275 }, { "epoch": 13.148644623175453, "grad_norm": 2.337249279022217, "learning_rate": 1.584357675161978e-05, "loss": 0.5741, "num_input_tokens_seen": 51254200, "step": 88280 }, { "epoch": 13.149389335716414, "grad_norm": 2.4821577072143555, "learning_rate": 1.5840553197589964e-05, "loss": 0.5578, "num_input_tokens_seen": 51257176, "step": 88285 }, { "epoch": 13.150134048257373, "grad_norm": 1.6836884021759033, "learning_rate": 1.5837529798299677e-05, "loss": 0.6372, "num_input_tokens_seen": 51260312, "step": 88290 }, { "epoch": 13.150878760798332, "grad_norm": 1.3929052352905273, "learning_rate": 1.5834506553800026e-05, "loss": 0.5665, "num_input_tokens_seen": 51263320, "step": 88295 }, { "epoch": 13.15162347333929, "grad_norm": 1.7098461389541626, "learning_rate": 1.583148346414207e-05, "loss": 0.5639, "num_input_tokens_seen": 51266200, "step": 88300 }, { "epoch": 13.15236818588025, "grad_norm": 1.1986913681030273, "learning_rate": 1.5828460529376876e-05, "loss": 0.4033, "num_input_tokens_seen": 51268984, "step": 88305 }, { "epoch": 13.15311289842121, "grad_norm": 1.2999069690704346, "learning_rate": 1.5825437749555525e-05, "loss": 0.5587, "num_input_tokens_seen": 51271800, "step": 88310 }, { "epoch": 13.153857610962168, "grad_norm": 1.561190128326416, "learning_rate": 1.582241512472907e-05, "loss": 0.4389, "num_input_tokens_seen": 51274712, "step": 88315 }, { "epoch": 13.154602323503127, "grad_norm": 1.8544952869415283, "learning_rate": 1.58193926549486e-05, "loss": 0.5836, "num_input_tokens_seen": 51278168, "step": 88320 }, { "epoch": 13.155347036044088, "grad_norm": 1.9015965461730957, "learning_rate": 1.5816370340265144e-05, "loss": 0.6773, "num_input_tokens_seen": 51280728, "step": 88325 }, { "epoch": 13.156091748585046, "grad_norm": 2.4191696643829346, "learning_rate": 1.5813348180729788e-05, "loss": 0.4947, "num_input_tokens_seen": 51283800, "step": 88330 }, { "epoch": 13.156836461126005, "grad_norm": 1.1254545450210571, "learning_rate": 1.5810326176393566e-05, "loss": 0.5862, "num_input_tokens_seen": 51286808, "step": 88335 }, { "epoch": 13.157581173666964, "grad_norm": 1.2249667644500732, "learning_rate": 1.5807304327307556e-05, "loss": 0.7712, "num_input_tokens_seen": 51289816, "step": 88340 }, { "epoch": 13.158325886207924, "grad_norm": 0.9931701421737671, "learning_rate": 1.5804282633522793e-05, "loss": 0.4831, "num_input_tokens_seen": 51292696, "step": 88345 }, { "epoch": 13.159070598748883, "grad_norm": 0.8955250382423401, "learning_rate": 1.5801261095090325e-05, "loss": 0.6334, "num_input_tokens_seen": 51295896, "step": 88350 }, { "epoch": 13.159815311289842, "grad_norm": 1.109858512878418, "learning_rate": 1.579823971206121e-05, "loss": 0.4699, "num_input_tokens_seen": 51298744, "step": 88355 }, { "epoch": 13.1605600238308, "grad_norm": 1.714519739151001, "learning_rate": 1.5795218484486468e-05, "loss": 0.5809, "num_input_tokens_seen": 51301816, "step": 88360 }, { "epoch": 13.161304736371761, "grad_norm": 0.9377855658531189, "learning_rate": 1.5792197412417167e-05, "loss": 0.5174, "num_input_tokens_seen": 51304664, "step": 88365 }, { "epoch": 13.16204944891272, "grad_norm": 1.4501591920852661, "learning_rate": 1.578917649590432e-05, "loss": 0.6306, "num_input_tokens_seen": 51307544, "step": 88370 }, { "epoch": 13.162794161453679, "grad_norm": 1.7481484413146973, "learning_rate": 1.5786155734998988e-05, "loss": 0.5544, "num_input_tokens_seen": 51310360, "step": 88375 }, { "epoch": 13.163538873994638, "grad_norm": 3.787599802017212, "learning_rate": 1.578313512975219e-05, "loss": 0.7853, "num_input_tokens_seen": 51313720, "step": 88380 }, { "epoch": 13.164283586535598, "grad_norm": 1.5901012420654297, "learning_rate": 1.5780114680214948e-05, "loss": 0.6258, "num_input_tokens_seen": 51316504, "step": 88385 }, { "epoch": 13.165028299076557, "grad_norm": 1.9002220630645752, "learning_rate": 1.5777094386438306e-05, "loss": 0.6763, "num_input_tokens_seen": 51319672, "step": 88390 }, { "epoch": 13.165773011617516, "grad_norm": 1.3652211427688599, "learning_rate": 1.577407424847327e-05, "loss": 0.5339, "num_input_tokens_seen": 51322616, "step": 88395 }, { "epoch": 13.166517724158474, "grad_norm": 0.8734329342842102, "learning_rate": 1.5771054266370882e-05, "loss": 0.6221, "num_input_tokens_seen": 51325656, "step": 88400 }, { "epoch": 13.167262436699435, "grad_norm": 1.3940237760543823, "learning_rate": 1.5768034440182143e-05, "loss": 0.5466, "num_input_tokens_seen": 51328472, "step": 88405 }, { "epoch": 13.168007149240394, "grad_norm": 2.086949586868286, "learning_rate": 1.576501476995809e-05, "loss": 0.6291, "num_input_tokens_seen": 51331384, "step": 88410 }, { "epoch": 13.168751861781352, "grad_norm": 1.321286678314209, "learning_rate": 1.576199525574972e-05, "loss": 0.5779, "num_input_tokens_seen": 51334264, "step": 88415 }, { "epoch": 13.169496574322311, "grad_norm": 3.135756015777588, "learning_rate": 1.5758975897608056e-05, "loss": 0.8173, "num_input_tokens_seen": 51337176, "step": 88420 }, { "epoch": 13.17024128686327, "grad_norm": 1.4247349500656128, "learning_rate": 1.57559566955841e-05, "loss": 0.4307, "num_input_tokens_seen": 51339928, "step": 88425 }, { "epoch": 13.17098599940423, "grad_norm": 1.4778929948806763, "learning_rate": 1.5752937649728854e-05, "loss": 0.5711, "num_input_tokens_seen": 51342648, "step": 88430 }, { "epoch": 13.17173071194519, "grad_norm": 2.994058847427368, "learning_rate": 1.574991876009334e-05, "loss": 0.5685, "num_input_tokens_seen": 51345208, "step": 88435 }, { "epoch": 13.172475424486148, "grad_norm": 1.2324271202087402, "learning_rate": 1.574690002672853e-05, "loss": 0.451, "num_input_tokens_seen": 51348184, "step": 88440 }, { "epoch": 13.173220137027107, "grad_norm": 1.9656471014022827, "learning_rate": 1.5743881449685456e-05, "loss": 0.5457, "num_input_tokens_seen": 51350808, "step": 88445 }, { "epoch": 13.173964849568067, "grad_norm": 2.1227774620056152, "learning_rate": 1.574086302901509e-05, "loss": 0.732, "num_input_tokens_seen": 51353464, "step": 88450 }, { "epoch": 13.174709562109026, "grad_norm": 1.1919091939926147, "learning_rate": 1.5737844764768437e-05, "loss": 0.5715, "num_input_tokens_seen": 51356600, "step": 88455 }, { "epoch": 13.175454274649985, "grad_norm": 1.5118385553359985, "learning_rate": 1.5734826656996482e-05, "loss": 0.556, "num_input_tokens_seen": 51359352, "step": 88460 }, { "epoch": 13.176198987190944, "grad_norm": 1.1593141555786133, "learning_rate": 1.5731808705750206e-05, "loss": 0.5752, "num_input_tokens_seen": 51362136, "step": 88465 }, { "epoch": 13.176943699731904, "grad_norm": 1.8480560779571533, "learning_rate": 1.5728790911080612e-05, "loss": 0.575, "num_input_tokens_seen": 51365112, "step": 88470 }, { "epoch": 13.177688412272863, "grad_norm": 1.811399221420288, "learning_rate": 1.5725773273038662e-05, "loss": 0.4842, "num_input_tokens_seen": 51367960, "step": 88475 }, { "epoch": 13.178433124813822, "grad_norm": 1.387723445892334, "learning_rate": 1.5722755791675358e-05, "loss": 0.7559, "num_input_tokens_seen": 51370744, "step": 88480 }, { "epoch": 13.17917783735478, "grad_norm": 1.3847577571868896, "learning_rate": 1.571973846704166e-05, "loss": 0.5526, "num_input_tokens_seen": 51373560, "step": 88485 }, { "epoch": 13.17992254989574, "grad_norm": 1.3770627975463867, "learning_rate": 1.5716721299188553e-05, "loss": 0.5846, "num_input_tokens_seen": 51376728, "step": 88490 }, { "epoch": 13.1806672624367, "grad_norm": 1.544957160949707, "learning_rate": 1.5713704288166998e-05, "loss": 0.7444, "num_input_tokens_seen": 51379576, "step": 88495 }, { "epoch": 13.181411974977658, "grad_norm": 1.8193726539611816, "learning_rate": 1.5710687434027976e-05, "loss": 0.4858, "num_input_tokens_seen": 51382488, "step": 88500 }, { "epoch": 13.182156687518617, "grad_norm": 1.7568180561065674, "learning_rate": 1.5707670736822448e-05, "loss": 0.541, "num_input_tokens_seen": 51385176, "step": 88505 }, { "epoch": 13.182901400059578, "grad_norm": 1.2846413850784302, "learning_rate": 1.5704654196601375e-05, "loss": 0.5429, "num_input_tokens_seen": 51387864, "step": 88510 }, { "epoch": 13.183646112600536, "grad_norm": 0.7452306747436523, "learning_rate": 1.5701637813415728e-05, "loss": 0.7494, "num_input_tokens_seen": 51390584, "step": 88515 }, { "epoch": 13.184390825141495, "grad_norm": 2.0821168422698975, "learning_rate": 1.5698621587316454e-05, "loss": 0.5546, "num_input_tokens_seen": 51393368, "step": 88520 }, { "epoch": 13.185135537682454, "grad_norm": 1.3358540534973145, "learning_rate": 1.5695605518354524e-05, "loss": 0.4872, "num_input_tokens_seen": 51396216, "step": 88525 }, { "epoch": 13.185880250223414, "grad_norm": 1.377402663230896, "learning_rate": 1.5692589606580866e-05, "loss": 0.5493, "num_input_tokens_seen": 51399288, "step": 88530 }, { "epoch": 13.186624962764373, "grad_norm": 1.2580666542053223, "learning_rate": 1.568957385204646e-05, "loss": 0.6618, "num_input_tokens_seen": 51402264, "step": 88535 }, { "epoch": 13.187369675305332, "grad_norm": 1.599586009979248, "learning_rate": 1.568655825480224e-05, "loss": 0.5725, "num_input_tokens_seen": 51405048, "step": 88540 }, { "epoch": 13.18811438784629, "grad_norm": 1.2356990575790405, "learning_rate": 1.5683542814899144e-05, "loss": 0.612, "num_input_tokens_seen": 51407800, "step": 88545 }, { "epoch": 13.188859100387251, "grad_norm": 2.620561122894287, "learning_rate": 1.5680527532388133e-05, "loss": 0.5401, "num_input_tokens_seen": 51410424, "step": 88550 }, { "epoch": 13.18960381292821, "grad_norm": 1.3136252164840698, "learning_rate": 1.5677512407320133e-05, "loss": 0.693, "num_input_tokens_seen": 51413464, "step": 88555 }, { "epoch": 13.190348525469169, "grad_norm": 1.6353740692138672, "learning_rate": 1.5674497439746088e-05, "loss": 0.5778, "num_input_tokens_seen": 51416568, "step": 88560 }, { "epoch": 13.191093238010128, "grad_norm": 2.214552879333496, "learning_rate": 1.5671482629716926e-05, "loss": 0.6501, "num_input_tokens_seen": 51419512, "step": 88565 }, { "epoch": 13.191837950551088, "grad_norm": 1.4318716526031494, "learning_rate": 1.566846797728359e-05, "loss": 0.4468, "num_input_tokens_seen": 51422424, "step": 88570 }, { "epoch": 13.192582663092047, "grad_norm": 1.6866766214370728, "learning_rate": 1.5665453482497e-05, "loss": 0.607, "num_input_tokens_seen": 51425240, "step": 88575 }, { "epoch": 13.193327375633006, "grad_norm": 2.146005868911743, "learning_rate": 1.5662439145408084e-05, "loss": 0.7104, "num_input_tokens_seen": 51427832, "step": 88580 }, { "epoch": 13.194072088173964, "grad_norm": 1.9839394092559814, "learning_rate": 1.5659424966067775e-05, "loss": 0.6419, "num_input_tokens_seen": 51430552, "step": 88585 }, { "epoch": 13.194816800714925, "grad_norm": 1.324506163597107, "learning_rate": 1.5656410944526984e-05, "loss": 0.5839, "num_input_tokens_seen": 51433496, "step": 88590 }, { "epoch": 13.195561513255884, "grad_norm": 1.948386311531067, "learning_rate": 1.5653397080836633e-05, "loss": 0.6314, "num_input_tokens_seen": 51436248, "step": 88595 }, { "epoch": 13.196306225796842, "grad_norm": 1.839235782623291, "learning_rate": 1.5650383375047634e-05, "loss": 0.4428, "num_input_tokens_seen": 51439224, "step": 88600 }, { "epoch": 13.197050938337801, "grad_norm": 1.0581505298614502, "learning_rate": 1.5647369827210917e-05, "loss": 0.5114, "num_input_tokens_seen": 51442168, "step": 88605 }, { "epoch": 13.19779565087876, "grad_norm": 1.3573962450027466, "learning_rate": 1.5644356437377373e-05, "loss": 0.6567, "num_input_tokens_seen": 51445048, "step": 88610 }, { "epoch": 13.19854036341972, "grad_norm": 1.2275922298431396, "learning_rate": 1.5641343205597925e-05, "loss": 0.5367, "num_input_tokens_seen": 51447896, "step": 88615 }, { "epoch": 13.19928507596068, "grad_norm": 1.258986473083496, "learning_rate": 1.5638330131923465e-05, "loss": 0.6357, "num_input_tokens_seen": 51450904, "step": 88620 }, { "epoch": 13.200029788501638, "grad_norm": 1.3629058599472046, "learning_rate": 1.5635317216404906e-05, "loss": 0.5902, "num_input_tokens_seen": 51453784, "step": 88625 }, { "epoch": 13.200774501042597, "grad_norm": 1.7508115768432617, "learning_rate": 1.5632304459093145e-05, "loss": 0.5027, "num_input_tokens_seen": 51456600, "step": 88630 }, { "epoch": 13.201519213583557, "grad_norm": 2.061732769012451, "learning_rate": 1.562929186003907e-05, "loss": 0.8097, "num_input_tokens_seen": 51459352, "step": 88635 }, { "epoch": 13.202263926124516, "grad_norm": 0.8120457530021667, "learning_rate": 1.56262794192936e-05, "loss": 0.6413, "num_input_tokens_seen": 51462424, "step": 88640 }, { "epoch": 13.203008638665475, "grad_norm": 2.5070149898529053, "learning_rate": 1.5623267136907602e-05, "loss": 0.5594, "num_input_tokens_seen": 51464984, "step": 88645 }, { "epoch": 13.203753351206434, "grad_norm": 1.5928491353988647, "learning_rate": 1.5620255012931984e-05, "loss": 0.6582, "num_input_tokens_seen": 51467896, "step": 88650 }, { "epoch": 13.204498063747394, "grad_norm": 1.1994634866714478, "learning_rate": 1.5617243047417614e-05, "loss": 0.6286, "num_input_tokens_seen": 51470648, "step": 88655 }, { "epoch": 13.205242776288353, "grad_norm": 1.182234525680542, "learning_rate": 1.5614231240415393e-05, "loss": 0.7399, "num_input_tokens_seen": 51473976, "step": 88660 }, { "epoch": 13.205987488829312, "grad_norm": 1.2313555479049683, "learning_rate": 1.5611219591976198e-05, "loss": 0.741, "num_input_tokens_seen": 51476920, "step": 88665 }, { "epoch": 13.20673220137027, "grad_norm": 1.872759461402893, "learning_rate": 1.5608208102150895e-05, "loss": 0.7248, "num_input_tokens_seen": 51479736, "step": 88670 }, { "epoch": 13.207476913911231, "grad_norm": 1.8583173751831055, "learning_rate": 1.560519677099038e-05, "loss": 0.5885, "num_input_tokens_seen": 51482392, "step": 88675 }, { "epoch": 13.20822162645219, "grad_norm": 1.5553712844848633, "learning_rate": 1.5602185598545515e-05, "loss": 0.5979, "num_input_tokens_seen": 51485208, "step": 88680 }, { "epoch": 13.208966338993148, "grad_norm": 1.0014885663986206, "learning_rate": 1.5599174584867177e-05, "loss": 0.5992, "num_input_tokens_seen": 51488120, "step": 88685 }, { "epoch": 13.209711051534107, "grad_norm": 1.7008930444717407, "learning_rate": 1.5596163730006218e-05, "loss": 0.596, "num_input_tokens_seen": 51490968, "step": 88690 }, { "epoch": 13.210455764075068, "grad_norm": 1.3051398992538452, "learning_rate": 1.5593153034013526e-05, "loss": 0.6618, "num_input_tokens_seen": 51493848, "step": 88695 }, { "epoch": 13.211200476616026, "grad_norm": 1.3833621740341187, "learning_rate": 1.5590142496939956e-05, "loss": 0.6283, "num_input_tokens_seen": 51496888, "step": 88700 }, { "epoch": 13.211945189156985, "grad_norm": 0.837661623954773, "learning_rate": 1.558713211883635e-05, "loss": 0.6306, "num_input_tokens_seen": 51499864, "step": 88705 }, { "epoch": 13.212689901697944, "grad_norm": 1.4059518575668335, "learning_rate": 1.5584121899753595e-05, "loss": 0.6068, "num_input_tokens_seen": 51502872, "step": 88710 }, { "epoch": 13.213434614238905, "grad_norm": 1.422011137008667, "learning_rate": 1.558111183974252e-05, "loss": 0.6765, "num_input_tokens_seen": 51505528, "step": 88715 }, { "epoch": 13.214179326779863, "grad_norm": 1.350569248199463, "learning_rate": 1.5578101938853994e-05, "loss": 0.548, "num_input_tokens_seen": 51508440, "step": 88720 }, { "epoch": 13.214924039320822, "grad_norm": 1.262738585472107, "learning_rate": 1.5575092197138852e-05, "loss": 0.6444, "num_input_tokens_seen": 51511384, "step": 88725 }, { "epoch": 13.21566875186178, "grad_norm": 1.3798213005065918, "learning_rate": 1.557208261464796e-05, "loss": 0.5117, "num_input_tokens_seen": 51514008, "step": 88730 }, { "epoch": 13.216413464402741, "grad_norm": 1.5454977750778198, "learning_rate": 1.556907319143214e-05, "loss": 0.5971, "num_input_tokens_seen": 51516920, "step": 88735 }, { "epoch": 13.2171581769437, "grad_norm": 1.8718504905700684, "learning_rate": 1.5566063927542245e-05, "loss": 0.4936, "num_input_tokens_seen": 51520504, "step": 88740 }, { "epoch": 13.217902889484659, "grad_norm": 2.133538246154785, "learning_rate": 1.5563054823029122e-05, "loss": 0.6313, "num_input_tokens_seen": 51523512, "step": 88745 }, { "epoch": 13.218647602025618, "grad_norm": 1.9423645734786987, "learning_rate": 1.5560045877943585e-05, "loss": 0.6817, "num_input_tokens_seen": 51526264, "step": 88750 }, { "epoch": 13.219392314566578, "grad_norm": 1.6670055389404297, "learning_rate": 1.5557037092336486e-05, "loss": 0.6924, "num_input_tokens_seen": 51529528, "step": 88755 }, { "epoch": 13.220137027107537, "grad_norm": 2.1246769428253174, "learning_rate": 1.5554028466258634e-05, "loss": 0.7306, "num_input_tokens_seen": 51532056, "step": 88760 }, { "epoch": 13.220881739648496, "grad_norm": 1.973534107208252, "learning_rate": 1.5551019999760885e-05, "loss": 0.7819, "num_input_tokens_seen": 51534776, "step": 88765 }, { "epoch": 13.221626452189454, "grad_norm": 1.5138121843338013, "learning_rate": 1.554801169289404e-05, "loss": 0.7024, "num_input_tokens_seen": 51537592, "step": 88770 }, { "epoch": 13.222371164730415, "grad_norm": 1.8072980642318726, "learning_rate": 1.554500354570894e-05, "loss": 0.5812, "num_input_tokens_seen": 51540248, "step": 88775 }, { "epoch": 13.223115877271374, "grad_norm": 1.7237377166748047, "learning_rate": 1.5541995558256394e-05, "loss": 0.6229, "num_input_tokens_seen": 51542968, "step": 88780 }, { "epoch": 13.223860589812332, "grad_norm": 2.0981101989746094, "learning_rate": 1.5538987730587217e-05, "loss": 0.6443, "num_input_tokens_seen": 51545688, "step": 88785 }, { "epoch": 13.224605302353291, "grad_norm": 1.072751522064209, "learning_rate": 1.553598006275223e-05, "loss": 0.5698, "num_input_tokens_seen": 51548568, "step": 88790 }, { "epoch": 13.22535001489425, "grad_norm": 2.2811834812164307, "learning_rate": 1.5532972554802232e-05, "loss": 0.6291, "num_input_tokens_seen": 51551288, "step": 88795 }, { "epoch": 13.22609472743521, "grad_norm": 1.9700212478637695, "learning_rate": 1.552996520678805e-05, "loss": 0.7118, "num_input_tokens_seen": 51554200, "step": 88800 }, { "epoch": 13.22683943997617, "grad_norm": 1.4325357675552368, "learning_rate": 1.5526958018760473e-05, "loss": 0.5237, "num_input_tokens_seen": 51557048, "step": 88805 }, { "epoch": 13.227584152517128, "grad_norm": 1.8430455923080444, "learning_rate": 1.552395099077032e-05, "loss": 0.6983, "num_input_tokens_seen": 51559960, "step": 88810 }, { "epoch": 13.228328865058087, "grad_norm": 1.0994101762771606, "learning_rate": 1.552094412286838e-05, "loss": 0.5202, "num_input_tokens_seen": 51562840, "step": 88815 }, { "epoch": 13.229073577599047, "grad_norm": 2.649446487426758, "learning_rate": 1.551793741510546e-05, "loss": 0.529, "num_input_tokens_seen": 51565976, "step": 88820 }, { "epoch": 13.229818290140006, "grad_norm": 3.5495142936706543, "learning_rate": 1.5514930867532352e-05, "loss": 0.5994, "num_input_tokens_seen": 51568760, "step": 88825 }, { "epoch": 13.230563002680965, "grad_norm": 1.775863528251648, "learning_rate": 1.5511924480199836e-05, "loss": 0.6787, "num_input_tokens_seen": 51571608, "step": 88830 }, { "epoch": 13.231307715221924, "grad_norm": 0.981706440448761, "learning_rate": 1.550891825315872e-05, "loss": 0.4523, "num_input_tokens_seen": 51574520, "step": 88835 }, { "epoch": 13.232052427762884, "grad_norm": 1.7027398347854614, "learning_rate": 1.5505912186459775e-05, "loss": 0.6805, "num_input_tokens_seen": 51577496, "step": 88840 }, { "epoch": 13.232797140303843, "grad_norm": 2.647686719894409, "learning_rate": 1.5502906280153806e-05, "loss": 0.66, "num_input_tokens_seen": 51580344, "step": 88845 }, { "epoch": 13.233541852844802, "grad_norm": 1.6931369304656982, "learning_rate": 1.5499900534291575e-05, "loss": 0.5044, "num_input_tokens_seen": 51583352, "step": 88850 }, { "epoch": 13.23428656538576, "grad_norm": 0.9565962553024292, "learning_rate": 1.5496894948923873e-05, "loss": 0.3236, "num_input_tokens_seen": 51586008, "step": 88855 }, { "epoch": 13.235031277926721, "grad_norm": 1.4083340167999268, "learning_rate": 1.5493889524101467e-05, "loss": 0.6931, "num_input_tokens_seen": 51588760, "step": 88860 }, { "epoch": 13.23577599046768, "grad_norm": 2.2381579875946045, "learning_rate": 1.5490884259875143e-05, "loss": 0.7084, "num_input_tokens_seen": 51591416, "step": 88865 }, { "epoch": 13.236520703008638, "grad_norm": 1.3622418642044067, "learning_rate": 1.5487879156295665e-05, "loss": 0.5983, "num_input_tokens_seen": 51594552, "step": 88870 }, { "epoch": 13.237265415549597, "grad_norm": 1.5774269104003906, "learning_rate": 1.548487421341379e-05, "loss": 0.5476, "num_input_tokens_seen": 51597528, "step": 88875 }, { "epoch": 13.238010128090558, "grad_norm": 1.0760942697525024, "learning_rate": 1.548186943128031e-05, "loss": 0.8762, "num_input_tokens_seen": 51600472, "step": 88880 }, { "epoch": 13.238754840631517, "grad_norm": 2.1492278575897217, "learning_rate": 1.5478864809945965e-05, "loss": 0.5718, "num_input_tokens_seen": 51603224, "step": 88885 }, { "epoch": 13.239499553172475, "grad_norm": 2.1117727756500244, "learning_rate": 1.5475860349461524e-05, "loss": 0.8096, "num_input_tokens_seen": 51606200, "step": 88890 }, { "epoch": 13.240244265713434, "grad_norm": 1.5181002616882324, "learning_rate": 1.5472856049877733e-05, "loss": 0.5868, "num_input_tokens_seen": 51609112, "step": 88895 }, { "epoch": 13.240988978254395, "grad_norm": 1.332169532775879, "learning_rate": 1.5469851911245368e-05, "loss": 0.5498, "num_input_tokens_seen": 51612248, "step": 88900 }, { "epoch": 13.241733690795353, "grad_norm": 1.2381926774978638, "learning_rate": 1.5466847933615165e-05, "loss": 0.5561, "num_input_tokens_seen": 51614840, "step": 88905 }, { "epoch": 13.242478403336312, "grad_norm": 2.593684434890747, "learning_rate": 1.5463844117037872e-05, "loss": 0.8275, "num_input_tokens_seen": 51617592, "step": 88910 }, { "epoch": 13.24322311587727, "grad_norm": 1.0001972913742065, "learning_rate": 1.5460840461564247e-05, "loss": 0.5801, "num_input_tokens_seen": 51620600, "step": 88915 }, { "epoch": 13.243967828418231, "grad_norm": 2.302717447280884, "learning_rate": 1.5457836967245027e-05, "loss": 0.6687, "num_input_tokens_seen": 51623768, "step": 88920 }, { "epoch": 13.24471254095919, "grad_norm": 1.0705891847610474, "learning_rate": 1.5454833634130955e-05, "loss": 0.6522, "num_input_tokens_seen": 51626808, "step": 88925 }, { "epoch": 13.245457253500149, "grad_norm": 2.8016884326934814, "learning_rate": 1.5451830462272753e-05, "loss": 0.5212, "num_input_tokens_seen": 51629880, "step": 88930 }, { "epoch": 13.246201966041108, "grad_norm": 1.4913769960403442, "learning_rate": 1.5448827451721188e-05, "loss": 0.4435, "num_input_tokens_seen": 51632696, "step": 88935 }, { "epoch": 13.246946678582066, "grad_norm": 2.7828025817871094, "learning_rate": 1.5445824602526966e-05, "loss": 0.6093, "num_input_tokens_seen": 51635288, "step": 88940 }, { "epoch": 13.247691391123027, "grad_norm": 1.273382306098938, "learning_rate": 1.5442821914740836e-05, "loss": 0.6145, "num_input_tokens_seen": 51638008, "step": 88945 }, { "epoch": 13.248436103663986, "grad_norm": 2.362046003341675, "learning_rate": 1.543981938841351e-05, "loss": 0.7615, "num_input_tokens_seen": 51640824, "step": 88950 }, { "epoch": 13.249180816204944, "grad_norm": 1.7833963632583618, "learning_rate": 1.5436817023595716e-05, "loss": 0.569, "num_input_tokens_seen": 51643640, "step": 88955 }, { "epoch": 13.249925528745903, "grad_norm": 1.0344864130020142, "learning_rate": 1.543381482033819e-05, "loss": 0.4425, "num_input_tokens_seen": 51646296, "step": 88960 }, { "epoch": 13.250670241286864, "grad_norm": 1.3211828470230103, "learning_rate": 1.5430812778691626e-05, "loss": 0.6277, "num_input_tokens_seen": 51649016, "step": 88965 }, { "epoch": 13.251414953827823, "grad_norm": 0.9710714817047119, "learning_rate": 1.5427810898706764e-05, "loss": 0.5789, "num_input_tokens_seen": 51652216, "step": 88970 }, { "epoch": 13.252159666368781, "grad_norm": 1.8292051553726196, "learning_rate": 1.54248091804343e-05, "loss": 0.444, "num_input_tokens_seen": 51654904, "step": 88975 }, { "epoch": 13.25290437890974, "grad_norm": 2.1959476470947266, "learning_rate": 1.5421807623924968e-05, "loss": 0.5314, "num_input_tokens_seen": 51657592, "step": 88980 }, { "epoch": 13.2536490914507, "grad_norm": 1.491050124168396, "learning_rate": 1.5418806229229452e-05, "loss": 0.5553, "num_input_tokens_seen": 51660376, "step": 88985 }, { "epoch": 13.25439380399166, "grad_norm": 2.4843356609344482, "learning_rate": 1.541580499639846e-05, "loss": 0.5236, "num_input_tokens_seen": 51663480, "step": 88990 }, { "epoch": 13.255138516532618, "grad_norm": 3.2262156009674072, "learning_rate": 1.541280392548271e-05, "loss": 0.8033, "num_input_tokens_seen": 51666264, "step": 88995 }, { "epoch": 13.255883229073577, "grad_norm": 1.4508960247039795, "learning_rate": 1.5409803016532888e-05, "loss": 0.4702, "num_input_tokens_seen": 51668984, "step": 89000 }, { "epoch": 13.256627941614537, "grad_norm": 3.217695713043213, "learning_rate": 1.5406802269599703e-05, "loss": 0.6144, "num_input_tokens_seen": 51671832, "step": 89005 }, { "epoch": 13.257372654155496, "grad_norm": 1.1759740114212036, "learning_rate": 1.540380168473384e-05, "loss": 0.539, "num_input_tokens_seen": 51674744, "step": 89010 }, { "epoch": 13.258117366696455, "grad_norm": 2.6219234466552734, "learning_rate": 1.5400801261986e-05, "loss": 0.731, "num_input_tokens_seen": 51677624, "step": 89015 }, { "epoch": 13.258862079237414, "grad_norm": 1.239508867263794, "learning_rate": 1.5397801001406857e-05, "loss": 0.6359, "num_input_tokens_seen": 51680664, "step": 89020 }, { "epoch": 13.259606791778374, "grad_norm": 1.1089900732040405, "learning_rate": 1.5394800903047114e-05, "loss": 0.5715, "num_input_tokens_seen": 51683512, "step": 89025 }, { "epoch": 13.260351504319333, "grad_norm": 1.5963636636734009, "learning_rate": 1.5391800966957448e-05, "loss": 0.5305, "num_input_tokens_seen": 51686680, "step": 89030 }, { "epoch": 13.261096216860292, "grad_norm": 1.8704272508621216, "learning_rate": 1.538880119318853e-05, "loss": 0.716, "num_input_tokens_seen": 51689592, "step": 89035 }, { "epoch": 13.26184092940125, "grad_norm": 1.8964382410049438, "learning_rate": 1.538580158179106e-05, "loss": 0.7148, "num_input_tokens_seen": 51692696, "step": 89040 }, { "epoch": 13.262585641942211, "grad_norm": 1.3881410360336304, "learning_rate": 1.5382802132815694e-05, "loss": 0.4812, "num_input_tokens_seen": 51695480, "step": 89045 }, { "epoch": 13.26333035448317, "grad_norm": 1.5570166110992432, "learning_rate": 1.5379802846313115e-05, "loss": 0.3896, "num_input_tokens_seen": 51698328, "step": 89050 }, { "epoch": 13.264075067024129, "grad_norm": 1.7321813106536865, "learning_rate": 1.5376803722333983e-05, "loss": 0.7262, "num_input_tokens_seen": 51701208, "step": 89055 }, { "epoch": 13.264819779565087, "grad_norm": 2.748293876647949, "learning_rate": 1.5373804760928978e-05, "loss": 0.6047, "num_input_tokens_seen": 51704056, "step": 89060 }, { "epoch": 13.265564492106048, "grad_norm": 1.2841291427612305, "learning_rate": 1.537080596214876e-05, "loss": 0.5283, "num_input_tokens_seen": 51706840, "step": 89065 }, { "epoch": 13.266309204647007, "grad_norm": 1.1403958797454834, "learning_rate": 1.5367807326043976e-05, "loss": 0.5029, "num_input_tokens_seen": 51709848, "step": 89070 }, { "epoch": 13.267053917187965, "grad_norm": 1.041683316230774, "learning_rate": 1.5364808852665307e-05, "loss": 0.5804, "num_input_tokens_seen": 51712888, "step": 89075 }, { "epoch": 13.267798629728924, "grad_norm": 2.2221381664276123, "learning_rate": 1.53618105420634e-05, "loss": 0.7415, "num_input_tokens_seen": 51715832, "step": 89080 }, { "epoch": 13.268543342269885, "grad_norm": 1.4101955890655518, "learning_rate": 1.5358812394288906e-05, "loss": 0.4222, "num_input_tokens_seen": 51718776, "step": 89085 }, { "epoch": 13.269288054810843, "grad_norm": 2.206355571746826, "learning_rate": 1.5355814409392475e-05, "loss": 0.5206, "num_input_tokens_seen": 51721656, "step": 89090 }, { "epoch": 13.270032767351802, "grad_norm": 1.2847219705581665, "learning_rate": 1.5352816587424762e-05, "loss": 0.5579, "num_input_tokens_seen": 51724248, "step": 89095 }, { "epoch": 13.270777479892761, "grad_norm": 1.1588939428329468, "learning_rate": 1.53498189284364e-05, "loss": 0.4925, "num_input_tokens_seen": 51727128, "step": 89100 }, { "epoch": 13.271522192433721, "grad_norm": 1.412083625793457, "learning_rate": 1.534682143247805e-05, "loss": 0.5163, "num_input_tokens_seen": 51730104, "step": 89105 }, { "epoch": 13.27226690497468, "grad_norm": 1.1245054006576538, "learning_rate": 1.534382409960034e-05, "loss": 0.6274, "num_input_tokens_seen": 51732856, "step": 89110 }, { "epoch": 13.273011617515639, "grad_norm": 1.4590749740600586, "learning_rate": 1.5340826929853903e-05, "loss": 0.614, "num_input_tokens_seen": 51735896, "step": 89115 }, { "epoch": 13.273756330056598, "grad_norm": 1.3085858821868896, "learning_rate": 1.5337829923289382e-05, "loss": 0.6155, "num_input_tokens_seen": 51738872, "step": 89120 }, { "epoch": 13.274501042597556, "grad_norm": 1.2440146207809448, "learning_rate": 1.5334833079957394e-05, "loss": 0.4406, "num_input_tokens_seen": 51741528, "step": 89125 }, { "epoch": 13.275245755138517, "grad_norm": 1.3129240274429321, "learning_rate": 1.5331836399908588e-05, "loss": 0.6291, "num_input_tokens_seen": 51744568, "step": 89130 }, { "epoch": 13.275990467679476, "grad_norm": 0.7957062721252441, "learning_rate": 1.5328839883193575e-05, "loss": 0.5258, "num_input_tokens_seen": 51747608, "step": 89135 }, { "epoch": 13.276735180220435, "grad_norm": 1.9612782001495361, "learning_rate": 1.5325843529862987e-05, "loss": 0.6011, "num_input_tokens_seen": 51750840, "step": 89140 }, { "epoch": 13.277479892761393, "grad_norm": 0.8915624022483826, "learning_rate": 1.532284733996744e-05, "loss": 0.5657, "num_input_tokens_seen": 51754040, "step": 89145 }, { "epoch": 13.278224605302354, "grad_norm": 2.570464611053467, "learning_rate": 1.5319851313557548e-05, "loss": 0.656, "num_input_tokens_seen": 51756856, "step": 89150 }, { "epoch": 13.278969317843313, "grad_norm": 2.095592498779297, "learning_rate": 1.5316855450683937e-05, "loss": 0.7079, "num_input_tokens_seen": 51759992, "step": 89155 }, { "epoch": 13.279714030384271, "grad_norm": 1.144797682762146, "learning_rate": 1.53138597513972e-05, "loss": 0.5553, "num_input_tokens_seen": 51762680, "step": 89160 }, { "epoch": 13.28045874292523, "grad_norm": 1.9205173254013062, "learning_rate": 1.5310864215747966e-05, "loss": 0.6561, "num_input_tokens_seen": 51765624, "step": 89165 }, { "epoch": 13.28120345546619, "grad_norm": 1.1846003532409668, "learning_rate": 1.5307868843786828e-05, "loss": 0.6711, "num_input_tokens_seen": 51768792, "step": 89170 }, { "epoch": 13.28194816800715, "grad_norm": 1.8317288160324097, "learning_rate": 1.53048736355644e-05, "loss": 0.6533, "num_input_tokens_seen": 51771480, "step": 89175 }, { "epoch": 13.282692880548108, "grad_norm": 1.2135405540466309, "learning_rate": 1.5301878591131273e-05, "loss": 0.5659, "num_input_tokens_seen": 51774552, "step": 89180 }, { "epoch": 13.283437593089067, "grad_norm": 2.4490673542022705, "learning_rate": 1.529888371053806e-05, "loss": 0.67, "num_input_tokens_seen": 51777528, "step": 89185 }, { "epoch": 13.284182305630027, "grad_norm": 1.2482408285140991, "learning_rate": 1.5295888993835345e-05, "loss": 0.5048, "num_input_tokens_seen": 51780504, "step": 89190 }, { "epoch": 13.284927018170986, "grad_norm": 0.7964333295822144, "learning_rate": 1.5292894441073712e-05, "loss": 0.6197, "num_input_tokens_seen": 51783416, "step": 89195 }, { "epoch": 13.285671730711945, "grad_norm": 1.9011080265045166, "learning_rate": 1.5289900052303774e-05, "loss": 0.6323, "num_input_tokens_seen": 51786360, "step": 89200 }, { "epoch": 13.286416443252904, "grad_norm": 1.6626607179641724, "learning_rate": 1.5286905827576094e-05, "loss": 0.6166, "num_input_tokens_seen": 51789048, "step": 89205 }, { "epoch": 13.287161155793864, "grad_norm": 1.0929453372955322, "learning_rate": 1.5283911766941277e-05, "loss": 0.6293, "num_input_tokens_seen": 51791992, "step": 89210 }, { "epoch": 13.287905868334823, "grad_norm": 1.300758957862854, "learning_rate": 1.528091787044989e-05, "loss": 0.7102, "num_input_tokens_seen": 51794968, "step": 89215 }, { "epoch": 13.288650580875782, "grad_norm": 1.4014915227890015, "learning_rate": 1.5277924138152528e-05, "loss": 0.7609, "num_input_tokens_seen": 51797720, "step": 89220 }, { "epoch": 13.28939529341674, "grad_norm": 2.499460220336914, "learning_rate": 1.527493057009975e-05, "loss": 0.7172, "num_input_tokens_seen": 51800472, "step": 89225 }, { "epoch": 13.290140005957701, "grad_norm": 2.1652300357818604, "learning_rate": 1.5271937166342132e-05, "loss": 0.7461, "num_input_tokens_seen": 51803448, "step": 89230 }, { "epoch": 13.29088471849866, "grad_norm": 3.2675206661224365, "learning_rate": 1.526894392693025e-05, "loss": 0.6542, "num_input_tokens_seen": 51806616, "step": 89235 }, { "epoch": 13.291629431039619, "grad_norm": 2.503333330154419, "learning_rate": 1.5265950851914668e-05, "loss": 0.608, "num_input_tokens_seen": 51809432, "step": 89240 }, { "epoch": 13.292374143580577, "grad_norm": 2.305774688720703, "learning_rate": 1.526295794134596e-05, "loss": 0.5688, "num_input_tokens_seen": 51812376, "step": 89245 }, { "epoch": 13.293118856121538, "grad_norm": 1.1777164936065674, "learning_rate": 1.5259965195274678e-05, "loss": 0.5051, "num_input_tokens_seen": 51815256, "step": 89250 }, { "epoch": 13.293863568662497, "grad_norm": 1.859578013420105, "learning_rate": 1.5256972613751386e-05, "loss": 0.5826, "num_input_tokens_seen": 51817912, "step": 89255 }, { "epoch": 13.294608281203455, "grad_norm": 0.7987407445907593, "learning_rate": 1.5253980196826634e-05, "loss": 0.6335, "num_input_tokens_seen": 51820568, "step": 89260 }, { "epoch": 13.295352993744414, "grad_norm": 1.3834648132324219, "learning_rate": 1.5250987944550988e-05, "loss": 0.4876, "num_input_tokens_seen": 51823576, "step": 89265 }, { "epoch": 13.296097706285375, "grad_norm": 1.2130128145217896, "learning_rate": 1.5247995856974995e-05, "loss": 0.5339, "num_input_tokens_seen": 51826552, "step": 89270 }, { "epoch": 13.296842418826333, "grad_norm": 3.554725170135498, "learning_rate": 1.5245003934149194e-05, "loss": 0.7405, "num_input_tokens_seen": 51829304, "step": 89275 }, { "epoch": 13.297587131367292, "grad_norm": 1.10334312915802, "learning_rate": 1.5242012176124135e-05, "loss": 0.5874, "num_input_tokens_seen": 51832184, "step": 89280 }, { "epoch": 13.298331843908251, "grad_norm": 1.8851046562194824, "learning_rate": 1.5239020582950364e-05, "loss": 0.5623, "num_input_tokens_seen": 51834744, "step": 89285 }, { "epoch": 13.299076556449211, "grad_norm": 1.7438055276870728, "learning_rate": 1.5236029154678425e-05, "loss": 0.6108, "num_input_tokens_seen": 51837720, "step": 89290 }, { "epoch": 13.29982126899017, "grad_norm": 1.1736689805984497, "learning_rate": 1.523303789135884e-05, "loss": 0.4503, "num_input_tokens_seen": 51840664, "step": 89295 }, { "epoch": 13.300565981531129, "grad_norm": 1.3549338579177856, "learning_rate": 1.5230046793042163e-05, "loss": 0.7215, "num_input_tokens_seen": 51843448, "step": 89300 }, { "epoch": 13.301310694072088, "grad_norm": 2.7236664295196533, "learning_rate": 1.5227055859778917e-05, "loss": 0.6452, "num_input_tokens_seen": 51846584, "step": 89305 }, { "epoch": 13.302055406613047, "grad_norm": 1.765344262123108, "learning_rate": 1.5224065091619622e-05, "loss": 0.7001, "num_input_tokens_seen": 51849624, "step": 89310 }, { "epoch": 13.302800119154007, "grad_norm": 3.4217426776885986, "learning_rate": 1.5221074488614818e-05, "loss": 0.6423, "num_input_tokens_seen": 51852632, "step": 89315 }, { "epoch": 13.303544831694966, "grad_norm": 2.6512722969055176, "learning_rate": 1.521808405081501e-05, "loss": 0.631, "num_input_tokens_seen": 51855672, "step": 89320 }, { "epoch": 13.304289544235925, "grad_norm": 1.4222087860107422, "learning_rate": 1.521509377827074e-05, "loss": 0.4558, "num_input_tokens_seen": 51858456, "step": 89325 }, { "epoch": 13.305034256776883, "grad_norm": 0.8595162630081177, "learning_rate": 1.5212103671032507e-05, "loss": 0.5269, "num_input_tokens_seen": 51861176, "step": 89330 }, { "epoch": 13.305778969317844, "grad_norm": 1.120839238166809, "learning_rate": 1.5209113729150845e-05, "loss": 0.5963, "num_input_tokens_seen": 51863832, "step": 89335 }, { "epoch": 13.306523681858803, "grad_norm": 1.589505672454834, "learning_rate": 1.520612395267625e-05, "loss": 0.6254, "num_input_tokens_seen": 51866872, "step": 89340 }, { "epoch": 13.307268394399761, "grad_norm": 1.4934759140014648, "learning_rate": 1.5203134341659242e-05, "loss": 0.6798, "num_input_tokens_seen": 51869976, "step": 89345 }, { "epoch": 13.30801310694072, "grad_norm": 2.269967794418335, "learning_rate": 1.520014489615032e-05, "loss": 0.5392, "num_input_tokens_seen": 51872856, "step": 89350 }, { "epoch": 13.30875781948168, "grad_norm": 1.7889128923416138, "learning_rate": 1.5197155616199982e-05, "loss": 0.656, "num_input_tokens_seen": 51875832, "step": 89355 }, { "epoch": 13.30950253202264, "grad_norm": 1.7572511434555054, "learning_rate": 1.5194166501858747e-05, "loss": 0.6814, "num_input_tokens_seen": 51878680, "step": 89360 }, { "epoch": 13.310247244563598, "grad_norm": 2.2751216888427734, "learning_rate": 1.5191177553177094e-05, "loss": 0.6194, "num_input_tokens_seen": 51881752, "step": 89365 }, { "epoch": 13.310991957104557, "grad_norm": 1.6612576246261597, "learning_rate": 1.5188188770205533e-05, "loss": 0.599, "num_input_tokens_seen": 51884760, "step": 89370 }, { "epoch": 13.311736669645517, "grad_norm": 1.6336921453475952, "learning_rate": 1.518520015299455e-05, "loss": 0.4757, "num_input_tokens_seen": 51887544, "step": 89375 }, { "epoch": 13.312481382186476, "grad_norm": 1.4485536813735962, "learning_rate": 1.5182211701594634e-05, "loss": 0.6541, "num_input_tokens_seen": 51890424, "step": 89380 }, { "epoch": 13.313226094727435, "grad_norm": 2.5418217182159424, "learning_rate": 1.5179223416056268e-05, "loss": 0.6292, "num_input_tokens_seen": 51893464, "step": 89385 }, { "epoch": 13.313970807268394, "grad_norm": 1.6696473360061646, "learning_rate": 1.517623529642995e-05, "loss": 0.8219, "num_input_tokens_seen": 51896600, "step": 89390 }, { "epoch": 13.314715519809354, "grad_norm": 1.5183342695236206, "learning_rate": 1.517324734276615e-05, "loss": 0.583, "num_input_tokens_seen": 51899384, "step": 89395 }, { "epoch": 13.315460232350313, "grad_norm": 1.8894742727279663, "learning_rate": 1.5170259555115343e-05, "loss": 0.6272, "num_input_tokens_seen": 51902136, "step": 89400 }, { "epoch": 13.316204944891272, "grad_norm": 2.389064311981201, "learning_rate": 1.5167271933528015e-05, "loss": 0.5696, "num_input_tokens_seen": 51905016, "step": 89405 }, { "epoch": 13.31694965743223, "grad_norm": 2.018815279006958, "learning_rate": 1.5164284478054636e-05, "loss": 0.6293, "num_input_tokens_seen": 51907576, "step": 89410 }, { "epoch": 13.317694369973191, "grad_norm": 1.5288805961608887, "learning_rate": 1.5161297188745673e-05, "loss": 0.5516, "num_input_tokens_seen": 51910264, "step": 89415 }, { "epoch": 13.31843908251415, "grad_norm": 1.9779300689697266, "learning_rate": 1.5158310065651588e-05, "loss": 0.5839, "num_input_tokens_seen": 51913208, "step": 89420 }, { "epoch": 13.319183795055109, "grad_norm": 1.2240463495254517, "learning_rate": 1.5155323108822861e-05, "loss": 0.6415, "num_input_tokens_seen": 51916120, "step": 89425 }, { "epoch": 13.319928507596067, "grad_norm": 1.524799108505249, "learning_rate": 1.5152336318309942e-05, "loss": 0.5194, "num_input_tokens_seen": 51919704, "step": 89430 }, { "epoch": 13.320673220137028, "grad_norm": 2.5280561447143555, "learning_rate": 1.5149349694163283e-05, "loss": 0.7843, "num_input_tokens_seen": 51922456, "step": 89435 }, { "epoch": 13.321417932677987, "grad_norm": 1.7734030485153198, "learning_rate": 1.5146363236433362e-05, "loss": 0.5819, "num_input_tokens_seen": 51925496, "step": 89440 }, { "epoch": 13.322162645218945, "grad_norm": 1.8937674760818481, "learning_rate": 1.5143376945170612e-05, "loss": 0.7134, "num_input_tokens_seen": 51928216, "step": 89445 }, { "epoch": 13.322907357759904, "grad_norm": 1.8665014505386353, "learning_rate": 1.5140390820425495e-05, "loss": 0.6738, "num_input_tokens_seen": 51931160, "step": 89450 }, { "epoch": 13.323652070300863, "grad_norm": 1.4518322944641113, "learning_rate": 1.5137404862248447e-05, "loss": 0.6888, "num_input_tokens_seen": 51934040, "step": 89455 }, { "epoch": 13.324396782841823, "grad_norm": 1.2843929529190063, "learning_rate": 1.5134419070689926e-05, "loss": 0.6572, "num_input_tokens_seen": 51937272, "step": 89460 }, { "epoch": 13.325141495382782, "grad_norm": 1.7880524396896362, "learning_rate": 1.5131433445800363e-05, "loss": 0.5523, "num_input_tokens_seen": 51939992, "step": 89465 }, { "epoch": 13.325886207923741, "grad_norm": 1.3572443723678589, "learning_rate": 1.5128447987630207e-05, "loss": 0.7348, "num_input_tokens_seen": 51943096, "step": 89470 }, { "epoch": 13.3266309204647, "grad_norm": 0.8235379457473755, "learning_rate": 1.5125462696229892e-05, "loss": 0.5908, "num_input_tokens_seen": 51945976, "step": 89475 }, { "epoch": 13.32737563300566, "grad_norm": 2.0558881759643555, "learning_rate": 1.5122477571649846e-05, "loss": 0.6836, "num_input_tokens_seen": 51948856, "step": 89480 }, { "epoch": 13.328120345546619, "grad_norm": 1.699330449104309, "learning_rate": 1.5119492613940503e-05, "loss": 0.6404, "num_input_tokens_seen": 51951800, "step": 89485 }, { "epoch": 13.328865058087578, "grad_norm": 4.679932594299316, "learning_rate": 1.5116507823152282e-05, "loss": 0.582, "num_input_tokens_seen": 51954648, "step": 89490 }, { "epoch": 13.329609770628537, "grad_norm": 2.412552833557129, "learning_rate": 1.5113523199335624e-05, "loss": 0.4569, "num_input_tokens_seen": 51957496, "step": 89495 }, { "epoch": 13.330354483169497, "grad_norm": 1.8495759963989258, "learning_rate": 1.5110538742540936e-05, "loss": 0.6046, "num_input_tokens_seen": 51960472, "step": 89500 }, { "epoch": 13.331099195710456, "grad_norm": 1.6968579292297363, "learning_rate": 1.5107554452818653e-05, "loss": 0.6322, "num_input_tokens_seen": 51963288, "step": 89505 }, { "epoch": 13.331843908251415, "grad_norm": 1.1695677042007446, "learning_rate": 1.5104570330219187e-05, "loss": 0.5625, "num_input_tokens_seen": 51965976, "step": 89510 }, { "epoch": 13.332588620792373, "grad_norm": 2.279741048812866, "learning_rate": 1.510158637479294e-05, "loss": 0.7233, "num_input_tokens_seen": 51969112, "step": 89515 }, { "epoch": 13.333333333333334, "grad_norm": 0.8876289129257202, "learning_rate": 1.5098602586590335e-05, "loss": 0.4368, "num_input_tokens_seen": 51971896, "step": 89520 }, { "epoch": 13.334078045874293, "grad_norm": 1.3163676261901855, "learning_rate": 1.5095618965661767e-05, "loss": 0.4679, "num_input_tokens_seen": 51974712, "step": 89525 }, { "epoch": 13.334822758415251, "grad_norm": 2.0020227432250977, "learning_rate": 1.5092635512057662e-05, "loss": 0.6812, "num_input_tokens_seen": 51977848, "step": 89530 }, { "epoch": 13.33556747095621, "grad_norm": 1.6964131593704224, "learning_rate": 1.5089652225828399e-05, "loss": 0.6777, "num_input_tokens_seen": 51980792, "step": 89535 }, { "epoch": 13.33631218349717, "grad_norm": 1.168542742729187, "learning_rate": 1.5086669107024398e-05, "loss": 0.588, "num_input_tokens_seen": 51983544, "step": 89540 }, { "epoch": 13.33705689603813, "grad_norm": 2.702894687652588, "learning_rate": 1.5083686155696043e-05, "loss": 0.4571, "num_input_tokens_seen": 51986520, "step": 89545 }, { "epoch": 13.337801608579088, "grad_norm": 0.8574885725975037, "learning_rate": 1.5080703371893737e-05, "loss": 0.5135, "num_input_tokens_seen": 51989272, "step": 89550 }, { "epoch": 13.338546321120047, "grad_norm": 1.9221469163894653, "learning_rate": 1.5077720755667868e-05, "loss": 0.5533, "num_input_tokens_seen": 51992312, "step": 89555 }, { "epoch": 13.339291033661008, "grad_norm": 1.3277149200439453, "learning_rate": 1.5074738307068809e-05, "loss": 0.4316, "num_input_tokens_seen": 51995224, "step": 89560 }, { "epoch": 13.340035746201966, "grad_norm": 1.146126389503479, "learning_rate": 1.5071756026146972e-05, "loss": 0.5991, "num_input_tokens_seen": 51997880, "step": 89565 }, { "epoch": 13.340780458742925, "grad_norm": 1.1136348247528076, "learning_rate": 1.506877391295271e-05, "loss": 0.6701, "num_input_tokens_seen": 52000696, "step": 89570 }, { "epoch": 13.341525171283884, "grad_norm": 1.7883903980255127, "learning_rate": 1.5065791967536436e-05, "loss": 0.4984, "num_input_tokens_seen": 52003576, "step": 89575 }, { "epoch": 13.342269883824844, "grad_norm": 2.671438455581665, "learning_rate": 1.50628101899485e-05, "loss": 0.5187, "num_input_tokens_seen": 52006232, "step": 89580 }, { "epoch": 13.343014596365803, "grad_norm": 1.4871490001678467, "learning_rate": 1.5059828580239296e-05, "loss": 0.5262, "num_input_tokens_seen": 52009016, "step": 89585 }, { "epoch": 13.343759308906762, "grad_norm": 0.7357370257377625, "learning_rate": 1.5056847138459185e-05, "loss": 0.5945, "num_input_tokens_seen": 52011896, "step": 89590 }, { "epoch": 13.34450402144772, "grad_norm": 1.355384111404419, "learning_rate": 1.5053865864658523e-05, "loss": 0.5341, "num_input_tokens_seen": 52014872, "step": 89595 }, { "epoch": 13.345248733988681, "grad_norm": 1.447778582572937, "learning_rate": 1.5050884758887698e-05, "loss": 0.6388, "num_input_tokens_seen": 52017848, "step": 89600 }, { "epoch": 13.34599344652964, "grad_norm": 2.4936270713806152, "learning_rate": 1.504790382119706e-05, "loss": 0.5466, "num_input_tokens_seen": 52020664, "step": 89605 }, { "epoch": 13.346738159070599, "grad_norm": 0.815943717956543, "learning_rate": 1.5044923051636972e-05, "loss": 0.5722, "num_input_tokens_seen": 52023448, "step": 89610 }, { "epoch": 13.347482871611557, "grad_norm": 1.2127249240875244, "learning_rate": 1.504194245025779e-05, "loss": 0.6422, "num_input_tokens_seen": 52026616, "step": 89615 }, { "epoch": 13.348227584152518, "grad_norm": 1.6014277935028076, "learning_rate": 1.5038962017109875e-05, "loss": 0.6794, "num_input_tokens_seen": 52029464, "step": 89620 }, { "epoch": 13.348972296693477, "grad_norm": 1.3936783075332642, "learning_rate": 1.5035981752243561e-05, "loss": 0.7369, "num_input_tokens_seen": 52032632, "step": 89625 }, { "epoch": 13.349717009234435, "grad_norm": 2.0626797676086426, "learning_rate": 1.5033001655709222e-05, "loss": 0.5533, "num_input_tokens_seen": 52035288, "step": 89630 }, { "epoch": 13.350461721775394, "grad_norm": 2.2669761180877686, "learning_rate": 1.5030021727557189e-05, "loss": 0.6037, "num_input_tokens_seen": 52037976, "step": 89635 }, { "epoch": 13.351206434316353, "grad_norm": 2.395005702972412, "learning_rate": 1.5027041967837802e-05, "loss": 0.5347, "num_input_tokens_seen": 52041048, "step": 89640 }, { "epoch": 13.351951146857314, "grad_norm": 1.913365125656128, "learning_rate": 1.5024062376601406e-05, "loss": 0.6415, "num_input_tokens_seen": 52043704, "step": 89645 }, { "epoch": 13.352695859398272, "grad_norm": 1.2970081567764282, "learning_rate": 1.502108295389833e-05, "loss": 0.5694, "num_input_tokens_seen": 52046552, "step": 89650 }, { "epoch": 13.353440571939231, "grad_norm": 1.5923597812652588, "learning_rate": 1.5018103699778923e-05, "loss": 0.5677, "num_input_tokens_seen": 52049144, "step": 89655 }, { "epoch": 13.35418528448019, "grad_norm": 1.0789533853530884, "learning_rate": 1.5015124614293501e-05, "loss": 0.5957, "num_input_tokens_seen": 52052056, "step": 89660 }, { "epoch": 13.35492999702115, "grad_norm": 1.1796702146530151, "learning_rate": 1.5012145697492407e-05, "loss": 0.4416, "num_input_tokens_seen": 52054872, "step": 89665 }, { "epoch": 13.35567470956211, "grad_norm": 2.5348024368286133, "learning_rate": 1.5009166949425965e-05, "loss": 0.7303, "num_input_tokens_seen": 52057912, "step": 89670 }, { "epoch": 13.356419422103068, "grad_norm": 0.7938612699508667, "learning_rate": 1.5006188370144486e-05, "loss": 0.5469, "num_input_tokens_seen": 52060632, "step": 89675 }, { "epoch": 13.357164134644027, "grad_norm": 2.0594704151153564, "learning_rate": 1.5003209959698302e-05, "loss": 0.5603, "num_input_tokens_seen": 52063256, "step": 89680 }, { "epoch": 13.357908847184987, "grad_norm": 1.369742751121521, "learning_rate": 1.5000231718137717e-05, "loss": 0.5218, "num_input_tokens_seen": 52066104, "step": 89685 }, { "epoch": 13.358653559725946, "grad_norm": 1.4707752466201782, "learning_rate": 1.4997253645513063e-05, "loss": 0.6758, "num_input_tokens_seen": 52068920, "step": 89690 }, { "epoch": 13.359398272266905, "grad_norm": 0.7942457795143127, "learning_rate": 1.499427574187463e-05, "loss": 0.5013, "num_input_tokens_seen": 52071832, "step": 89695 }, { "epoch": 13.360142984807863, "grad_norm": 2.3056652545928955, "learning_rate": 1.4991298007272753e-05, "loss": 0.5547, "num_input_tokens_seen": 52075000, "step": 89700 }, { "epoch": 13.360887697348824, "grad_norm": 1.1922856569290161, "learning_rate": 1.4988320441757714e-05, "loss": 0.6303, "num_input_tokens_seen": 52077592, "step": 89705 }, { "epoch": 13.361632409889783, "grad_norm": 1.66090989112854, "learning_rate": 1.4985343045379836e-05, "loss": 0.5314, "num_input_tokens_seen": 52080536, "step": 89710 }, { "epoch": 13.362377122430741, "grad_norm": 2.7768349647521973, "learning_rate": 1.4982365818189407e-05, "loss": 0.6622, "num_input_tokens_seen": 52083704, "step": 89715 }, { "epoch": 13.3631218349717, "grad_norm": 1.1594535112380981, "learning_rate": 1.4979388760236712e-05, "loss": 0.5118, "num_input_tokens_seen": 52086872, "step": 89720 }, { "epoch": 13.36386654751266, "grad_norm": 1.5857820510864258, "learning_rate": 1.4976411871572074e-05, "loss": 0.6015, "num_input_tokens_seen": 52089816, "step": 89725 }, { "epoch": 13.36461126005362, "grad_norm": 0.9479125738143921, "learning_rate": 1.4973435152245757e-05, "loss": 0.3433, "num_input_tokens_seen": 52092664, "step": 89730 }, { "epoch": 13.365355972594578, "grad_norm": 0.8450016975402832, "learning_rate": 1.4970458602308077e-05, "loss": 0.4161, "num_input_tokens_seen": 52095640, "step": 89735 }, { "epoch": 13.366100685135537, "grad_norm": 0.9010639190673828, "learning_rate": 1.4967482221809299e-05, "loss": 0.4979, "num_input_tokens_seen": 52098360, "step": 89740 }, { "epoch": 13.366845397676498, "grad_norm": 1.5487236976623535, "learning_rate": 1.4964506010799711e-05, "loss": 0.6155, "num_input_tokens_seen": 52101400, "step": 89745 }, { "epoch": 13.367590110217456, "grad_norm": 1.6483746767044067, "learning_rate": 1.4961529969329602e-05, "loss": 0.6343, "num_input_tokens_seen": 52104504, "step": 89750 }, { "epoch": 13.368334822758415, "grad_norm": 2.250969886779785, "learning_rate": 1.4958554097449228e-05, "loss": 0.8337, "num_input_tokens_seen": 52108536, "step": 89755 }, { "epoch": 13.369079535299374, "grad_norm": 1.7046562433242798, "learning_rate": 1.4955578395208886e-05, "loss": 0.7506, "num_input_tokens_seen": 52111288, "step": 89760 }, { "epoch": 13.369824247840334, "grad_norm": 1.177213191986084, "learning_rate": 1.4952602862658832e-05, "loss": 0.724, "num_input_tokens_seen": 52115224, "step": 89765 }, { "epoch": 13.370568960381293, "grad_norm": 1.8583554029464722, "learning_rate": 1.494962749984935e-05, "loss": 0.699, "num_input_tokens_seen": 52118104, "step": 89770 }, { "epoch": 13.371313672922252, "grad_norm": 1.3380134105682373, "learning_rate": 1.4946652306830688e-05, "loss": 0.577, "num_input_tokens_seen": 52120888, "step": 89775 }, { "epoch": 13.37205838546321, "grad_norm": 1.008856177330017, "learning_rate": 1.4943677283653124e-05, "loss": 0.6584, "num_input_tokens_seen": 52123832, "step": 89780 }, { "epoch": 13.372803098004171, "grad_norm": 1.305598497390747, "learning_rate": 1.4940702430366905e-05, "loss": 0.5183, "num_input_tokens_seen": 52126584, "step": 89785 }, { "epoch": 13.37354781054513, "grad_norm": 0.9856885671615601, "learning_rate": 1.4937727747022302e-05, "loss": 0.4023, "num_input_tokens_seen": 52129560, "step": 89790 }, { "epoch": 13.374292523086089, "grad_norm": 1.2630311250686646, "learning_rate": 1.493475323366956e-05, "loss": 0.3957, "num_input_tokens_seen": 52132280, "step": 89795 }, { "epoch": 13.375037235627047, "grad_norm": 2.0277326107025146, "learning_rate": 1.4931778890358924e-05, "loss": 0.4648, "num_input_tokens_seen": 52136056, "step": 89800 }, { "epoch": 13.375781948168008, "grad_norm": 2.1627731323242188, "learning_rate": 1.492880471714066e-05, "loss": 0.6357, "num_input_tokens_seen": 52139128, "step": 89805 }, { "epoch": 13.376526660708967, "grad_norm": 0.9242542386054993, "learning_rate": 1.4925830714065003e-05, "loss": 0.4372, "num_input_tokens_seen": 52141848, "step": 89810 }, { "epoch": 13.377271373249926, "grad_norm": 0.9780434370040894, "learning_rate": 1.4922856881182199e-05, "loss": 0.3939, "num_input_tokens_seen": 52144472, "step": 89815 }, { "epoch": 13.378016085790884, "grad_norm": 2.4904980659484863, "learning_rate": 1.4919883218542474e-05, "loss": 0.6968, "num_input_tokens_seen": 52147384, "step": 89820 }, { "epoch": 13.378760798331843, "grad_norm": 3.1802256107330322, "learning_rate": 1.4916909726196093e-05, "loss": 0.771, "num_input_tokens_seen": 52150136, "step": 89825 }, { "epoch": 13.379505510872804, "grad_norm": 2.04612135887146, "learning_rate": 1.4913936404193268e-05, "loss": 0.5909, "num_input_tokens_seen": 52153048, "step": 89830 }, { "epoch": 13.380250223413762, "grad_norm": 2.8329155445098877, "learning_rate": 1.4910963252584231e-05, "loss": 0.7859, "num_input_tokens_seen": 52155992, "step": 89835 }, { "epoch": 13.380994935954721, "grad_norm": 2.924126625061035, "learning_rate": 1.4907990271419222e-05, "loss": 0.6588, "num_input_tokens_seen": 52158808, "step": 89840 }, { "epoch": 13.38173964849568, "grad_norm": 2.378225326538086, "learning_rate": 1.4905017460748458e-05, "loss": 0.6611, "num_input_tokens_seen": 52161528, "step": 89845 }, { "epoch": 13.38248436103664, "grad_norm": 1.2964566946029663, "learning_rate": 1.4902044820622168e-05, "loss": 0.6663, "num_input_tokens_seen": 52164280, "step": 89850 }, { "epoch": 13.3832290735776, "grad_norm": 1.483646035194397, "learning_rate": 1.489907235109056e-05, "loss": 0.7744, "num_input_tokens_seen": 52167480, "step": 89855 }, { "epoch": 13.383973786118558, "grad_norm": 2.3921449184417725, "learning_rate": 1.4896100052203865e-05, "loss": 0.6785, "num_input_tokens_seen": 52170296, "step": 89860 }, { "epoch": 13.384718498659517, "grad_norm": 2.0011889934539795, "learning_rate": 1.4893127924012281e-05, "loss": 0.6044, "num_input_tokens_seen": 52173464, "step": 89865 }, { "epoch": 13.385463211200477, "grad_norm": 2.4141197204589844, "learning_rate": 1.4890155966566039e-05, "loss": 0.77, "num_input_tokens_seen": 52176248, "step": 89870 }, { "epoch": 13.386207923741436, "grad_norm": 1.4863752126693726, "learning_rate": 1.4887184179915336e-05, "loss": 0.6051, "num_input_tokens_seen": 52179096, "step": 89875 }, { "epoch": 13.386952636282395, "grad_norm": 3.1103978157043457, "learning_rate": 1.4884212564110379e-05, "loss": 0.5963, "num_input_tokens_seen": 52182104, "step": 89880 }, { "epoch": 13.387697348823353, "grad_norm": 1.3847984075546265, "learning_rate": 1.4881241119201367e-05, "loss": 0.436, "num_input_tokens_seen": 52184888, "step": 89885 }, { "epoch": 13.388442061364314, "grad_norm": 1.1420694589614868, "learning_rate": 1.4878269845238496e-05, "loss": 0.6429, "num_input_tokens_seen": 52187960, "step": 89890 }, { "epoch": 13.389186773905273, "grad_norm": 1.372713565826416, "learning_rate": 1.4875298742271976e-05, "loss": 0.5545, "num_input_tokens_seen": 52190904, "step": 89895 }, { "epoch": 13.389931486446232, "grad_norm": 3.725289821624756, "learning_rate": 1.4872327810351986e-05, "loss": 0.5485, "num_input_tokens_seen": 52193880, "step": 89900 }, { "epoch": 13.39067619898719, "grad_norm": 2.2112233638763428, "learning_rate": 1.4869357049528731e-05, "loss": 0.6911, "num_input_tokens_seen": 52196760, "step": 89905 }, { "epoch": 13.39142091152815, "grad_norm": 1.6049011945724487, "learning_rate": 1.4866386459852394e-05, "loss": 0.6405, "num_input_tokens_seen": 52199832, "step": 89910 }, { "epoch": 13.39216562406911, "grad_norm": 1.2859517335891724, "learning_rate": 1.4863416041373158e-05, "loss": 0.5466, "num_input_tokens_seen": 52202488, "step": 89915 }, { "epoch": 13.392910336610068, "grad_norm": 1.3287104368209839, "learning_rate": 1.4860445794141204e-05, "loss": 0.5231, "num_input_tokens_seen": 52205304, "step": 89920 }, { "epoch": 13.393655049151027, "grad_norm": 2.031414747238159, "learning_rate": 1.4857475718206706e-05, "loss": 0.4877, "num_input_tokens_seen": 52207960, "step": 89925 }, { "epoch": 13.394399761691988, "grad_norm": 1.806670069694519, "learning_rate": 1.4854505813619857e-05, "loss": 0.7055, "num_input_tokens_seen": 52210904, "step": 89930 }, { "epoch": 13.395144474232946, "grad_norm": 0.7252746820449829, "learning_rate": 1.4851536080430817e-05, "loss": 0.4489, "num_input_tokens_seen": 52213688, "step": 89935 }, { "epoch": 13.395889186773905, "grad_norm": 1.087654709815979, "learning_rate": 1.4848566518689757e-05, "loss": 0.5426, "num_input_tokens_seen": 52216920, "step": 89940 }, { "epoch": 13.396633899314864, "grad_norm": 4.038174629211426, "learning_rate": 1.4845597128446853e-05, "loss": 0.6591, "num_input_tokens_seen": 52219736, "step": 89945 }, { "epoch": 13.397378611855824, "grad_norm": 1.1220319271087646, "learning_rate": 1.4842627909752266e-05, "loss": 0.5157, "num_input_tokens_seen": 52222776, "step": 89950 }, { "epoch": 13.398123324396783, "grad_norm": 2.0765881538391113, "learning_rate": 1.4839658862656158e-05, "loss": 0.5599, "num_input_tokens_seen": 52225624, "step": 89955 }, { "epoch": 13.398868036937742, "grad_norm": 1.4017168283462524, "learning_rate": 1.4836689987208677e-05, "loss": 0.5787, "num_input_tokens_seen": 52228440, "step": 89960 }, { "epoch": 13.3996127494787, "grad_norm": 1.4802345037460327, "learning_rate": 1.483372128346e-05, "loss": 0.6297, "num_input_tokens_seen": 52231256, "step": 89965 }, { "epoch": 13.400357462019661, "grad_norm": 1.6519449949264526, "learning_rate": 1.4830752751460264e-05, "loss": 0.5315, "num_input_tokens_seen": 52234072, "step": 89970 }, { "epoch": 13.40110217456062, "grad_norm": 1.1151360273361206, "learning_rate": 1.482778439125963e-05, "loss": 0.5863, "num_input_tokens_seen": 52237208, "step": 89975 }, { "epoch": 13.401846887101579, "grad_norm": 1.9235700368881226, "learning_rate": 1.482481620290823e-05, "loss": 0.4365, "num_input_tokens_seen": 52239960, "step": 89980 }, { "epoch": 13.402591599642538, "grad_norm": 1.450575351715088, "learning_rate": 1.4821848186456228e-05, "loss": 0.669, "num_input_tokens_seen": 52242744, "step": 89985 }, { "epoch": 13.403336312183498, "grad_norm": 0.934105634689331, "learning_rate": 1.4818880341953745e-05, "loss": 0.5203, "num_input_tokens_seen": 52245528, "step": 89990 }, { "epoch": 13.404081024724457, "grad_norm": 1.2373756170272827, "learning_rate": 1.4815912669450943e-05, "loss": 0.4901, "num_input_tokens_seen": 52248248, "step": 89995 }, { "epoch": 13.404825737265416, "grad_norm": 1.5329488515853882, "learning_rate": 1.4812945168997947e-05, "loss": 0.5624, "num_input_tokens_seen": 52251256, "step": 90000 }, { "epoch": 13.405570449806374, "grad_norm": 2.196681261062622, "learning_rate": 1.4809977840644881e-05, "loss": 0.657, "num_input_tokens_seen": 52254328, "step": 90005 }, { "epoch": 13.406315162347333, "grad_norm": 2.135287046432495, "learning_rate": 1.4807010684441891e-05, "loss": 0.6617, "num_input_tokens_seen": 52257528, "step": 90010 }, { "epoch": 13.407059874888294, "grad_norm": 1.0819776058197021, "learning_rate": 1.4804043700439083e-05, "loss": 0.564, "num_input_tokens_seen": 52260760, "step": 90015 }, { "epoch": 13.407804587429252, "grad_norm": 2.2338497638702393, "learning_rate": 1.4801076888686605e-05, "loss": 0.5843, "num_input_tokens_seen": 52263640, "step": 90020 }, { "epoch": 13.408549299970211, "grad_norm": 1.7088122367858887, "learning_rate": 1.4798110249234556e-05, "loss": 0.5412, "num_input_tokens_seen": 52266648, "step": 90025 }, { "epoch": 13.40929401251117, "grad_norm": 1.6677051782608032, "learning_rate": 1.4795143782133075e-05, "loss": 0.6981, "num_input_tokens_seen": 52269720, "step": 90030 }, { "epoch": 13.41003872505213, "grad_norm": 1.3608342409133911, "learning_rate": 1.4792177487432271e-05, "loss": 0.6158, "num_input_tokens_seen": 52272728, "step": 90035 }, { "epoch": 13.41078343759309, "grad_norm": 1.59977126121521, "learning_rate": 1.4789211365182249e-05, "loss": 0.4429, "num_input_tokens_seen": 52275480, "step": 90040 }, { "epoch": 13.411528150134048, "grad_norm": 1.1236737966537476, "learning_rate": 1.4786245415433125e-05, "loss": 0.6167, "num_input_tokens_seen": 52278520, "step": 90045 }, { "epoch": 13.412272862675007, "grad_norm": 2.173043727874756, "learning_rate": 1.4783279638234994e-05, "loss": 0.6572, "num_input_tokens_seen": 52281112, "step": 90050 }, { "epoch": 13.413017575215967, "grad_norm": 1.4948484897613525, "learning_rate": 1.4780314033637982e-05, "loss": 0.4495, "num_input_tokens_seen": 52284184, "step": 90055 }, { "epoch": 13.413762287756926, "grad_norm": 2.8417603969573975, "learning_rate": 1.4777348601692168e-05, "loss": 0.6881, "num_input_tokens_seen": 52287096, "step": 90060 }, { "epoch": 13.414507000297885, "grad_norm": 1.3263540267944336, "learning_rate": 1.4774383342447667e-05, "loss": 0.6121, "num_input_tokens_seen": 52290008, "step": 90065 }, { "epoch": 13.415251712838844, "grad_norm": 2.649413824081421, "learning_rate": 1.477141825595456e-05, "loss": 0.6046, "num_input_tokens_seen": 52292792, "step": 90070 }, { "epoch": 13.415996425379804, "grad_norm": 1.1189764738082886, "learning_rate": 1.4768453342262955e-05, "loss": 0.7178, "num_input_tokens_seen": 52295768, "step": 90075 }, { "epoch": 13.416741137920763, "grad_norm": 1.5118502378463745, "learning_rate": 1.4765488601422934e-05, "loss": 0.4559, "num_input_tokens_seen": 52298456, "step": 90080 }, { "epoch": 13.417485850461722, "grad_norm": 1.1395642757415771, "learning_rate": 1.4762524033484565e-05, "loss": 0.5031, "num_input_tokens_seen": 52301592, "step": 90085 }, { "epoch": 13.41823056300268, "grad_norm": 2.592623472213745, "learning_rate": 1.475955963849796e-05, "loss": 0.6198, "num_input_tokens_seen": 52304344, "step": 90090 }, { "epoch": 13.418975275543641, "grad_norm": 1.2467970848083496, "learning_rate": 1.4756595416513175e-05, "loss": 0.587, "num_input_tokens_seen": 52307352, "step": 90095 }, { "epoch": 13.4197199880846, "grad_norm": 1.4719761610031128, "learning_rate": 1.4753631367580312e-05, "loss": 0.7512, "num_input_tokens_seen": 52310104, "step": 90100 }, { "epoch": 13.420464700625558, "grad_norm": 1.3527015447616577, "learning_rate": 1.4750667491749428e-05, "loss": 0.5011, "num_input_tokens_seen": 52313240, "step": 90105 }, { "epoch": 13.421209413166517, "grad_norm": 1.7034485340118408, "learning_rate": 1.4747703789070606e-05, "loss": 0.7527, "num_input_tokens_seen": 52316088, "step": 90110 }, { "epoch": 13.421954125707478, "grad_norm": 2.2151010036468506, "learning_rate": 1.4744740259593907e-05, "loss": 0.7968, "num_input_tokens_seen": 52318904, "step": 90115 }, { "epoch": 13.422698838248436, "grad_norm": 2.165513753890991, "learning_rate": 1.4741776903369386e-05, "loss": 0.6664, "num_input_tokens_seen": 52321912, "step": 90120 }, { "epoch": 13.423443550789395, "grad_norm": 1.0524818897247314, "learning_rate": 1.4738813720447132e-05, "loss": 0.5786, "num_input_tokens_seen": 52324408, "step": 90125 }, { "epoch": 13.424188263330354, "grad_norm": 1.3456822633743286, "learning_rate": 1.473585071087718e-05, "loss": 0.4337, "num_input_tokens_seen": 52327512, "step": 90130 }, { "epoch": 13.424932975871315, "grad_norm": 1.3647617101669312, "learning_rate": 1.4732887874709605e-05, "loss": 0.3915, "num_input_tokens_seen": 52330392, "step": 90135 }, { "epoch": 13.425677688412273, "grad_norm": 2.9196417331695557, "learning_rate": 1.4729925211994455e-05, "loss": 0.6491, "num_input_tokens_seen": 52333240, "step": 90140 }, { "epoch": 13.426422400953232, "grad_norm": 1.6324728727340698, "learning_rate": 1.4726962722781783e-05, "loss": 0.6696, "num_input_tokens_seen": 52336472, "step": 90145 }, { "epoch": 13.42716711349419, "grad_norm": 2.2921576499938965, "learning_rate": 1.4724000407121624e-05, "loss": 0.448, "num_input_tokens_seen": 52339480, "step": 90150 }, { "epoch": 13.42791182603515, "grad_norm": 1.2822741270065308, "learning_rate": 1.4721038265064044e-05, "loss": 0.7459, "num_input_tokens_seen": 52342200, "step": 90155 }, { "epoch": 13.42865653857611, "grad_norm": 1.5327181816101074, "learning_rate": 1.4718076296659078e-05, "loss": 0.5775, "num_input_tokens_seen": 52345336, "step": 90160 }, { "epoch": 13.429401251117069, "grad_norm": 2.1984670162200928, "learning_rate": 1.471511450195675e-05, "loss": 0.6748, "num_input_tokens_seen": 52348152, "step": 90165 }, { "epoch": 13.430145963658028, "grad_norm": 1.1843013763427734, "learning_rate": 1.4712152881007118e-05, "loss": 0.5463, "num_input_tokens_seen": 52351096, "step": 90170 }, { "epoch": 13.430890676198986, "grad_norm": 2.7604103088378906, "learning_rate": 1.4709191433860206e-05, "loss": 0.4566, "num_input_tokens_seen": 52354104, "step": 90175 }, { "epoch": 13.431635388739947, "grad_norm": 1.0837297439575195, "learning_rate": 1.4706230160566048e-05, "loss": 0.6091, "num_input_tokens_seen": 52356984, "step": 90180 }, { "epoch": 13.432380101280906, "grad_norm": 2.199906587600708, "learning_rate": 1.4703269061174663e-05, "loss": 0.6875, "num_input_tokens_seen": 52360248, "step": 90185 }, { "epoch": 13.433124813821864, "grad_norm": 1.3374959230422974, "learning_rate": 1.4700308135736085e-05, "loss": 0.5269, "num_input_tokens_seen": 52363288, "step": 90190 }, { "epoch": 13.433869526362823, "grad_norm": 1.2979105710983276, "learning_rate": 1.4697347384300338e-05, "loss": 0.676, "num_input_tokens_seen": 52366168, "step": 90195 }, { "epoch": 13.434614238903784, "grad_norm": 1.7089067697525024, "learning_rate": 1.4694386806917426e-05, "loss": 0.597, "num_input_tokens_seen": 52369240, "step": 90200 }, { "epoch": 13.435358951444742, "grad_norm": 1.3033260107040405, "learning_rate": 1.4691426403637382e-05, "loss": 0.7986, "num_input_tokens_seen": 52371832, "step": 90205 }, { "epoch": 13.436103663985701, "grad_norm": 0.9546211361885071, "learning_rate": 1.4688466174510209e-05, "loss": 0.4031, "num_input_tokens_seen": 52374936, "step": 90210 }, { "epoch": 13.43684837652666, "grad_norm": 3.5700552463531494, "learning_rate": 1.4685506119585924e-05, "loss": 0.7313, "num_input_tokens_seen": 52377720, "step": 90215 }, { "epoch": 13.43759308906762, "grad_norm": 1.1678649187088013, "learning_rate": 1.4682546238914521e-05, "loss": 0.5453, "num_input_tokens_seen": 52380696, "step": 90220 }, { "epoch": 13.43833780160858, "grad_norm": 1.6471126079559326, "learning_rate": 1.4679586532546025e-05, "loss": 0.6098, "num_input_tokens_seen": 52383704, "step": 90225 }, { "epoch": 13.439082514149538, "grad_norm": 4.110910892486572, "learning_rate": 1.467662700053041e-05, "loss": 0.5525, "num_input_tokens_seen": 52386552, "step": 90230 }, { "epoch": 13.439827226690497, "grad_norm": 1.468470573425293, "learning_rate": 1.4673667642917705e-05, "loss": 0.5509, "num_input_tokens_seen": 52389528, "step": 90235 }, { "epoch": 13.440571939231457, "grad_norm": 2.6088058948516846, "learning_rate": 1.4670708459757885e-05, "loss": 0.6296, "num_input_tokens_seen": 52392472, "step": 90240 }, { "epoch": 13.441316651772416, "grad_norm": 1.7087414264678955, "learning_rate": 1.4667749451100943e-05, "loss": 0.5781, "num_input_tokens_seen": 52395128, "step": 90245 }, { "epoch": 13.442061364313375, "grad_norm": 1.8027976751327515, "learning_rate": 1.4664790616996881e-05, "loss": 0.8509, "num_input_tokens_seen": 52398040, "step": 90250 }, { "epoch": 13.442806076854334, "grad_norm": 3.5938339233398438, "learning_rate": 1.4661831957495665e-05, "loss": 0.5106, "num_input_tokens_seen": 52400696, "step": 90255 }, { "epoch": 13.443550789395294, "grad_norm": 2.161668539047241, "learning_rate": 1.4658873472647299e-05, "loss": 0.6347, "num_input_tokens_seen": 52403416, "step": 90260 }, { "epoch": 13.444295501936253, "grad_norm": 1.5186097621917725, "learning_rate": 1.4655915162501754e-05, "loss": 0.5726, "num_input_tokens_seen": 52406072, "step": 90265 }, { "epoch": 13.445040214477212, "grad_norm": 1.6345171928405762, "learning_rate": 1.4652957027109009e-05, "loss": 0.5687, "num_input_tokens_seen": 52408824, "step": 90270 }, { "epoch": 13.44578492701817, "grad_norm": 0.9615067839622498, "learning_rate": 1.4649999066519043e-05, "loss": 0.7182, "num_input_tokens_seen": 52411704, "step": 90275 }, { "epoch": 13.446529639559131, "grad_norm": 1.960160255432129, "learning_rate": 1.4647041280781821e-05, "loss": 0.5828, "num_input_tokens_seen": 52414520, "step": 90280 }, { "epoch": 13.44727435210009, "grad_norm": 2.394549608230591, "learning_rate": 1.4644083669947314e-05, "loss": 0.6296, "num_input_tokens_seen": 52417432, "step": 90285 }, { "epoch": 13.448019064641048, "grad_norm": 1.307312250137329, "learning_rate": 1.464112623406548e-05, "loss": 0.6668, "num_input_tokens_seen": 52420312, "step": 90290 }, { "epoch": 13.448763777182007, "grad_norm": 1.0770127773284912, "learning_rate": 1.46381689731863e-05, "loss": 0.6583, "num_input_tokens_seen": 52423096, "step": 90295 }, { "epoch": 13.449508489722968, "grad_norm": 1.290444254875183, "learning_rate": 1.463521188735972e-05, "loss": 0.3794, "num_input_tokens_seen": 52426232, "step": 90300 }, { "epoch": 13.450253202263927, "grad_norm": 2.251919984817505, "learning_rate": 1.4632254976635706e-05, "loss": 0.6347, "num_input_tokens_seen": 52428888, "step": 90305 }, { "epoch": 13.450997914804885, "grad_norm": 2.557791233062744, "learning_rate": 1.4629298241064196e-05, "loss": 0.6061, "num_input_tokens_seen": 52431608, "step": 90310 }, { "epoch": 13.451742627345844, "grad_norm": 2.8544135093688965, "learning_rate": 1.462634168069516e-05, "loss": 0.6493, "num_input_tokens_seen": 52434744, "step": 90315 }, { "epoch": 13.452487339886805, "grad_norm": 0.9578545689582825, "learning_rate": 1.4623385295578542e-05, "loss": 0.5166, "num_input_tokens_seen": 52437656, "step": 90320 }, { "epoch": 13.453232052427763, "grad_norm": 1.0974088907241821, "learning_rate": 1.462042908576427e-05, "loss": 0.503, "num_input_tokens_seen": 52440728, "step": 90325 }, { "epoch": 13.453976764968722, "grad_norm": 1.4059226512908936, "learning_rate": 1.461747305130231e-05, "loss": 0.7156, "num_input_tokens_seen": 52443640, "step": 90330 }, { "epoch": 13.45472147750968, "grad_norm": 1.7083297967910767, "learning_rate": 1.4614517192242588e-05, "loss": 0.6613, "num_input_tokens_seen": 52446904, "step": 90335 }, { "epoch": 13.45546619005064, "grad_norm": 2.633126735687256, "learning_rate": 1.4611561508635047e-05, "loss": 0.6642, "num_input_tokens_seen": 52449720, "step": 90340 }, { "epoch": 13.4562109025916, "grad_norm": 1.6045312881469727, "learning_rate": 1.4608606000529601e-05, "loss": 0.7214, "num_input_tokens_seen": 52452344, "step": 90345 }, { "epoch": 13.456955615132559, "grad_norm": 2.035792589187622, "learning_rate": 1.4605650667976211e-05, "loss": 0.5697, "num_input_tokens_seen": 52455384, "step": 90350 }, { "epoch": 13.457700327673518, "grad_norm": 3.124199628829956, "learning_rate": 1.4602695511024785e-05, "loss": 0.6253, "num_input_tokens_seen": 52458168, "step": 90355 }, { "epoch": 13.458445040214476, "grad_norm": 1.1312693357467651, "learning_rate": 1.4599740529725242e-05, "loss": 0.3824, "num_input_tokens_seen": 52461048, "step": 90360 }, { "epoch": 13.459189752755437, "grad_norm": 1.3746730089187622, "learning_rate": 1.4596785724127526e-05, "loss": 0.6139, "num_input_tokens_seen": 52464056, "step": 90365 }, { "epoch": 13.459934465296396, "grad_norm": 2.2590866088867188, "learning_rate": 1.4593831094281529e-05, "loss": 0.769, "num_input_tokens_seen": 52467256, "step": 90370 }, { "epoch": 13.460679177837354, "grad_norm": 1.721922755241394, "learning_rate": 1.4590876640237189e-05, "loss": 0.6503, "num_input_tokens_seen": 52470232, "step": 90375 }, { "epoch": 13.461423890378313, "grad_norm": 0.8473385572433472, "learning_rate": 1.4587922362044399e-05, "loss": 0.4734, "num_input_tokens_seen": 52473496, "step": 90380 }, { "epoch": 13.462168602919274, "grad_norm": 1.2663624286651611, "learning_rate": 1.4584968259753088e-05, "loss": 0.5133, "num_input_tokens_seen": 52476408, "step": 90385 }, { "epoch": 13.462913315460233, "grad_norm": 1.133569598197937, "learning_rate": 1.4582014333413153e-05, "loss": 0.4251, "num_input_tokens_seen": 52479480, "step": 90390 }, { "epoch": 13.463658028001191, "grad_norm": 1.7549561262130737, "learning_rate": 1.4579060583074497e-05, "loss": 0.6991, "num_input_tokens_seen": 52482296, "step": 90395 }, { "epoch": 13.46440274054215, "grad_norm": 1.5871399641036987, "learning_rate": 1.4576107008787022e-05, "loss": 0.5079, "num_input_tokens_seen": 52485400, "step": 90400 }, { "epoch": 13.46514745308311, "grad_norm": 1.7893543243408203, "learning_rate": 1.4573153610600615e-05, "loss": 0.5135, "num_input_tokens_seen": 52488088, "step": 90405 }, { "epoch": 13.46589216562407, "grad_norm": 1.540581464767456, "learning_rate": 1.4570200388565189e-05, "loss": 0.5099, "num_input_tokens_seen": 52491128, "step": 90410 }, { "epoch": 13.466636878165028, "grad_norm": 1.299635887145996, "learning_rate": 1.4567247342730617e-05, "loss": 0.5121, "num_input_tokens_seen": 52493976, "step": 90415 }, { "epoch": 13.467381590705987, "grad_norm": 1.6534219980239868, "learning_rate": 1.4564294473146808e-05, "loss": 0.5102, "num_input_tokens_seen": 52497112, "step": 90420 }, { "epoch": 13.468126303246947, "grad_norm": 1.803000569343567, "learning_rate": 1.456134177986363e-05, "loss": 0.7516, "num_input_tokens_seen": 52499992, "step": 90425 }, { "epoch": 13.468871015787906, "grad_norm": 0.8327316045761108, "learning_rate": 1.455838926293098e-05, "loss": 0.6143, "num_input_tokens_seen": 52502872, "step": 90430 }, { "epoch": 13.469615728328865, "grad_norm": 1.5959913730621338, "learning_rate": 1.4555436922398732e-05, "loss": 0.5682, "num_input_tokens_seen": 52506104, "step": 90435 }, { "epoch": 13.470360440869824, "grad_norm": 1.3795888423919678, "learning_rate": 1.4552484758316754e-05, "loss": 0.5449, "num_input_tokens_seen": 52509112, "step": 90440 }, { "epoch": 13.471105153410784, "grad_norm": 1.3519130945205688, "learning_rate": 1.4549532770734936e-05, "loss": 0.5455, "num_input_tokens_seen": 52512024, "step": 90445 }, { "epoch": 13.471849865951743, "grad_norm": 1.0796030759811401, "learning_rate": 1.4546580959703138e-05, "loss": 0.5884, "num_input_tokens_seen": 52515256, "step": 90450 }, { "epoch": 13.472594578492702, "grad_norm": 1.0217119455337524, "learning_rate": 1.4543629325271225e-05, "loss": 0.7615, "num_input_tokens_seen": 52518168, "step": 90455 }, { "epoch": 13.47333929103366, "grad_norm": 3.379511833190918, "learning_rate": 1.4540677867489072e-05, "loss": 0.7099, "num_input_tokens_seen": 52521048, "step": 90460 }, { "epoch": 13.474084003574621, "grad_norm": 1.146718144416809, "learning_rate": 1.4537726586406538e-05, "loss": 0.4008, "num_input_tokens_seen": 52523960, "step": 90465 }, { "epoch": 13.47482871611558, "grad_norm": 2.02835750579834, "learning_rate": 1.453477548207347e-05, "loss": 0.6648, "num_input_tokens_seen": 52527192, "step": 90470 }, { "epoch": 13.475573428656539, "grad_norm": 1.2330043315887451, "learning_rate": 1.4531824554539747e-05, "loss": 0.5212, "num_input_tokens_seen": 52530104, "step": 90475 }, { "epoch": 13.476318141197497, "grad_norm": 0.9763749837875366, "learning_rate": 1.4528873803855206e-05, "loss": 0.52, "num_input_tokens_seen": 52533112, "step": 90480 }, { "epoch": 13.477062853738458, "grad_norm": 1.3076764345169067, "learning_rate": 1.4525923230069689e-05, "loss": 0.5766, "num_input_tokens_seen": 52535768, "step": 90485 }, { "epoch": 13.477807566279417, "grad_norm": 3.207247495651245, "learning_rate": 1.4522972833233068e-05, "loss": 0.65, "num_input_tokens_seen": 52538584, "step": 90490 }, { "epoch": 13.478552278820375, "grad_norm": 2.059410333633423, "learning_rate": 1.4520022613395157e-05, "loss": 0.6658, "num_input_tokens_seen": 52541528, "step": 90495 }, { "epoch": 13.479296991361334, "grad_norm": 3.310451030731201, "learning_rate": 1.4517072570605824e-05, "loss": 0.6732, "num_input_tokens_seen": 52544504, "step": 90500 }, { "epoch": 13.480041703902295, "grad_norm": 1.648529291152954, "learning_rate": 1.4514122704914887e-05, "loss": 0.5497, "num_input_tokens_seen": 52547288, "step": 90505 }, { "epoch": 13.480786416443253, "grad_norm": 1.8887532949447632, "learning_rate": 1.4511173016372199e-05, "loss": 0.5515, "num_input_tokens_seen": 52550200, "step": 90510 }, { "epoch": 13.481531128984212, "grad_norm": 1.0562838315963745, "learning_rate": 1.4508223505027581e-05, "loss": 0.5402, "num_input_tokens_seen": 52553368, "step": 90515 }, { "epoch": 13.482275841525171, "grad_norm": 1.4294731616973877, "learning_rate": 1.4505274170930866e-05, "loss": 0.5552, "num_input_tokens_seen": 52556120, "step": 90520 }, { "epoch": 13.48302055406613, "grad_norm": 1.341865062713623, "learning_rate": 1.450232501413188e-05, "loss": 0.7279, "num_input_tokens_seen": 52559256, "step": 90525 }, { "epoch": 13.48376526660709, "grad_norm": 2.0195624828338623, "learning_rate": 1.4499376034680429e-05, "loss": 0.603, "num_input_tokens_seen": 52562328, "step": 90530 }, { "epoch": 13.484509979148049, "grad_norm": 0.716918408870697, "learning_rate": 1.4496427232626358e-05, "loss": 0.4038, "num_input_tokens_seen": 52565080, "step": 90535 }, { "epoch": 13.485254691689008, "grad_norm": 1.153128743171692, "learning_rate": 1.4493478608019461e-05, "loss": 0.6611, "num_input_tokens_seen": 52567768, "step": 90540 }, { "epoch": 13.485999404229966, "grad_norm": 1.4199262857437134, "learning_rate": 1.449053016090958e-05, "loss": 0.5841, "num_input_tokens_seen": 52570392, "step": 90545 }, { "epoch": 13.486744116770927, "grad_norm": 2.2623093128204346, "learning_rate": 1.4487581891346497e-05, "loss": 0.5137, "num_input_tokens_seen": 52573176, "step": 90550 }, { "epoch": 13.487488829311886, "grad_norm": 1.7096738815307617, "learning_rate": 1.4484633799380046e-05, "loss": 0.6983, "num_input_tokens_seen": 52575800, "step": 90555 }, { "epoch": 13.488233541852845, "grad_norm": 2.5872108936309814, "learning_rate": 1.4481685885060017e-05, "loss": 0.6925, "num_input_tokens_seen": 52578872, "step": 90560 }, { "epoch": 13.488978254393803, "grad_norm": 1.8860727548599243, "learning_rate": 1.4478738148436205e-05, "loss": 0.6659, "num_input_tokens_seen": 52581656, "step": 90565 }, { "epoch": 13.489722966934764, "grad_norm": 1.2666122913360596, "learning_rate": 1.447579058955843e-05, "loss": 0.5623, "num_input_tokens_seen": 52584632, "step": 90570 }, { "epoch": 13.490467679475723, "grad_norm": 2.968902587890625, "learning_rate": 1.4472843208476466e-05, "loss": 0.5934, "num_input_tokens_seen": 52587512, "step": 90575 }, { "epoch": 13.491212392016681, "grad_norm": 1.759968876838684, "learning_rate": 1.446989600524013e-05, "loss": 0.478, "num_input_tokens_seen": 52590488, "step": 90580 }, { "epoch": 13.49195710455764, "grad_norm": 2.0875957012176514, "learning_rate": 1.4466948979899194e-05, "loss": 0.6943, "num_input_tokens_seen": 52593368, "step": 90585 }, { "epoch": 13.4927018170986, "grad_norm": 1.3718900680541992, "learning_rate": 1.446400213250345e-05, "loss": 0.6062, "num_input_tokens_seen": 52596152, "step": 90590 }, { "epoch": 13.49344652963956, "grad_norm": 1.2950763702392578, "learning_rate": 1.4461055463102673e-05, "loss": 0.7138, "num_input_tokens_seen": 52599064, "step": 90595 }, { "epoch": 13.494191242180518, "grad_norm": 2.56553316116333, "learning_rate": 1.4458108971746665e-05, "loss": 0.6785, "num_input_tokens_seen": 52602104, "step": 90600 }, { "epoch": 13.494935954721477, "grad_norm": 1.2524183988571167, "learning_rate": 1.4455162658485188e-05, "loss": 0.7209, "num_input_tokens_seen": 52605208, "step": 90605 }, { "epoch": 13.495680667262437, "grad_norm": 1.7336974143981934, "learning_rate": 1.4452216523368011e-05, "loss": 0.645, "num_input_tokens_seen": 52608056, "step": 90610 }, { "epoch": 13.496425379803396, "grad_norm": 1.3365520238876343, "learning_rate": 1.4449270566444929e-05, "loss": 0.704, "num_input_tokens_seen": 52611032, "step": 90615 }, { "epoch": 13.497170092344355, "grad_norm": 1.325658917427063, "learning_rate": 1.4446324787765686e-05, "loss": 0.5628, "num_input_tokens_seen": 52613944, "step": 90620 }, { "epoch": 13.497914804885314, "grad_norm": 1.9456974267959595, "learning_rate": 1.4443379187380068e-05, "loss": 0.5909, "num_input_tokens_seen": 52617016, "step": 90625 }, { "epoch": 13.498659517426274, "grad_norm": 2.229018211364746, "learning_rate": 1.4440433765337819e-05, "loss": 0.664, "num_input_tokens_seen": 52619736, "step": 90630 }, { "epoch": 13.499404229967233, "grad_norm": 1.8723446130752563, "learning_rate": 1.443748852168872e-05, "loss": 0.6643, "num_input_tokens_seen": 52622648, "step": 90635 }, { "epoch": 13.500148942508192, "grad_norm": 1.5650445222854614, "learning_rate": 1.443454345648252e-05, "loss": 0.6351, "num_input_tokens_seen": 52625688, "step": 90640 }, { "epoch": 13.50089365504915, "grad_norm": 2.1383473873138428, "learning_rate": 1.4431598569768955e-05, "loss": 0.6888, "num_input_tokens_seen": 52628376, "step": 90645 }, { "epoch": 13.501638367590111, "grad_norm": 3.8444406986236572, "learning_rate": 1.4428653861597802e-05, "loss": 0.6795, "num_input_tokens_seen": 52631480, "step": 90650 }, { "epoch": 13.50238308013107, "grad_norm": 3.4626781940460205, "learning_rate": 1.44257093320188e-05, "loss": 0.455, "num_input_tokens_seen": 52634328, "step": 90655 }, { "epoch": 13.503127792672029, "grad_norm": 1.8991954326629639, "learning_rate": 1.4422764981081691e-05, "loss": 0.6089, "num_input_tokens_seen": 52637144, "step": 90660 }, { "epoch": 13.503872505212987, "grad_norm": 1.675477147102356, "learning_rate": 1.4419820808836207e-05, "loss": 0.4195, "num_input_tokens_seen": 52640184, "step": 90665 }, { "epoch": 13.504617217753946, "grad_norm": 1.474558711051941, "learning_rate": 1.441687681533211e-05, "loss": 0.625, "num_input_tokens_seen": 52643288, "step": 90670 }, { "epoch": 13.505361930294907, "grad_norm": 2.3184757232666016, "learning_rate": 1.4413933000619106e-05, "loss": 0.5481, "num_input_tokens_seen": 52646232, "step": 90675 }, { "epoch": 13.506106642835865, "grad_norm": 1.8600026369094849, "learning_rate": 1.441098936474696e-05, "loss": 0.6701, "num_input_tokens_seen": 52649336, "step": 90680 }, { "epoch": 13.506851355376824, "grad_norm": 1.3939971923828125, "learning_rate": 1.4408045907765385e-05, "loss": 0.5685, "num_input_tokens_seen": 52651928, "step": 90685 }, { "epoch": 13.507596067917785, "grad_norm": 1.6028817892074585, "learning_rate": 1.44051026297241e-05, "loss": 0.5319, "num_input_tokens_seen": 52654584, "step": 90690 }, { "epoch": 13.508340780458743, "grad_norm": 1.1577062606811523, "learning_rate": 1.4402159530672849e-05, "loss": 0.5945, "num_input_tokens_seen": 52657336, "step": 90695 }, { "epoch": 13.509085492999702, "grad_norm": 1.4365758895874023, "learning_rate": 1.4399216610661329e-05, "loss": 0.6665, "num_input_tokens_seen": 52660088, "step": 90700 }, { "epoch": 13.509830205540661, "grad_norm": 3.1935667991638184, "learning_rate": 1.4396273869739279e-05, "loss": 0.551, "num_input_tokens_seen": 52663288, "step": 90705 }, { "epoch": 13.51057491808162, "grad_norm": 1.1201038360595703, "learning_rate": 1.4393331307956399e-05, "loss": 0.4361, "num_input_tokens_seen": 52666072, "step": 90710 }, { "epoch": 13.51131963062258, "grad_norm": 2.4060416221618652, "learning_rate": 1.4390388925362413e-05, "loss": 0.5694, "num_input_tokens_seen": 52668824, "step": 90715 }, { "epoch": 13.512064343163539, "grad_norm": 2.657097101211548, "learning_rate": 1.4387446722007025e-05, "loss": 0.6172, "num_input_tokens_seen": 52671416, "step": 90720 }, { "epoch": 13.512809055704498, "grad_norm": 1.5732570886611938, "learning_rate": 1.4384504697939938e-05, "loss": 0.6321, "num_input_tokens_seen": 52674200, "step": 90725 }, { "epoch": 13.513553768245457, "grad_norm": 1.6694128513336182, "learning_rate": 1.4381562853210856e-05, "loss": 0.5491, "num_input_tokens_seen": 52677016, "step": 90730 }, { "epoch": 13.514298480786417, "grad_norm": 2.0333874225616455, "learning_rate": 1.4378621187869467e-05, "loss": 0.7346, "num_input_tokens_seen": 52679768, "step": 90735 }, { "epoch": 13.515043193327376, "grad_norm": 2.4901247024536133, "learning_rate": 1.4375679701965488e-05, "loss": 0.7416, "num_input_tokens_seen": 52682872, "step": 90740 }, { "epoch": 13.515787905868335, "grad_norm": 2.4210638999938965, "learning_rate": 1.437273839554859e-05, "loss": 0.5963, "num_input_tokens_seen": 52686072, "step": 90745 }, { "epoch": 13.516532618409293, "grad_norm": 1.1610594987869263, "learning_rate": 1.436979726866849e-05, "loss": 0.494, "num_input_tokens_seen": 52688664, "step": 90750 }, { "epoch": 13.517277330950254, "grad_norm": 2.0648000240325928, "learning_rate": 1.436685632137485e-05, "loss": 0.4584, "num_input_tokens_seen": 52691704, "step": 90755 }, { "epoch": 13.518022043491213, "grad_norm": 2.786449432373047, "learning_rate": 1.4363915553717371e-05, "loss": 0.612, "num_input_tokens_seen": 52694648, "step": 90760 }, { "epoch": 13.518766756032171, "grad_norm": 1.8079408407211304, "learning_rate": 1.4360974965745732e-05, "loss": 0.4722, "num_input_tokens_seen": 52697880, "step": 90765 }, { "epoch": 13.51951146857313, "grad_norm": 1.8665703535079956, "learning_rate": 1.4358034557509598e-05, "loss": 0.5671, "num_input_tokens_seen": 52700632, "step": 90770 }, { "epoch": 13.52025618111409, "grad_norm": 3.2230498790740967, "learning_rate": 1.4355094329058666e-05, "loss": 0.6328, "num_input_tokens_seen": 52703416, "step": 90775 }, { "epoch": 13.52100089365505, "grad_norm": 1.5359609127044678, "learning_rate": 1.4352154280442592e-05, "loss": 0.5638, "num_input_tokens_seen": 52706616, "step": 90780 }, { "epoch": 13.521745606196008, "grad_norm": 2.5903663635253906, "learning_rate": 1.4349214411711043e-05, "loss": 0.6566, "num_input_tokens_seen": 52709400, "step": 90785 }, { "epoch": 13.522490318736967, "grad_norm": 1.8040403127670288, "learning_rate": 1.43462747229137e-05, "loss": 0.425, "num_input_tokens_seen": 52712120, "step": 90790 }, { "epoch": 13.523235031277927, "grad_norm": 1.2417347431182861, "learning_rate": 1.4343335214100218e-05, "loss": 0.5051, "num_input_tokens_seen": 52715000, "step": 90795 }, { "epoch": 13.523979743818886, "grad_norm": 0.979548990726471, "learning_rate": 1.4340395885320257e-05, "loss": 0.5392, "num_input_tokens_seen": 52717880, "step": 90800 }, { "epoch": 13.524724456359845, "grad_norm": 1.8197510242462158, "learning_rate": 1.4337456736623462e-05, "loss": 0.6322, "num_input_tokens_seen": 52720632, "step": 90805 }, { "epoch": 13.525469168900804, "grad_norm": 1.306819200515747, "learning_rate": 1.433451776805951e-05, "loss": 0.7335, "num_input_tokens_seen": 52723544, "step": 90810 }, { "epoch": 13.526213881441764, "grad_norm": 2.55525279045105, "learning_rate": 1.4331578979678029e-05, "loss": 0.5336, "num_input_tokens_seen": 52726424, "step": 90815 }, { "epoch": 13.526958593982723, "grad_norm": 1.568364143371582, "learning_rate": 1.432864037152869e-05, "loss": 0.5537, "num_input_tokens_seen": 52729048, "step": 90820 }, { "epoch": 13.527703306523682, "grad_norm": 0.9969625473022461, "learning_rate": 1.4325701943661116e-05, "loss": 0.5024, "num_input_tokens_seen": 52731896, "step": 90825 }, { "epoch": 13.52844801906464, "grad_norm": 2.2693982124328613, "learning_rate": 1.432276369612497e-05, "loss": 0.5263, "num_input_tokens_seen": 52734776, "step": 90830 }, { "epoch": 13.529192731605601, "grad_norm": 1.2350481748580933, "learning_rate": 1.4319825628969863e-05, "loss": 0.4719, "num_input_tokens_seen": 52737592, "step": 90835 }, { "epoch": 13.52993744414656, "grad_norm": 1.2657450437545776, "learning_rate": 1.4316887742245464e-05, "loss": 0.6639, "num_input_tokens_seen": 52740632, "step": 90840 }, { "epoch": 13.530682156687519, "grad_norm": 2.879807949066162, "learning_rate": 1.4313950036001384e-05, "loss": 0.5884, "num_input_tokens_seen": 52743864, "step": 90845 }, { "epoch": 13.531426869228477, "grad_norm": 1.5464704036712646, "learning_rate": 1.431101251028726e-05, "loss": 0.5335, "num_input_tokens_seen": 52747288, "step": 90850 }, { "epoch": 13.532171581769436, "grad_norm": 1.9233160018920898, "learning_rate": 1.4308075165152718e-05, "loss": 0.5452, "num_input_tokens_seen": 52750040, "step": 90855 }, { "epoch": 13.532916294310397, "grad_norm": 1.958191156387329, "learning_rate": 1.4305138000647367e-05, "loss": 0.6722, "num_input_tokens_seen": 52752856, "step": 90860 }, { "epoch": 13.533661006851355, "grad_norm": 1.7328323125839233, "learning_rate": 1.4302201016820849e-05, "loss": 0.6008, "num_input_tokens_seen": 52756024, "step": 90865 }, { "epoch": 13.534405719392314, "grad_norm": 1.7185701131820679, "learning_rate": 1.4299264213722762e-05, "loss": 0.6484, "num_input_tokens_seen": 52758744, "step": 90870 }, { "epoch": 13.535150431933273, "grad_norm": 1.1404955387115479, "learning_rate": 1.4296327591402742e-05, "loss": 0.6947, "num_input_tokens_seen": 52761432, "step": 90875 }, { "epoch": 13.535895144474233, "grad_norm": 0.9843661785125732, "learning_rate": 1.4293391149910384e-05, "loss": 0.4498, "num_input_tokens_seen": 52764376, "step": 90880 }, { "epoch": 13.536639857015192, "grad_norm": 1.2385735511779785, "learning_rate": 1.4290454889295296e-05, "loss": 0.5782, "num_input_tokens_seen": 52767032, "step": 90885 }, { "epoch": 13.537384569556151, "grad_norm": 1.492343544960022, "learning_rate": 1.4287518809607097e-05, "loss": 0.5499, "num_input_tokens_seen": 52769976, "step": 90890 }, { "epoch": 13.53812928209711, "grad_norm": 1.6076345443725586, "learning_rate": 1.428458291089537e-05, "loss": 0.6109, "num_input_tokens_seen": 52773112, "step": 90895 }, { "epoch": 13.53887399463807, "grad_norm": 1.6878814697265625, "learning_rate": 1.4281647193209732e-05, "loss": 0.495, "num_input_tokens_seen": 52775960, "step": 90900 }, { "epoch": 13.539618707179029, "grad_norm": 2.983222723007202, "learning_rate": 1.4278711656599764e-05, "loss": 0.5854, "num_input_tokens_seen": 52778776, "step": 90905 }, { "epoch": 13.540363419719988, "grad_norm": 3.1161999702453613, "learning_rate": 1.4275776301115074e-05, "loss": 0.7133, "num_input_tokens_seen": 52781464, "step": 90910 }, { "epoch": 13.541108132260947, "grad_norm": 1.9103648662567139, "learning_rate": 1.4272841126805242e-05, "loss": 0.7506, "num_input_tokens_seen": 52784120, "step": 90915 }, { "epoch": 13.541852844801907, "grad_norm": 1.1959487199783325, "learning_rate": 1.4269906133719863e-05, "loss": 0.6062, "num_input_tokens_seen": 52786744, "step": 90920 }, { "epoch": 13.542597557342866, "grad_norm": 1.614070177078247, "learning_rate": 1.4266971321908507e-05, "loss": 0.5858, "num_input_tokens_seen": 52789688, "step": 90925 }, { "epoch": 13.543342269883825, "grad_norm": 1.4959640502929688, "learning_rate": 1.4264036691420756e-05, "loss": 0.5275, "num_input_tokens_seen": 52792472, "step": 90930 }, { "epoch": 13.544086982424783, "grad_norm": 1.7697639465332031, "learning_rate": 1.42611022423062e-05, "loss": 0.656, "num_input_tokens_seen": 52795064, "step": 90935 }, { "epoch": 13.544831694965744, "grad_norm": 1.6807403564453125, "learning_rate": 1.42581679746144e-05, "loss": 0.6495, "num_input_tokens_seen": 52797912, "step": 90940 }, { "epoch": 13.545576407506703, "grad_norm": 0.854097843170166, "learning_rate": 1.4255233888394947e-05, "loss": 0.5577, "num_input_tokens_seen": 52800792, "step": 90945 }, { "epoch": 13.546321120047661, "grad_norm": 1.307357907295227, "learning_rate": 1.4252299983697381e-05, "loss": 0.5381, "num_input_tokens_seen": 52803832, "step": 90950 }, { "epoch": 13.54706583258862, "grad_norm": 1.852365255355835, "learning_rate": 1.4249366260571299e-05, "loss": 0.6646, "num_input_tokens_seen": 52806744, "step": 90955 }, { "epoch": 13.54781054512958, "grad_norm": 3.936814785003662, "learning_rate": 1.4246432719066244e-05, "loss": 0.5923, "num_input_tokens_seen": 52809592, "step": 90960 }, { "epoch": 13.54855525767054, "grad_norm": 0.9083534479141235, "learning_rate": 1.4243499359231771e-05, "loss": 0.5534, "num_input_tokens_seen": 52812280, "step": 90965 }, { "epoch": 13.549299970211498, "grad_norm": 2.193822145462036, "learning_rate": 1.4240566181117451e-05, "loss": 0.6526, "num_input_tokens_seen": 52815128, "step": 90970 }, { "epoch": 13.550044682752457, "grad_norm": 2.315793037414551, "learning_rate": 1.4237633184772822e-05, "loss": 0.5759, "num_input_tokens_seen": 52817848, "step": 90975 }, { "epoch": 13.550789395293418, "grad_norm": 2.3436763286590576, "learning_rate": 1.423470037024745e-05, "loss": 0.6478, "num_input_tokens_seen": 52820984, "step": 90980 }, { "epoch": 13.551534107834376, "grad_norm": 1.4916000366210938, "learning_rate": 1.423176773759088e-05, "loss": 0.6078, "num_input_tokens_seen": 52823832, "step": 90985 }, { "epoch": 13.552278820375335, "grad_norm": 1.2360670566558838, "learning_rate": 1.4228835286852643e-05, "loss": 0.6018, "num_input_tokens_seen": 52826936, "step": 90990 }, { "epoch": 13.553023532916294, "grad_norm": 1.4471075534820557, "learning_rate": 1.4225903018082278e-05, "loss": 0.762, "num_input_tokens_seen": 52830104, "step": 90995 }, { "epoch": 13.553768245457253, "grad_norm": 1.066816806793213, "learning_rate": 1.4222970931329343e-05, "loss": 0.6412, "num_input_tokens_seen": 52832728, "step": 91000 }, { "epoch": 13.554512957998213, "grad_norm": 1.5696930885314941, "learning_rate": 1.4220039026643361e-05, "loss": 0.7077, "num_input_tokens_seen": 52835640, "step": 91005 }, { "epoch": 13.555257670539172, "grad_norm": 2.721487522125244, "learning_rate": 1.4217107304073851e-05, "loss": 0.636, "num_input_tokens_seen": 52838712, "step": 91010 }, { "epoch": 13.55600238308013, "grad_norm": 1.6803474426269531, "learning_rate": 1.4214175763670365e-05, "loss": 0.6663, "num_input_tokens_seen": 52841400, "step": 91015 }, { "epoch": 13.556747095621091, "grad_norm": 1.392469882965088, "learning_rate": 1.4211244405482408e-05, "loss": 0.5337, "num_input_tokens_seen": 52844056, "step": 91020 }, { "epoch": 13.55749180816205, "grad_norm": 1.7157018184661865, "learning_rate": 1.420831322955952e-05, "loss": 0.5311, "num_input_tokens_seen": 52846776, "step": 91025 }, { "epoch": 13.558236520703009, "grad_norm": 1.9597532749176025, "learning_rate": 1.4205382235951204e-05, "loss": 0.5414, "num_input_tokens_seen": 52849720, "step": 91030 }, { "epoch": 13.558981233243967, "grad_norm": 1.9871546030044556, "learning_rate": 1.4202451424706991e-05, "loss": 0.6474, "num_input_tokens_seen": 52852632, "step": 91035 }, { "epoch": 13.559725945784926, "grad_norm": 2.794759750366211, "learning_rate": 1.4199520795876387e-05, "loss": 0.7709, "num_input_tokens_seen": 52855512, "step": 91040 }, { "epoch": 13.560470658325887, "grad_norm": 2.0861964225769043, "learning_rate": 1.4196590349508896e-05, "loss": 0.5906, "num_input_tokens_seen": 52858616, "step": 91045 }, { "epoch": 13.561215370866845, "grad_norm": 1.295277714729309, "learning_rate": 1.4193660085654037e-05, "loss": 0.533, "num_input_tokens_seen": 52861528, "step": 91050 }, { "epoch": 13.561960083407804, "grad_norm": 1.5927633047103882, "learning_rate": 1.419073000436131e-05, "loss": 0.737, "num_input_tokens_seen": 52864248, "step": 91055 }, { "epoch": 13.562704795948763, "grad_norm": 1.7224773168563843, "learning_rate": 1.4187800105680213e-05, "loss": 0.6728, "num_input_tokens_seen": 52867064, "step": 91060 }, { "epoch": 13.563449508489724, "grad_norm": 1.372249722480774, "learning_rate": 1.4184870389660235e-05, "loss": 0.4903, "num_input_tokens_seen": 52869976, "step": 91065 }, { "epoch": 13.564194221030682, "grad_norm": 1.2226313352584839, "learning_rate": 1.4181940856350889e-05, "loss": 0.6769, "num_input_tokens_seen": 52872888, "step": 91070 }, { "epoch": 13.564938933571641, "grad_norm": 1.6012037992477417, "learning_rate": 1.4179011505801648e-05, "loss": 0.5646, "num_input_tokens_seen": 52875832, "step": 91075 }, { "epoch": 13.5656836461126, "grad_norm": 0.6614324450492859, "learning_rate": 1.4176082338062019e-05, "loss": 0.5826, "num_input_tokens_seen": 52878584, "step": 91080 }, { "epoch": 13.56642835865356, "grad_norm": 1.9540220499038696, "learning_rate": 1.4173153353181477e-05, "loss": 0.5822, "num_input_tokens_seen": 52881720, "step": 91085 }, { "epoch": 13.567173071194519, "grad_norm": 2.5253474712371826, "learning_rate": 1.4170224551209493e-05, "loss": 0.6976, "num_input_tokens_seen": 52884504, "step": 91090 }, { "epoch": 13.567917783735478, "grad_norm": 1.609115481376648, "learning_rate": 1.4167295932195573e-05, "loss": 0.5759, "num_input_tokens_seen": 52887512, "step": 91095 }, { "epoch": 13.568662496276437, "grad_norm": 1.1388612985610962, "learning_rate": 1.4164367496189169e-05, "loss": 0.4734, "num_input_tokens_seen": 52890360, "step": 91100 }, { "epoch": 13.569407208817397, "grad_norm": 3.415898084640503, "learning_rate": 1.4161439243239768e-05, "loss": 0.6807, "num_input_tokens_seen": 52893400, "step": 91105 }, { "epoch": 13.570151921358356, "grad_norm": 2.3089895248413086, "learning_rate": 1.4158511173396838e-05, "loss": 0.628, "num_input_tokens_seen": 52896344, "step": 91110 }, { "epoch": 13.570896633899315, "grad_norm": 1.2565852403640747, "learning_rate": 1.4155583286709833e-05, "loss": 0.8234, "num_input_tokens_seen": 52899352, "step": 91115 }, { "epoch": 13.571641346440273, "grad_norm": 1.1397134065628052, "learning_rate": 1.4152655583228235e-05, "loss": 0.4682, "num_input_tokens_seen": 52902264, "step": 91120 }, { "epoch": 13.572386058981234, "grad_norm": 1.6014262437820435, "learning_rate": 1.41497280630015e-05, "loss": 0.4601, "num_input_tokens_seen": 52905080, "step": 91125 }, { "epoch": 13.573130771522193, "grad_norm": 1.5022481679916382, "learning_rate": 1.414680072607908e-05, "loss": 0.6626, "num_input_tokens_seen": 52908120, "step": 91130 }, { "epoch": 13.573875484063151, "grad_norm": 0.978057861328125, "learning_rate": 1.414387357251042e-05, "loss": 0.5499, "num_input_tokens_seen": 52910904, "step": 91135 }, { "epoch": 13.57462019660411, "grad_norm": 0.9974403381347656, "learning_rate": 1.4140946602344993e-05, "loss": 0.5228, "num_input_tokens_seen": 52913624, "step": 91140 }, { "epoch": 13.57536490914507, "grad_norm": 2.9824178218841553, "learning_rate": 1.4138019815632226e-05, "loss": 0.769, "num_input_tokens_seen": 52916504, "step": 91145 }, { "epoch": 13.57610962168603, "grad_norm": 1.3401405811309814, "learning_rate": 1.4135093212421584e-05, "loss": 0.5776, "num_input_tokens_seen": 52919288, "step": 91150 }, { "epoch": 13.576854334226988, "grad_norm": 1.2758374214172363, "learning_rate": 1.4132166792762491e-05, "loss": 0.6272, "num_input_tokens_seen": 52922328, "step": 91155 }, { "epoch": 13.577599046767947, "grad_norm": 1.6047135591506958, "learning_rate": 1.4129240556704403e-05, "loss": 0.7228, "num_input_tokens_seen": 52925240, "step": 91160 }, { "epoch": 13.578343759308908, "grad_norm": 1.8089451789855957, "learning_rate": 1.4126314504296751e-05, "loss": 0.6235, "num_input_tokens_seen": 52928088, "step": 91165 }, { "epoch": 13.579088471849866, "grad_norm": 1.5431681871414185, "learning_rate": 1.412338863558895e-05, "loss": 0.5753, "num_input_tokens_seen": 52931320, "step": 91170 }, { "epoch": 13.579833184390825, "grad_norm": 1.307986855506897, "learning_rate": 1.4120462950630453e-05, "loss": 0.6641, "num_input_tokens_seen": 52934136, "step": 91175 }, { "epoch": 13.580577896931784, "grad_norm": 2.409803628921509, "learning_rate": 1.411753744947068e-05, "loss": 0.7086, "num_input_tokens_seen": 52936952, "step": 91180 }, { "epoch": 13.581322609472743, "grad_norm": 1.67826247215271, "learning_rate": 1.4114612132159049e-05, "loss": 0.6859, "num_input_tokens_seen": 52939672, "step": 91185 }, { "epoch": 13.582067322013703, "grad_norm": 1.0448524951934814, "learning_rate": 1.4111686998744975e-05, "loss": 0.4634, "num_input_tokens_seen": 52942392, "step": 91190 }, { "epoch": 13.582812034554662, "grad_norm": 1.0766448974609375, "learning_rate": 1.410876204927789e-05, "loss": 0.4778, "num_input_tokens_seen": 52945432, "step": 91195 }, { "epoch": 13.58355674709562, "grad_norm": 1.3014062643051147, "learning_rate": 1.4105837283807194e-05, "loss": 0.5016, "num_input_tokens_seen": 52948632, "step": 91200 }, { "epoch": 13.584301459636581, "grad_norm": 1.8664616346359253, "learning_rate": 1.410291270238231e-05, "loss": 0.6183, "num_input_tokens_seen": 52951416, "step": 91205 }, { "epoch": 13.58504617217754, "grad_norm": 1.4927499294281006, "learning_rate": 1.4099988305052644e-05, "loss": 0.4996, "num_input_tokens_seen": 52954360, "step": 91210 }, { "epoch": 13.585790884718499, "grad_norm": 1.4174678325653076, "learning_rate": 1.4097064091867587e-05, "loss": 0.5783, "num_input_tokens_seen": 52957144, "step": 91215 }, { "epoch": 13.586535597259457, "grad_norm": 2.979320526123047, "learning_rate": 1.4094140062876559e-05, "loss": 0.7828, "num_input_tokens_seen": 52960184, "step": 91220 }, { "epoch": 13.587280309800416, "grad_norm": 1.9222608804702759, "learning_rate": 1.4091216218128945e-05, "loss": 0.4836, "num_input_tokens_seen": 52963288, "step": 91225 }, { "epoch": 13.588025022341377, "grad_norm": 1.6028313636779785, "learning_rate": 1.4088292557674155e-05, "loss": 0.6171, "num_input_tokens_seen": 52966136, "step": 91230 }, { "epoch": 13.588769734882336, "grad_norm": 1.4009060859680176, "learning_rate": 1.4085369081561556e-05, "loss": 0.4442, "num_input_tokens_seen": 52969080, "step": 91235 }, { "epoch": 13.589514447423294, "grad_norm": 1.0860114097595215, "learning_rate": 1.408244578984057e-05, "loss": 0.6739, "num_input_tokens_seen": 52972280, "step": 91240 }, { "epoch": 13.590259159964253, "grad_norm": 0.782606303691864, "learning_rate": 1.4079522682560563e-05, "loss": 0.4729, "num_input_tokens_seen": 52975288, "step": 91245 }, { "epoch": 13.591003872505214, "grad_norm": 2.051560163497925, "learning_rate": 1.4076599759770919e-05, "loss": 0.6567, "num_input_tokens_seen": 52977912, "step": 91250 }, { "epoch": 13.591748585046172, "grad_norm": 1.3450990915298462, "learning_rate": 1.4073677021521026e-05, "loss": 0.5164, "num_input_tokens_seen": 52980856, "step": 91255 }, { "epoch": 13.592493297587131, "grad_norm": 1.5513107776641846, "learning_rate": 1.407075446786024e-05, "loss": 0.5953, "num_input_tokens_seen": 52983576, "step": 91260 }, { "epoch": 13.59323801012809, "grad_norm": 1.352249264717102, "learning_rate": 1.406783209883796e-05, "loss": 0.3733, "num_input_tokens_seen": 52986648, "step": 91265 }, { "epoch": 13.59398272266905, "grad_norm": 2.3060290813446045, "learning_rate": 1.4064909914503537e-05, "loss": 0.6992, "num_input_tokens_seen": 52989816, "step": 91270 }, { "epoch": 13.59472743521001, "grad_norm": 1.530909538269043, "learning_rate": 1.4061987914906354e-05, "loss": 0.5395, "num_input_tokens_seen": 52992568, "step": 91275 }, { "epoch": 13.595472147750968, "grad_norm": 1.6092759370803833, "learning_rate": 1.4059066100095763e-05, "loss": 0.582, "num_input_tokens_seen": 52995384, "step": 91280 }, { "epoch": 13.596216860291927, "grad_norm": 2.3388047218322754, "learning_rate": 1.4056144470121137e-05, "loss": 0.5762, "num_input_tokens_seen": 52998360, "step": 91285 }, { "epoch": 13.596961572832887, "grad_norm": 1.789753794670105, "learning_rate": 1.405322302503183e-05, "loss": 0.604, "num_input_tokens_seen": 53001240, "step": 91290 }, { "epoch": 13.597706285373846, "grad_norm": 1.7910728454589844, "learning_rate": 1.4050301764877183e-05, "loss": 0.5747, "num_input_tokens_seen": 53004088, "step": 91295 }, { "epoch": 13.598450997914805, "grad_norm": 1.566158413887024, "learning_rate": 1.4047380689706568e-05, "loss": 0.5344, "num_input_tokens_seen": 53006904, "step": 91300 }, { "epoch": 13.599195710455763, "grad_norm": 0.9076380729675293, "learning_rate": 1.4044459799569316e-05, "loss": 0.4997, "num_input_tokens_seen": 53009752, "step": 91305 }, { "epoch": 13.599940422996724, "grad_norm": 1.966834545135498, "learning_rate": 1.4041539094514788e-05, "loss": 0.6602, "num_input_tokens_seen": 53012920, "step": 91310 }, { "epoch": 13.600685135537683, "grad_norm": 2.079684257507324, "learning_rate": 1.4038618574592322e-05, "loss": 0.6581, "num_input_tokens_seen": 53015576, "step": 91315 }, { "epoch": 13.601429848078642, "grad_norm": 1.7786046266555786, "learning_rate": 1.4035698239851253e-05, "loss": 0.5416, "num_input_tokens_seen": 53018680, "step": 91320 }, { "epoch": 13.6021745606196, "grad_norm": 1.5822336673736572, "learning_rate": 1.403277809034092e-05, "loss": 0.5211, "num_input_tokens_seen": 53021912, "step": 91325 }, { "epoch": 13.60291927316056, "grad_norm": 1.9316939115524292, "learning_rate": 1.4029858126110645e-05, "loss": 0.4853, "num_input_tokens_seen": 53024472, "step": 91330 }, { "epoch": 13.60366398570152, "grad_norm": 1.2122188806533813, "learning_rate": 1.4026938347209778e-05, "loss": 0.6002, "num_input_tokens_seen": 53027352, "step": 91335 }, { "epoch": 13.604408698242478, "grad_norm": 1.8822566270828247, "learning_rate": 1.4024018753687624e-05, "loss": 0.7042, "num_input_tokens_seen": 53030328, "step": 91340 }, { "epoch": 13.605153410783437, "grad_norm": 1.6628071069717407, "learning_rate": 1.4021099345593524e-05, "loss": 0.4913, "num_input_tokens_seen": 53033240, "step": 91345 }, { "epoch": 13.605898123324398, "grad_norm": 0.9345501661300659, "learning_rate": 1.4018180122976788e-05, "loss": 0.3581, "num_input_tokens_seen": 53036536, "step": 91350 }, { "epoch": 13.606642835865356, "grad_norm": 1.6627389192581177, "learning_rate": 1.4015261085886743e-05, "loss": 0.5213, "num_input_tokens_seen": 53039224, "step": 91355 }, { "epoch": 13.607387548406315, "grad_norm": 0.9593519568443298, "learning_rate": 1.4012342234372688e-05, "loss": 0.6937, "num_input_tokens_seen": 53042680, "step": 91360 }, { "epoch": 13.608132260947274, "grad_norm": 2.845118522644043, "learning_rate": 1.4009423568483957e-05, "loss": 0.6722, "num_input_tokens_seen": 53045560, "step": 91365 }, { "epoch": 13.608876973488233, "grad_norm": 2.12343430519104, "learning_rate": 1.4006505088269841e-05, "loss": 0.4904, "num_input_tokens_seen": 53048408, "step": 91370 }, { "epoch": 13.609621686029193, "grad_norm": 1.1809300184249878, "learning_rate": 1.4003586793779641e-05, "loss": 0.7106, "num_input_tokens_seen": 53051704, "step": 91375 }, { "epoch": 13.610366398570152, "grad_norm": 0.7563309669494629, "learning_rate": 1.4000668685062674e-05, "loss": 0.4165, "num_input_tokens_seen": 53054424, "step": 91380 }, { "epoch": 13.61111111111111, "grad_norm": 1.168715476989746, "learning_rate": 1.3997750762168232e-05, "loss": 0.6672, "num_input_tokens_seen": 53057464, "step": 91385 }, { "epoch": 13.61185582365207, "grad_norm": 2.141601085662842, "learning_rate": 1.3994833025145607e-05, "loss": 0.5884, "num_input_tokens_seen": 53060536, "step": 91390 }, { "epoch": 13.61260053619303, "grad_norm": 1.7769440412521362, "learning_rate": 1.3991915474044081e-05, "loss": 0.7024, "num_input_tokens_seen": 53063576, "step": 91395 }, { "epoch": 13.613345248733989, "grad_norm": 1.3354122638702393, "learning_rate": 1.398899810891297e-05, "loss": 0.729, "num_input_tokens_seen": 53066744, "step": 91400 }, { "epoch": 13.614089961274948, "grad_norm": 0.7636323571205139, "learning_rate": 1.3986080929801543e-05, "loss": 0.5808, "num_input_tokens_seen": 53070008, "step": 91405 }, { "epoch": 13.614834673815906, "grad_norm": 2.8158059120178223, "learning_rate": 1.3983163936759072e-05, "loss": 0.5961, "num_input_tokens_seen": 53073080, "step": 91410 }, { "epoch": 13.615579386356867, "grad_norm": 1.530608892440796, "learning_rate": 1.398024712983486e-05, "loss": 0.6049, "num_input_tokens_seen": 53076216, "step": 91415 }, { "epoch": 13.616324098897826, "grad_norm": 1.8236510753631592, "learning_rate": 1.3977330509078165e-05, "loss": 0.7571, "num_input_tokens_seen": 53079256, "step": 91420 }, { "epoch": 13.617068811438784, "grad_norm": 0.9040523171424866, "learning_rate": 1.3974414074538277e-05, "loss": 0.7319, "num_input_tokens_seen": 53081976, "step": 91425 }, { "epoch": 13.617813523979743, "grad_norm": 1.9802881479263306, "learning_rate": 1.3971497826264448e-05, "loss": 0.4947, "num_input_tokens_seen": 53084632, "step": 91430 }, { "epoch": 13.618558236520704, "grad_norm": 1.5650570392608643, "learning_rate": 1.3968581764305965e-05, "loss": 0.4864, "num_input_tokens_seen": 53087672, "step": 91435 }, { "epoch": 13.619302949061662, "grad_norm": 2.3979501724243164, "learning_rate": 1.396566588871208e-05, "loss": 0.736, "num_input_tokens_seen": 53090968, "step": 91440 }, { "epoch": 13.620047661602621, "grad_norm": 1.647329568862915, "learning_rate": 1.3962750199532042e-05, "loss": 0.7691, "num_input_tokens_seen": 53093912, "step": 91445 }, { "epoch": 13.62079237414358, "grad_norm": 2.331743001937866, "learning_rate": 1.3959834696815138e-05, "loss": 0.5883, "num_input_tokens_seen": 53096728, "step": 91450 }, { "epoch": 13.62153708668454, "grad_norm": 2.383720636367798, "learning_rate": 1.39569193806106e-05, "loss": 0.7849, "num_input_tokens_seen": 53099640, "step": 91455 }, { "epoch": 13.6222817992255, "grad_norm": 1.3771389722824097, "learning_rate": 1.395400425096769e-05, "loss": 0.7794, "num_input_tokens_seen": 53102680, "step": 91460 }, { "epoch": 13.623026511766458, "grad_norm": 1.6009718179702759, "learning_rate": 1.3951089307935639e-05, "loss": 0.4956, "num_input_tokens_seen": 53105528, "step": 91465 }, { "epoch": 13.623771224307417, "grad_norm": 1.7563467025756836, "learning_rate": 1.3948174551563713e-05, "loss": 0.4861, "num_input_tokens_seen": 53109656, "step": 91470 }, { "epoch": 13.624515936848377, "grad_norm": 1.3875997066497803, "learning_rate": 1.3945259981901138e-05, "loss": 0.5211, "num_input_tokens_seen": 53112440, "step": 91475 }, { "epoch": 13.625260649389336, "grad_norm": 0.8581035733222961, "learning_rate": 1.394234559899717e-05, "loss": 0.6904, "num_input_tokens_seen": 53115160, "step": 91480 }, { "epoch": 13.626005361930295, "grad_norm": 1.6780545711517334, "learning_rate": 1.3939431402901034e-05, "loss": 0.4754, "num_input_tokens_seen": 53118136, "step": 91485 }, { "epoch": 13.626750074471254, "grad_norm": 1.7962467670440674, "learning_rate": 1.3936517393661955e-05, "loss": 0.5609, "num_input_tokens_seen": 53120952, "step": 91490 }, { "epoch": 13.627494787012214, "grad_norm": 1.6596120595932007, "learning_rate": 1.393360357132918e-05, "loss": 0.6136, "num_input_tokens_seen": 53124152, "step": 91495 }, { "epoch": 13.628239499553173, "grad_norm": 1.5438472032546997, "learning_rate": 1.3930689935951913e-05, "loss": 0.4707, "num_input_tokens_seen": 53126904, "step": 91500 }, { "epoch": 13.628984212094132, "grad_norm": 1.3637361526489258, "learning_rate": 1.3927776487579397e-05, "loss": 0.5957, "num_input_tokens_seen": 53129816, "step": 91505 }, { "epoch": 13.62972892463509, "grad_norm": 2.1055867671966553, "learning_rate": 1.3924863226260849e-05, "loss": 0.6456, "num_input_tokens_seen": 53132792, "step": 91510 }, { "epoch": 13.63047363717605, "grad_norm": 1.193041443824768, "learning_rate": 1.3921950152045477e-05, "loss": 0.6216, "num_input_tokens_seen": 53135640, "step": 91515 }, { "epoch": 13.63121834971701, "grad_norm": 1.4593462944030762, "learning_rate": 1.391903726498249e-05, "loss": 0.4011, "num_input_tokens_seen": 53138584, "step": 91520 }, { "epoch": 13.631963062257968, "grad_norm": 1.616316318511963, "learning_rate": 1.3916124565121114e-05, "loss": 0.5479, "num_input_tokens_seen": 53141240, "step": 91525 }, { "epoch": 13.632707774798927, "grad_norm": 1.4578150510787964, "learning_rate": 1.391321205251055e-05, "loss": 0.54, "num_input_tokens_seen": 53144120, "step": 91530 }, { "epoch": 13.633452487339888, "grad_norm": 1.2609282732009888, "learning_rate": 1.3910299727199991e-05, "loss": 0.4791, "num_input_tokens_seen": 53146808, "step": 91535 }, { "epoch": 13.634197199880846, "grad_norm": 1.033753752708435, "learning_rate": 1.3907387589238657e-05, "loss": 0.4726, "num_input_tokens_seen": 53149720, "step": 91540 }, { "epoch": 13.634941912421805, "grad_norm": 2.17362380027771, "learning_rate": 1.3904475638675724e-05, "loss": 0.7435, "num_input_tokens_seen": 53153016, "step": 91545 }, { "epoch": 13.635686624962764, "grad_norm": 2.7652294635772705, "learning_rate": 1.3901563875560408e-05, "loss": 0.6279, "num_input_tokens_seen": 53156216, "step": 91550 }, { "epoch": 13.636431337503723, "grad_norm": 2.3863043785095215, "learning_rate": 1.3898652299941883e-05, "loss": 0.6522, "num_input_tokens_seen": 53159160, "step": 91555 }, { "epoch": 13.637176050044683, "grad_norm": 1.2926656007766724, "learning_rate": 1.3895740911869351e-05, "loss": 0.5728, "num_input_tokens_seen": 53162136, "step": 91560 }, { "epoch": 13.637920762585642, "grad_norm": 1.8171526193618774, "learning_rate": 1.389282971139199e-05, "loss": 0.5377, "num_input_tokens_seen": 53165112, "step": 91565 }, { "epoch": 13.6386654751266, "grad_norm": 1.4706125259399414, "learning_rate": 1.3889918698558976e-05, "loss": 0.5726, "num_input_tokens_seen": 53167928, "step": 91570 }, { "epoch": 13.63941018766756, "grad_norm": 1.4703915119171143, "learning_rate": 1.3887007873419503e-05, "loss": 0.5643, "num_input_tokens_seen": 53171256, "step": 91575 }, { "epoch": 13.64015490020852, "grad_norm": 1.646037220954895, "learning_rate": 1.3884097236022736e-05, "loss": 0.576, "num_input_tokens_seen": 53174104, "step": 91580 }, { "epoch": 13.640899612749479, "grad_norm": 1.6906172037124634, "learning_rate": 1.3881186786417848e-05, "loss": 0.4448, "num_input_tokens_seen": 53176920, "step": 91585 }, { "epoch": 13.641644325290438, "grad_norm": 1.5204659700393677, "learning_rate": 1.3878276524654e-05, "loss": 0.6897, "num_input_tokens_seen": 53179832, "step": 91590 }, { "epoch": 13.642389037831396, "grad_norm": 1.6701568365097046, "learning_rate": 1.3875366450780375e-05, "loss": 0.7961, "num_input_tokens_seen": 53182616, "step": 91595 }, { "epoch": 13.643133750372357, "grad_norm": 2.08223819732666, "learning_rate": 1.387245656484612e-05, "loss": 0.422, "num_input_tokens_seen": 53185624, "step": 91600 }, { "epoch": 13.643878462913316, "grad_norm": 1.3599482774734497, "learning_rate": 1.3869546866900409e-05, "loss": 0.6716, "num_input_tokens_seen": 53188312, "step": 91605 }, { "epoch": 13.644623175454274, "grad_norm": 0.7232316732406616, "learning_rate": 1.3866637356992393e-05, "loss": 0.7123, "num_input_tokens_seen": 53191128, "step": 91610 }, { "epoch": 13.645367887995233, "grad_norm": 1.5602236986160278, "learning_rate": 1.3863728035171214e-05, "loss": 0.7855, "num_input_tokens_seen": 53193976, "step": 91615 }, { "epoch": 13.646112600536194, "grad_norm": 2.5342612266540527, "learning_rate": 1.386081890148604e-05, "loss": 0.5901, "num_input_tokens_seen": 53196920, "step": 91620 }, { "epoch": 13.646857313077152, "grad_norm": 0.9480149745941162, "learning_rate": 1.3857909955985999e-05, "loss": 0.5921, "num_input_tokens_seen": 53199608, "step": 91625 }, { "epoch": 13.647602025618111, "grad_norm": 1.8320833444595337, "learning_rate": 1.3855001198720255e-05, "loss": 0.4861, "num_input_tokens_seen": 53202424, "step": 91630 }, { "epoch": 13.64834673815907, "grad_norm": 0.8646575212478638, "learning_rate": 1.3852092629737928e-05, "loss": 0.4162, "num_input_tokens_seen": 53205080, "step": 91635 }, { "epoch": 13.64909145070003, "grad_norm": 1.0621141195297241, "learning_rate": 1.3849184249088176e-05, "loss": 0.5506, "num_input_tokens_seen": 53207960, "step": 91640 }, { "epoch": 13.64983616324099, "grad_norm": 2.5641510486602783, "learning_rate": 1.3846276056820123e-05, "loss": 0.4322, "num_input_tokens_seen": 53210648, "step": 91645 }, { "epoch": 13.650580875781948, "grad_norm": 0.9266141653060913, "learning_rate": 1.3843368052982903e-05, "loss": 0.6133, "num_input_tokens_seen": 53213752, "step": 91650 }, { "epoch": 13.651325588322907, "grad_norm": 1.4128419160842896, "learning_rate": 1.3840460237625635e-05, "loss": 0.7705, "num_input_tokens_seen": 53216728, "step": 91655 }, { "epoch": 13.652070300863867, "grad_norm": 0.6684879064559937, "learning_rate": 1.3837552610797444e-05, "loss": 0.4138, "num_input_tokens_seen": 53219448, "step": 91660 }, { "epoch": 13.652815013404826, "grad_norm": 1.150653600692749, "learning_rate": 1.3834645172547467e-05, "loss": 0.3955, "num_input_tokens_seen": 53222392, "step": 91665 }, { "epoch": 13.653559725945785, "grad_norm": 2.6664137840270996, "learning_rate": 1.3831737922924798e-05, "loss": 0.6072, "num_input_tokens_seen": 53225080, "step": 91670 }, { "epoch": 13.654304438486744, "grad_norm": 1.3378335237503052, "learning_rate": 1.3828830861978579e-05, "loss": 0.5938, "num_input_tokens_seen": 53228088, "step": 91675 }, { "epoch": 13.655049151027704, "grad_norm": 1.8885067701339722, "learning_rate": 1.3825923989757896e-05, "loss": 0.6101, "num_input_tokens_seen": 53230936, "step": 91680 }, { "epoch": 13.655793863568663, "grad_norm": 1.3355752229690552, "learning_rate": 1.382301730631188e-05, "loss": 0.6603, "num_input_tokens_seen": 53234008, "step": 91685 }, { "epoch": 13.656538576109622, "grad_norm": 3.1761224269866943, "learning_rate": 1.382011081168963e-05, "loss": 0.6625, "num_input_tokens_seen": 53236856, "step": 91690 }, { "epoch": 13.65728328865058, "grad_norm": 1.340343713760376, "learning_rate": 1.3817204505940235e-05, "loss": 0.511, "num_input_tokens_seen": 53239608, "step": 91695 }, { "epoch": 13.65802800119154, "grad_norm": 0.9549660682678223, "learning_rate": 1.3814298389112811e-05, "loss": 0.7402, "num_input_tokens_seen": 53242488, "step": 91700 }, { "epoch": 13.6587727137325, "grad_norm": 1.0770471096038818, "learning_rate": 1.381139246125644e-05, "loss": 0.5159, "num_input_tokens_seen": 53245368, "step": 91705 }, { "epoch": 13.659517426273458, "grad_norm": 1.9125627279281616, "learning_rate": 1.3808486722420233e-05, "loss": 0.7345, "num_input_tokens_seen": 53248216, "step": 91710 }, { "epoch": 13.660262138814417, "grad_norm": 1.37763512134552, "learning_rate": 1.3805581172653265e-05, "loss": 0.6148, "num_input_tokens_seen": 53251096, "step": 91715 }, { "epoch": 13.661006851355378, "grad_norm": 0.8751893043518066, "learning_rate": 1.3802675812004626e-05, "loss": 0.425, "num_input_tokens_seen": 53254104, "step": 91720 }, { "epoch": 13.661751563896336, "grad_norm": 1.5816452503204346, "learning_rate": 1.3799770640523398e-05, "loss": 0.6093, "num_input_tokens_seen": 53257048, "step": 91725 }, { "epoch": 13.662496276437295, "grad_norm": 1.432116150856018, "learning_rate": 1.3796865658258654e-05, "loss": 0.6765, "num_input_tokens_seen": 53260280, "step": 91730 }, { "epoch": 13.663240988978254, "grad_norm": 1.661849856376648, "learning_rate": 1.3793960865259486e-05, "loss": 0.472, "num_input_tokens_seen": 53263320, "step": 91735 }, { "epoch": 13.663985701519213, "grad_norm": 2.712928056716919, "learning_rate": 1.3791056261574952e-05, "loss": 0.7117, "num_input_tokens_seen": 53266104, "step": 91740 }, { "epoch": 13.664730414060173, "grad_norm": 1.4544389247894287, "learning_rate": 1.3788151847254139e-05, "loss": 0.5707, "num_input_tokens_seen": 53268920, "step": 91745 }, { "epoch": 13.665475126601132, "grad_norm": 2.2676241397857666, "learning_rate": 1.3785247622346098e-05, "loss": 0.7046, "num_input_tokens_seen": 53271736, "step": 91750 }, { "epoch": 13.66621983914209, "grad_norm": 1.9485660791397095, "learning_rate": 1.3782343586899906e-05, "loss": 0.7686, "num_input_tokens_seen": 53274808, "step": 91755 }, { "epoch": 13.66696455168305, "grad_norm": 1.664207935333252, "learning_rate": 1.377943974096461e-05, "loss": 0.5422, "num_input_tokens_seen": 53277592, "step": 91760 }, { "epoch": 13.66770926422401, "grad_norm": 1.3262519836425781, "learning_rate": 1.3776536084589287e-05, "loss": 0.7014, "num_input_tokens_seen": 53280568, "step": 91765 }, { "epoch": 13.668453976764969, "grad_norm": 2.4699952602386475, "learning_rate": 1.377363261782298e-05, "loss": 0.6339, "num_input_tokens_seen": 53283512, "step": 91770 }, { "epoch": 13.669198689305928, "grad_norm": 1.490035057067871, "learning_rate": 1.3770729340714728e-05, "loss": 0.6381, "num_input_tokens_seen": 53286872, "step": 91775 }, { "epoch": 13.669943401846886, "grad_norm": 1.3560062646865845, "learning_rate": 1.3767826253313599e-05, "loss": 0.5198, "num_input_tokens_seen": 53290168, "step": 91780 }, { "epoch": 13.670688114387847, "grad_norm": 1.0784891843795776, "learning_rate": 1.376492335566863e-05, "loss": 0.4545, "num_input_tokens_seen": 53293464, "step": 91785 }, { "epoch": 13.671432826928806, "grad_norm": 1.296295166015625, "learning_rate": 1.3762020647828866e-05, "loss": 0.5653, "num_input_tokens_seen": 53296344, "step": 91790 }, { "epoch": 13.672177539469764, "grad_norm": 1.63676118850708, "learning_rate": 1.375911812984333e-05, "loss": 0.5549, "num_input_tokens_seen": 53299480, "step": 91795 }, { "epoch": 13.672922252010723, "grad_norm": 2.1065850257873535, "learning_rate": 1.3756215801761074e-05, "loss": 0.7429, "num_input_tokens_seen": 53302104, "step": 91800 }, { "epoch": 13.673666964551684, "grad_norm": 1.749738097190857, "learning_rate": 1.3753313663631119e-05, "loss": 0.663, "num_input_tokens_seen": 53305144, "step": 91805 }, { "epoch": 13.674411677092642, "grad_norm": 2.4198713302612305, "learning_rate": 1.375041171550251e-05, "loss": 0.507, "num_input_tokens_seen": 53307864, "step": 91810 }, { "epoch": 13.675156389633601, "grad_norm": 2.2249228954315186, "learning_rate": 1.3747509957424259e-05, "loss": 0.6095, "num_input_tokens_seen": 53310616, "step": 91815 }, { "epoch": 13.67590110217456, "grad_norm": 1.466249942779541, "learning_rate": 1.3744608389445379e-05, "loss": 0.6376, "num_input_tokens_seen": 53313432, "step": 91820 }, { "epoch": 13.67664581471552, "grad_norm": 1.4055684804916382, "learning_rate": 1.3741707011614912e-05, "loss": 0.4968, "num_input_tokens_seen": 53316056, "step": 91825 }, { "epoch": 13.67739052725648, "grad_norm": 1.360268235206604, "learning_rate": 1.3738805823981857e-05, "loss": 0.439, "num_input_tokens_seen": 53318680, "step": 91830 }, { "epoch": 13.678135239797438, "grad_norm": 1.4211562871932983, "learning_rate": 1.3735904826595236e-05, "loss": 0.5971, "num_input_tokens_seen": 53321688, "step": 91835 }, { "epoch": 13.678879952338397, "grad_norm": 2.295177459716797, "learning_rate": 1.3733004019504058e-05, "loss": 0.5907, "num_input_tokens_seen": 53324152, "step": 91840 }, { "epoch": 13.679624664879357, "grad_norm": 2.3021342754364014, "learning_rate": 1.3730103402757327e-05, "loss": 0.4729, "num_input_tokens_seen": 53327096, "step": 91845 }, { "epoch": 13.680369377420316, "grad_norm": 1.6192981004714966, "learning_rate": 1.3727202976404033e-05, "loss": 0.5991, "num_input_tokens_seen": 53329912, "step": 91850 }, { "epoch": 13.681114089961275, "grad_norm": 2.4541380405426025, "learning_rate": 1.3724302740493198e-05, "loss": 0.7616, "num_input_tokens_seen": 53332632, "step": 91855 }, { "epoch": 13.681858802502234, "grad_norm": 1.2932311296463013, "learning_rate": 1.372140269507381e-05, "loss": 0.5076, "num_input_tokens_seen": 53335544, "step": 91860 }, { "epoch": 13.682603515043194, "grad_norm": 1.5917637348175049, "learning_rate": 1.3718502840194847e-05, "loss": 0.5844, "num_input_tokens_seen": 53338456, "step": 91865 }, { "epoch": 13.683348227584153, "grad_norm": 1.186302661895752, "learning_rate": 1.3715603175905322e-05, "loss": 0.7919, "num_input_tokens_seen": 53341464, "step": 91870 }, { "epoch": 13.684092940125112, "grad_norm": 1.020400047302246, "learning_rate": 1.3712703702254203e-05, "loss": 0.5611, "num_input_tokens_seen": 53344568, "step": 91875 }, { "epoch": 13.68483765266607, "grad_norm": 2.4313948154449463, "learning_rate": 1.3709804419290496e-05, "loss": 0.3417, "num_input_tokens_seen": 53347096, "step": 91880 }, { "epoch": 13.68558236520703, "grad_norm": 2.312410831451416, "learning_rate": 1.3706905327063158e-05, "loss": 0.6523, "num_input_tokens_seen": 53350008, "step": 91885 }, { "epoch": 13.68632707774799, "grad_norm": 2.151304006576538, "learning_rate": 1.3704006425621185e-05, "loss": 0.5401, "num_input_tokens_seen": 53353208, "step": 91890 }, { "epoch": 13.687071790288948, "grad_norm": 0.8528792858123779, "learning_rate": 1.3701107715013542e-05, "loss": 0.718, "num_input_tokens_seen": 53356120, "step": 91895 }, { "epoch": 13.687816502829907, "grad_norm": 1.9081175327301025, "learning_rate": 1.369820919528919e-05, "loss": 0.7628, "num_input_tokens_seen": 53358840, "step": 91900 }, { "epoch": 13.688561215370868, "grad_norm": 1.2437262535095215, "learning_rate": 1.3695310866497119e-05, "loss": 0.6579, "num_input_tokens_seen": 53361880, "step": 91905 }, { "epoch": 13.689305927911827, "grad_norm": 2.466719150543213, "learning_rate": 1.3692412728686282e-05, "loss": 0.6176, "num_input_tokens_seen": 53365016, "step": 91910 }, { "epoch": 13.690050640452785, "grad_norm": 1.3664838075637817, "learning_rate": 1.3689514781905638e-05, "loss": 0.5661, "num_input_tokens_seen": 53367864, "step": 91915 }, { "epoch": 13.690795352993744, "grad_norm": 1.1125224828720093, "learning_rate": 1.3686617026204138e-05, "loss": 0.4494, "num_input_tokens_seen": 53370776, "step": 91920 }, { "epoch": 13.691540065534703, "grad_norm": 1.3592008352279663, "learning_rate": 1.368371946163075e-05, "loss": 0.6917, "num_input_tokens_seen": 53373848, "step": 91925 }, { "epoch": 13.692284778075663, "grad_norm": 2.3779492378234863, "learning_rate": 1.3680822088234427e-05, "loss": 0.5937, "num_input_tokens_seen": 53376856, "step": 91930 }, { "epoch": 13.693029490616622, "grad_norm": 1.456801176071167, "learning_rate": 1.3677924906064097e-05, "loss": 0.7549, "num_input_tokens_seen": 53379928, "step": 91935 }, { "epoch": 13.69377420315758, "grad_norm": 0.9972853660583496, "learning_rate": 1.3675027915168729e-05, "loss": 0.5132, "num_input_tokens_seen": 53383032, "step": 91940 }, { "epoch": 13.69451891569854, "grad_norm": 1.9625385999679565, "learning_rate": 1.3672131115597241e-05, "loss": 0.6357, "num_input_tokens_seen": 53385784, "step": 91945 }, { "epoch": 13.6952636282395, "grad_norm": 0.9900276064872742, "learning_rate": 1.3669234507398601e-05, "loss": 0.5506, "num_input_tokens_seen": 53388856, "step": 91950 }, { "epoch": 13.696008340780459, "grad_norm": 1.1547369956970215, "learning_rate": 1.3666338090621716e-05, "loss": 0.5113, "num_input_tokens_seen": 53391672, "step": 91955 }, { "epoch": 13.696753053321418, "grad_norm": 1.7396161556243896, "learning_rate": 1.3663441865315538e-05, "loss": 0.5898, "num_input_tokens_seen": 53394392, "step": 91960 }, { "epoch": 13.697497765862376, "grad_norm": 1.3298248052597046, "learning_rate": 1.3660545831528975e-05, "loss": 0.4388, "num_input_tokens_seen": 53397336, "step": 91965 }, { "epoch": 13.698242478403337, "grad_norm": 0.9834871888160706, "learning_rate": 1.365764998931098e-05, "loss": 0.5404, "num_input_tokens_seen": 53400280, "step": 91970 }, { "epoch": 13.698987190944296, "grad_norm": 1.4151480197906494, "learning_rate": 1.365475433871046e-05, "loss": 0.6542, "num_input_tokens_seen": 53403224, "step": 91975 }, { "epoch": 13.699731903485254, "grad_norm": 1.4972766637802124, "learning_rate": 1.3651858879776336e-05, "loss": 0.49, "num_input_tokens_seen": 53406168, "step": 91980 }, { "epoch": 13.700476616026213, "grad_norm": 1.5818617343902588, "learning_rate": 1.3648963612557519e-05, "loss": 0.6333, "num_input_tokens_seen": 53409528, "step": 91985 }, { "epoch": 13.701221328567174, "grad_norm": 1.9584987163543701, "learning_rate": 1.3646068537102916e-05, "loss": 0.6712, "num_input_tokens_seen": 53412504, "step": 91990 }, { "epoch": 13.701966041108133, "grad_norm": 1.097146987915039, "learning_rate": 1.3643173653461454e-05, "loss": 0.4363, "num_input_tokens_seen": 53415416, "step": 91995 }, { "epoch": 13.702710753649091, "grad_norm": 1.0301545858383179, "learning_rate": 1.3640278961682023e-05, "loss": 0.7658, "num_input_tokens_seen": 53418232, "step": 92000 }, { "epoch": 13.70345546619005, "grad_norm": 1.854252815246582, "learning_rate": 1.3637384461813546e-05, "loss": 0.6533, "num_input_tokens_seen": 53421112, "step": 92005 }, { "epoch": 13.70420017873101, "grad_norm": 1.2680612802505493, "learning_rate": 1.3634490153904905e-05, "loss": 0.5711, "num_input_tokens_seen": 53424120, "step": 92010 }, { "epoch": 13.70494489127197, "grad_norm": 0.662034273147583, "learning_rate": 1.3631596038004994e-05, "loss": 0.4747, "num_input_tokens_seen": 53426968, "step": 92015 }, { "epoch": 13.705689603812928, "grad_norm": 1.485537052154541, "learning_rate": 1.3628702114162722e-05, "loss": 0.5262, "num_input_tokens_seen": 53429656, "step": 92020 }, { "epoch": 13.706434316353887, "grad_norm": 1.7516751289367676, "learning_rate": 1.3625808382426964e-05, "loss": 0.5453, "num_input_tokens_seen": 53432696, "step": 92025 }, { "epoch": 13.707179028894847, "grad_norm": 2.2199623584747314, "learning_rate": 1.3622914842846619e-05, "loss": 0.602, "num_input_tokens_seen": 53435512, "step": 92030 }, { "epoch": 13.707923741435806, "grad_norm": 1.3111093044281006, "learning_rate": 1.3620021495470556e-05, "loss": 0.5621, "num_input_tokens_seen": 53438392, "step": 92035 }, { "epoch": 13.708668453976765, "grad_norm": 1.445586919784546, "learning_rate": 1.3617128340347673e-05, "loss": 0.3834, "num_input_tokens_seen": 53441176, "step": 92040 }, { "epoch": 13.709413166517724, "grad_norm": 1.8877445459365845, "learning_rate": 1.361423537752684e-05, "loss": 0.5346, "num_input_tokens_seen": 53444088, "step": 92045 }, { "epoch": 13.710157879058684, "grad_norm": 1.551920771598816, "learning_rate": 1.3611342607056925e-05, "loss": 0.4766, "num_input_tokens_seen": 53447096, "step": 92050 }, { "epoch": 13.710902591599643, "grad_norm": 2.400059938430786, "learning_rate": 1.3608450028986804e-05, "loss": 0.5527, "num_input_tokens_seen": 53449912, "step": 92055 }, { "epoch": 13.711647304140602, "grad_norm": 1.5398472547531128, "learning_rate": 1.3605557643365333e-05, "loss": 0.563, "num_input_tokens_seen": 53452920, "step": 92060 }, { "epoch": 13.71239201668156, "grad_norm": 1.379378318786621, "learning_rate": 1.3602665450241392e-05, "loss": 0.6332, "num_input_tokens_seen": 53455704, "step": 92065 }, { "epoch": 13.71313672922252, "grad_norm": 1.536741018295288, "learning_rate": 1.3599773449663828e-05, "loss": 0.8026, "num_input_tokens_seen": 53458584, "step": 92070 }, { "epoch": 13.71388144176348, "grad_norm": 1.9928467273712158, "learning_rate": 1.3596881641681513e-05, "loss": 0.5402, "num_input_tokens_seen": 53461784, "step": 92075 }, { "epoch": 13.714626154304439, "grad_norm": 1.9725127220153809, "learning_rate": 1.3593990026343284e-05, "loss": 0.4496, "num_input_tokens_seen": 53464376, "step": 92080 }, { "epoch": 13.715370866845397, "grad_norm": 4.407564640045166, "learning_rate": 1.3591098603698007e-05, "loss": 0.595, "num_input_tokens_seen": 53467544, "step": 92085 }, { "epoch": 13.716115579386356, "grad_norm": 0.9891673922538757, "learning_rate": 1.3588207373794526e-05, "loss": 0.4083, "num_input_tokens_seen": 53470296, "step": 92090 }, { "epoch": 13.716860291927317, "grad_norm": 1.9574954509735107, "learning_rate": 1.3585316336681675e-05, "loss": 0.5224, "num_input_tokens_seen": 53473432, "step": 92095 }, { "epoch": 13.717605004468275, "grad_norm": 2.5416765213012695, "learning_rate": 1.3582425492408313e-05, "loss": 0.6559, "num_input_tokens_seen": 53476600, "step": 92100 }, { "epoch": 13.718349717009234, "grad_norm": 1.719932198524475, "learning_rate": 1.3579534841023256e-05, "loss": 0.6324, "num_input_tokens_seen": 53479448, "step": 92105 }, { "epoch": 13.719094429550193, "grad_norm": 1.8530155420303345, "learning_rate": 1.357664438257536e-05, "loss": 0.5443, "num_input_tokens_seen": 53482136, "step": 92110 }, { "epoch": 13.719839142091153, "grad_norm": 1.758423089981079, "learning_rate": 1.3573754117113446e-05, "loss": 0.5639, "num_input_tokens_seen": 53484984, "step": 92115 }, { "epoch": 13.720583854632112, "grad_norm": 1.2071470022201538, "learning_rate": 1.3570864044686349e-05, "loss": 0.698, "num_input_tokens_seen": 53487800, "step": 92120 }, { "epoch": 13.721328567173071, "grad_norm": 1.9591639041900635, "learning_rate": 1.3567974165342873e-05, "loss": 0.5324, "num_input_tokens_seen": 53490648, "step": 92125 }, { "epoch": 13.72207327971403, "grad_norm": 1.4795092344284058, "learning_rate": 1.3565084479131865e-05, "loss": 0.5711, "num_input_tokens_seen": 53493304, "step": 92130 }, { "epoch": 13.72281799225499, "grad_norm": 1.3018066883087158, "learning_rate": 1.3562194986102134e-05, "loss": 0.4332, "num_input_tokens_seen": 53496248, "step": 92135 }, { "epoch": 13.723562704795949, "grad_norm": 1.9592406749725342, "learning_rate": 1.3559305686302482e-05, "loss": 0.6931, "num_input_tokens_seen": 53499256, "step": 92140 }, { "epoch": 13.724307417336908, "grad_norm": 1.9354004859924316, "learning_rate": 1.3556416579781745e-05, "loss": 0.5919, "num_input_tokens_seen": 53502072, "step": 92145 }, { "epoch": 13.725052129877866, "grad_norm": 1.6089704036712646, "learning_rate": 1.355352766658871e-05, "loss": 0.5223, "num_input_tokens_seen": 53505176, "step": 92150 }, { "epoch": 13.725796842418827, "grad_norm": 1.7237285375595093, "learning_rate": 1.3550638946772198e-05, "loss": 0.4784, "num_input_tokens_seen": 53508056, "step": 92155 }, { "epoch": 13.726541554959786, "grad_norm": 1.508995532989502, "learning_rate": 1.3547750420380994e-05, "loss": 0.6918, "num_input_tokens_seen": 53510808, "step": 92160 }, { "epoch": 13.727286267500745, "grad_norm": 1.6886746883392334, "learning_rate": 1.3544862087463922e-05, "loss": 0.5615, "num_input_tokens_seen": 53513784, "step": 92165 }, { "epoch": 13.728030980041703, "grad_norm": 1.1271774768829346, "learning_rate": 1.3541973948069757e-05, "loss": 0.5879, "num_input_tokens_seen": 53516472, "step": 92170 }, { "epoch": 13.728775692582664, "grad_norm": 0.9351418614387512, "learning_rate": 1.3539086002247301e-05, "loss": 0.5125, "num_input_tokens_seen": 53519288, "step": 92175 }, { "epoch": 13.729520405123623, "grad_norm": 0.8441421389579773, "learning_rate": 1.3536198250045326e-05, "loss": 0.5544, "num_input_tokens_seen": 53522136, "step": 92180 }, { "epoch": 13.730265117664581, "grad_norm": 1.8260592222213745, "learning_rate": 1.353331069151264e-05, "loss": 0.6192, "num_input_tokens_seen": 53525272, "step": 92185 }, { "epoch": 13.73100983020554, "grad_norm": 2.364224672317505, "learning_rate": 1.3530423326698015e-05, "loss": 0.9599, "num_input_tokens_seen": 53527928, "step": 92190 }, { "epoch": 13.7317545427465, "grad_norm": 2.030100107192993, "learning_rate": 1.3527536155650224e-05, "loss": 0.6459, "num_input_tokens_seen": 53530712, "step": 92195 }, { "epoch": 13.73249925528746, "grad_norm": 1.418293833732605, "learning_rate": 1.3524649178418058e-05, "loss": 0.7887, "num_input_tokens_seen": 53533592, "step": 92200 }, { "epoch": 13.733243967828418, "grad_norm": 2.4218201637268066, "learning_rate": 1.3521762395050272e-05, "loss": 0.7339, "num_input_tokens_seen": 53536568, "step": 92205 }, { "epoch": 13.733988680369377, "grad_norm": 1.1151068210601807, "learning_rate": 1.3518875805595654e-05, "loss": 0.6886, "num_input_tokens_seen": 53539320, "step": 92210 }, { "epoch": 13.734733392910336, "grad_norm": 1.1302062273025513, "learning_rate": 1.3515989410102959e-05, "loss": 0.7464, "num_input_tokens_seen": 53542200, "step": 92215 }, { "epoch": 13.735478105451296, "grad_norm": 3.5691418647766113, "learning_rate": 1.3513103208620941e-05, "loss": 0.5015, "num_input_tokens_seen": 53544952, "step": 92220 }, { "epoch": 13.736222817992255, "grad_norm": 2.4150257110595703, "learning_rate": 1.3510217201198383e-05, "loss": 0.6657, "num_input_tokens_seen": 53547576, "step": 92225 }, { "epoch": 13.736967530533214, "grad_norm": 2.2637743949890137, "learning_rate": 1.3507331387884015e-05, "loss": 0.6654, "num_input_tokens_seen": 53550520, "step": 92230 }, { "epoch": 13.737712243074174, "grad_norm": 1.214159369468689, "learning_rate": 1.3504445768726612e-05, "loss": 0.6564, "num_input_tokens_seen": 53553560, "step": 92235 }, { "epoch": 13.738456955615133, "grad_norm": 0.8568990230560303, "learning_rate": 1.3501560343774917e-05, "loss": 0.5402, "num_input_tokens_seen": 53556312, "step": 92240 }, { "epoch": 13.739201668156092, "grad_norm": 1.1513583660125732, "learning_rate": 1.3498675113077669e-05, "loss": 0.5072, "num_input_tokens_seen": 53559576, "step": 92245 }, { "epoch": 13.73994638069705, "grad_norm": 1.0974376201629639, "learning_rate": 1.3495790076683617e-05, "loss": 0.5857, "num_input_tokens_seen": 53562616, "step": 92250 }, { "epoch": 13.74069109323801, "grad_norm": 1.5673472881317139, "learning_rate": 1.3492905234641492e-05, "loss": 0.604, "num_input_tokens_seen": 53565624, "step": 92255 }, { "epoch": 13.74143580577897, "grad_norm": 1.8527295589447021, "learning_rate": 1.3490020587000046e-05, "loss": 0.6568, "num_input_tokens_seen": 53568728, "step": 92260 }, { "epoch": 13.742180518319929, "grad_norm": 2.9721198081970215, "learning_rate": 1.3487136133807992e-05, "loss": 0.4939, "num_input_tokens_seen": 53571992, "step": 92265 }, { "epoch": 13.742925230860887, "grad_norm": 2.132952928543091, "learning_rate": 1.3484251875114085e-05, "loss": 0.6942, "num_input_tokens_seen": 53574968, "step": 92270 }, { "epoch": 13.743669943401846, "grad_norm": 2.0335373878479004, "learning_rate": 1.3481367810967027e-05, "loss": 0.746, "num_input_tokens_seen": 53577752, "step": 92275 }, { "epoch": 13.744414655942807, "grad_norm": 2.299647569656372, "learning_rate": 1.3478483941415565e-05, "loss": 0.6312, "num_input_tokens_seen": 53581016, "step": 92280 }, { "epoch": 13.745159368483765, "grad_norm": 1.164086937904358, "learning_rate": 1.3475600266508395e-05, "loss": 0.684, "num_input_tokens_seen": 53583832, "step": 92285 }, { "epoch": 13.745904081024724, "grad_norm": 1.6638667583465576, "learning_rate": 1.3472716786294254e-05, "loss": 0.5831, "num_input_tokens_seen": 53586712, "step": 92290 }, { "epoch": 13.746648793565683, "grad_norm": 3.457871198654175, "learning_rate": 1.3469833500821848e-05, "loss": 0.6061, "num_input_tokens_seen": 53589400, "step": 92295 }, { "epoch": 13.747393506106643, "grad_norm": 2.4045567512512207, "learning_rate": 1.3466950410139878e-05, "loss": 0.6355, "num_input_tokens_seen": 53592088, "step": 92300 }, { "epoch": 13.748138218647602, "grad_norm": 2.6836960315704346, "learning_rate": 1.3464067514297069e-05, "loss": 0.6247, "num_input_tokens_seen": 53594904, "step": 92305 }, { "epoch": 13.748882931188561, "grad_norm": 0.8129281997680664, "learning_rate": 1.3461184813342116e-05, "loss": 0.6215, "num_input_tokens_seen": 53597784, "step": 92310 }, { "epoch": 13.74962764372952, "grad_norm": 1.0527626276016235, "learning_rate": 1.3458302307323714e-05, "loss": 0.3548, "num_input_tokens_seen": 53600664, "step": 92315 }, { "epoch": 13.75037235627048, "grad_norm": 1.3767281770706177, "learning_rate": 1.3455419996290558e-05, "loss": 0.5639, "num_input_tokens_seen": 53603800, "step": 92320 }, { "epoch": 13.751117068811439, "grad_norm": 1.3951473236083984, "learning_rate": 1.3452537880291355e-05, "loss": 0.5446, "num_input_tokens_seen": 53606840, "step": 92325 }, { "epoch": 13.751861781352398, "grad_norm": 1.494591474533081, "learning_rate": 1.3449655959374791e-05, "loss": 0.6508, "num_input_tokens_seen": 53609784, "step": 92330 }, { "epoch": 13.752606493893357, "grad_norm": 1.4482697248458862, "learning_rate": 1.3446774233589537e-05, "loss": 0.5816, "num_input_tokens_seen": 53612856, "step": 92335 }, { "epoch": 13.753351206434317, "grad_norm": 1.97062349319458, "learning_rate": 1.3443892702984302e-05, "loss": 0.5817, "num_input_tokens_seen": 53615512, "step": 92340 }, { "epoch": 13.754095918975276, "grad_norm": 1.4446626901626587, "learning_rate": 1.3441011367607743e-05, "loss": 0.6073, "num_input_tokens_seen": 53618360, "step": 92345 }, { "epoch": 13.754840631516235, "grad_norm": 1.890576720237732, "learning_rate": 1.343813022750856e-05, "loss": 0.5388, "num_input_tokens_seen": 53621464, "step": 92350 }, { "epoch": 13.755585344057193, "grad_norm": 1.084934115409851, "learning_rate": 1.3435249282735407e-05, "loss": 0.4971, "num_input_tokens_seen": 53624344, "step": 92355 }, { "epoch": 13.756330056598154, "grad_norm": 2.6416335105895996, "learning_rate": 1.343236853333697e-05, "loss": 0.5893, "num_input_tokens_seen": 53627256, "step": 92360 }, { "epoch": 13.757074769139113, "grad_norm": 2.4162838459014893, "learning_rate": 1.3429487979361905e-05, "loss": 0.5506, "num_input_tokens_seen": 53630168, "step": 92365 }, { "epoch": 13.757819481680071, "grad_norm": 1.3750163316726685, "learning_rate": 1.342660762085889e-05, "loss": 0.5909, "num_input_tokens_seen": 53632888, "step": 92370 }, { "epoch": 13.75856419422103, "grad_norm": 1.192149043083191, "learning_rate": 1.3423727457876572e-05, "loss": 0.7453, "num_input_tokens_seen": 53635480, "step": 92375 }, { "epoch": 13.75930890676199, "grad_norm": 1.031899333000183, "learning_rate": 1.3420847490463614e-05, "loss": 0.5545, "num_input_tokens_seen": 53638776, "step": 92380 }, { "epoch": 13.76005361930295, "grad_norm": 2.0542244911193848, "learning_rate": 1.3417967718668672e-05, "loss": 0.6056, "num_input_tokens_seen": 53641592, "step": 92385 }, { "epoch": 13.760798331843908, "grad_norm": 2.044725179672241, "learning_rate": 1.3415088142540383e-05, "loss": 0.486, "num_input_tokens_seen": 53645048, "step": 92390 }, { "epoch": 13.761543044384867, "grad_norm": 2.0842480659484863, "learning_rate": 1.3412208762127415e-05, "loss": 0.6187, "num_input_tokens_seen": 53647928, "step": 92395 }, { "epoch": 13.762287756925826, "grad_norm": 2.1514346599578857, "learning_rate": 1.3409329577478391e-05, "loss": 0.6357, "num_input_tokens_seen": 53650776, "step": 92400 }, { "epoch": 13.763032469466786, "grad_norm": 2.088176965713501, "learning_rate": 1.3406450588641978e-05, "loss": 0.6703, "num_input_tokens_seen": 53653720, "step": 92405 }, { "epoch": 13.763777182007745, "grad_norm": 2.5407533645629883, "learning_rate": 1.3403571795666786e-05, "loss": 0.6157, "num_input_tokens_seen": 53656600, "step": 92410 }, { "epoch": 13.764521894548704, "grad_norm": 2.2392776012420654, "learning_rate": 1.3400693198601472e-05, "loss": 0.7002, "num_input_tokens_seen": 53659416, "step": 92415 }, { "epoch": 13.765266607089664, "grad_norm": 1.6584198474884033, "learning_rate": 1.339781479749466e-05, "loss": 0.602, "num_input_tokens_seen": 53662168, "step": 92420 }, { "epoch": 13.766011319630623, "grad_norm": 1.2559741735458374, "learning_rate": 1.3394936592394963e-05, "loss": 0.5486, "num_input_tokens_seen": 53665048, "step": 92425 }, { "epoch": 13.766756032171582, "grad_norm": 1.9175176620483398, "learning_rate": 1.3392058583351027e-05, "loss": 0.5259, "num_input_tokens_seen": 53667928, "step": 92430 }, { "epoch": 13.76750074471254, "grad_norm": 1.0389037132263184, "learning_rate": 1.3389180770411456e-05, "loss": 0.6389, "num_input_tokens_seen": 53671256, "step": 92435 }, { "epoch": 13.7682454572535, "grad_norm": 0.9207088351249695, "learning_rate": 1.3386303153624882e-05, "loss": 0.4663, "num_input_tokens_seen": 53673912, "step": 92440 }, { "epoch": 13.76899016979446, "grad_norm": 2.610729217529297, "learning_rate": 1.3383425733039914e-05, "loss": 0.6484, "num_input_tokens_seen": 53676632, "step": 92445 }, { "epoch": 13.769734882335419, "grad_norm": 1.8575485944747925, "learning_rate": 1.3380548508705162e-05, "loss": 0.5237, "num_input_tokens_seen": 53679256, "step": 92450 }, { "epoch": 13.770479594876377, "grad_norm": 1.7036851644515991, "learning_rate": 1.3377671480669235e-05, "loss": 0.7287, "num_input_tokens_seen": 53682168, "step": 92455 }, { "epoch": 13.771224307417336, "grad_norm": 1.2286417484283447, "learning_rate": 1.3374794648980721e-05, "loss": 0.5386, "num_input_tokens_seen": 53684952, "step": 92460 }, { "epoch": 13.771969019958297, "grad_norm": 1.0155748128890991, "learning_rate": 1.337191801368825e-05, "loss": 0.6662, "num_input_tokens_seen": 53687800, "step": 92465 }, { "epoch": 13.772713732499255, "grad_norm": 1.6820329427719116, "learning_rate": 1.3369041574840396e-05, "loss": 0.5537, "num_input_tokens_seen": 53690584, "step": 92470 }, { "epoch": 13.773458445040214, "grad_norm": 1.661049246788025, "learning_rate": 1.3366165332485772e-05, "loss": 0.7433, "num_input_tokens_seen": 53693496, "step": 92475 }, { "epoch": 13.774203157581173, "grad_norm": 1.1315555572509766, "learning_rate": 1.3363289286672952e-05, "loss": 0.6616, "num_input_tokens_seen": 53696216, "step": 92480 }, { "epoch": 13.774947870122134, "grad_norm": 1.6751848459243774, "learning_rate": 1.3360413437450542e-05, "loss": 0.6318, "num_input_tokens_seen": 53698872, "step": 92485 }, { "epoch": 13.775692582663092, "grad_norm": 1.4016019105911255, "learning_rate": 1.3357537784867105e-05, "loss": 0.5546, "num_input_tokens_seen": 53701688, "step": 92490 }, { "epoch": 13.776437295204051, "grad_norm": 2.4747400283813477, "learning_rate": 1.3354662328971246e-05, "loss": 0.6515, "num_input_tokens_seen": 53704664, "step": 92495 }, { "epoch": 13.77718200774501, "grad_norm": 3.245136260986328, "learning_rate": 1.3351787069811533e-05, "loss": 0.5148, "num_input_tokens_seen": 53707480, "step": 92500 }, { "epoch": 13.77792672028597, "grad_norm": 1.155205488204956, "learning_rate": 1.3348912007436537e-05, "loss": 0.702, "num_input_tokens_seen": 53710296, "step": 92505 }, { "epoch": 13.778671432826929, "grad_norm": 1.1712671518325806, "learning_rate": 1.3346037141894829e-05, "loss": 0.4753, "num_input_tokens_seen": 53713400, "step": 92510 }, { "epoch": 13.779416145367888, "grad_norm": 1.2536224126815796, "learning_rate": 1.3343162473234972e-05, "loss": 0.489, "num_input_tokens_seen": 53716120, "step": 92515 }, { "epoch": 13.780160857908847, "grad_norm": 1.179396629333496, "learning_rate": 1.3340288001505546e-05, "loss": 0.4285, "num_input_tokens_seen": 53718904, "step": 92520 }, { "epoch": 13.780905570449807, "grad_norm": 1.2315078973770142, "learning_rate": 1.3337413726755093e-05, "loss": 0.5296, "num_input_tokens_seen": 53721784, "step": 92525 }, { "epoch": 13.781650282990766, "grad_norm": 2.6016104221343994, "learning_rate": 1.3334539649032193e-05, "loss": 0.4699, "num_input_tokens_seen": 53724760, "step": 92530 }, { "epoch": 13.782394995531725, "grad_norm": 2.608426570892334, "learning_rate": 1.3331665768385387e-05, "loss": 0.6498, "num_input_tokens_seen": 53727672, "step": 92535 }, { "epoch": 13.783139708072683, "grad_norm": 1.2603622674942017, "learning_rate": 1.3328792084863223e-05, "loss": 0.5818, "num_input_tokens_seen": 53730360, "step": 92540 }, { "epoch": 13.783884420613644, "grad_norm": 1.6722397804260254, "learning_rate": 1.3325918598514265e-05, "loss": 0.4993, "num_input_tokens_seen": 53733048, "step": 92545 }, { "epoch": 13.784629133154603, "grad_norm": 1.6158925294876099, "learning_rate": 1.3323045309387033e-05, "loss": 0.5474, "num_input_tokens_seen": 53735896, "step": 92550 }, { "epoch": 13.785373845695561, "grad_norm": 2.1567535400390625, "learning_rate": 1.3320172217530094e-05, "loss": 0.6249, "num_input_tokens_seen": 53738808, "step": 92555 }, { "epoch": 13.78611855823652, "grad_norm": 0.9625620245933533, "learning_rate": 1.3317299322991966e-05, "loss": 0.539, "num_input_tokens_seen": 53741688, "step": 92560 }, { "epoch": 13.78686327077748, "grad_norm": 4.822132110595703, "learning_rate": 1.33144266258212e-05, "loss": 0.5933, "num_input_tokens_seen": 53744280, "step": 92565 }, { "epoch": 13.78760798331844, "grad_norm": 3.7742607593536377, "learning_rate": 1.3311554126066323e-05, "loss": 0.5837, "num_input_tokens_seen": 53747000, "step": 92570 }, { "epoch": 13.788352695859398, "grad_norm": 3.7456002235412598, "learning_rate": 1.3308681823775853e-05, "loss": 0.5998, "num_input_tokens_seen": 53749976, "step": 92575 }, { "epoch": 13.789097408400357, "grad_norm": 1.5053964853286743, "learning_rate": 1.3305809718998324e-05, "loss": 0.5312, "num_input_tokens_seen": 53752760, "step": 92580 }, { "epoch": 13.789842120941316, "grad_norm": 2.607839822769165, "learning_rate": 1.3302937811782249e-05, "loss": 0.9382, "num_input_tokens_seen": 53755704, "step": 92585 }, { "epoch": 13.790586833482276, "grad_norm": 1.4223737716674805, "learning_rate": 1.3300066102176157e-05, "loss": 0.5599, "num_input_tokens_seen": 53758840, "step": 92590 }, { "epoch": 13.791331546023235, "grad_norm": 1.1500645875930786, "learning_rate": 1.3297194590228545e-05, "loss": 0.5141, "num_input_tokens_seen": 53761624, "step": 92595 }, { "epoch": 13.792076258564194, "grad_norm": 1.8114346265792847, "learning_rate": 1.3294323275987953e-05, "loss": 0.4199, "num_input_tokens_seen": 53764504, "step": 92600 }, { "epoch": 13.792820971105153, "grad_norm": 3.2032599449157715, "learning_rate": 1.3291452159502853e-05, "loss": 0.6041, "num_input_tokens_seen": 53767224, "step": 92605 }, { "epoch": 13.793565683646113, "grad_norm": 1.243717908859253, "learning_rate": 1.3288581240821785e-05, "loss": 0.6606, "num_input_tokens_seen": 53770104, "step": 92610 }, { "epoch": 13.794310396187072, "grad_norm": 1.4516258239746094, "learning_rate": 1.3285710519993233e-05, "loss": 0.4181, "num_input_tokens_seen": 53772728, "step": 92615 }, { "epoch": 13.79505510872803, "grad_norm": 2.8445682525634766, "learning_rate": 1.3282839997065689e-05, "loss": 0.5966, "num_input_tokens_seen": 53776024, "step": 92620 }, { "epoch": 13.79579982126899, "grad_norm": 1.4036431312561035, "learning_rate": 1.327996967208766e-05, "loss": 0.5655, "num_input_tokens_seen": 53779064, "step": 92625 }, { "epoch": 13.79654453380995, "grad_norm": 1.816441535949707, "learning_rate": 1.3277099545107622e-05, "loss": 0.663, "num_input_tokens_seen": 53782200, "step": 92630 }, { "epoch": 13.797289246350909, "grad_norm": 1.3037270307540894, "learning_rate": 1.3274229616174084e-05, "loss": 0.6264, "num_input_tokens_seen": 53785464, "step": 92635 }, { "epoch": 13.798033958891867, "grad_norm": 1.4039483070373535, "learning_rate": 1.3271359885335515e-05, "loss": 0.6182, "num_input_tokens_seen": 53788536, "step": 92640 }, { "epoch": 13.798778671432826, "grad_norm": 1.5566641092300415, "learning_rate": 1.3268490352640405e-05, "loss": 0.5664, "num_input_tokens_seen": 53791512, "step": 92645 }, { "epoch": 13.799523383973787, "grad_norm": 1.550805926322937, "learning_rate": 1.3265621018137216e-05, "loss": 0.6374, "num_input_tokens_seen": 53794264, "step": 92650 }, { "epoch": 13.800268096514746, "grad_norm": 1.223897933959961, "learning_rate": 1.3262751881874443e-05, "loss": 0.5297, "num_input_tokens_seen": 53797176, "step": 92655 }, { "epoch": 13.801012809055704, "grad_norm": 2.5233192443847656, "learning_rate": 1.3259882943900547e-05, "loss": 0.5881, "num_input_tokens_seen": 53799928, "step": 92660 }, { "epoch": 13.801757521596663, "grad_norm": 1.6607273817062378, "learning_rate": 1.325701420426399e-05, "loss": 0.6584, "num_input_tokens_seen": 53803096, "step": 92665 }, { "epoch": 13.802502234137624, "grad_norm": 1.6108288764953613, "learning_rate": 1.3254145663013251e-05, "loss": 0.849, "num_input_tokens_seen": 53805880, "step": 92670 }, { "epoch": 13.803246946678582, "grad_norm": 1.3662185668945312, "learning_rate": 1.3251277320196772e-05, "loss": 0.5764, "num_input_tokens_seen": 53808824, "step": 92675 }, { "epoch": 13.803991659219541, "grad_norm": 1.7093896865844727, "learning_rate": 1.3248409175863033e-05, "loss": 0.6602, "num_input_tokens_seen": 53811640, "step": 92680 }, { "epoch": 13.8047363717605, "grad_norm": 1.7493205070495605, "learning_rate": 1.3245541230060465e-05, "loss": 0.7515, "num_input_tokens_seen": 53814616, "step": 92685 }, { "epoch": 13.80548108430146, "grad_norm": 1.123278260231018, "learning_rate": 1.3242673482837544e-05, "loss": 0.6232, "num_input_tokens_seen": 53817624, "step": 92690 }, { "epoch": 13.80622579684242, "grad_norm": 1.5673953294754028, "learning_rate": 1.3239805934242704e-05, "loss": 0.8288, "num_input_tokens_seen": 53820440, "step": 92695 }, { "epoch": 13.806970509383378, "grad_norm": 1.4577468633651733, "learning_rate": 1.3236938584324382e-05, "loss": 0.6463, "num_input_tokens_seen": 53823288, "step": 92700 }, { "epoch": 13.807715221924337, "grad_norm": 2.8975236415863037, "learning_rate": 1.3234071433131034e-05, "loss": 0.3473, "num_input_tokens_seen": 53826424, "step": 92705 }, { "epoch": 13.808459934465297, "grad_norm": 1.4338041543960571, "learning_rate": 1.323120448071109e-05, "loss": 0.5924, "num_input_tokens_seen": 53829208, "step": 92710 }, { "epoch": 13.809204647006256, "grad_norm": 1.7763416767120361, "learning_rate": 1.3228337727112988e-05, "loss": 0.4448, "num_input_tokens_seen": 53831864, "step": 92715 }, { "epoch": 13.809949359547215, "grad_norm": 0.1318308413028717, "learning_rate": 1.3225471172385145e-05, "loss": 0.5985, "num_input_tokens_seen": 53835096, "step": 92720 }, { "epoch": 13.810694072088173, "grad_norm": 1.4026113748550415, "learning_rate": 1.3222604816576011e-05, "loss": 0.4013, "num_input_tokens_seen": 53838104, "step": 92725 }, { "epoch": 13.811438784629132, "grad_norm": 2.4061450958251953, "learning_rate": 1.3219738659733988e-05, "loss": 0.653, "num_input_tokens_seen": 53840728, "step": 92730 }, { "epoch": 13.812183497170093, "grad_norm": 2.200745105743408, "learning_rate": 1.3216872701907515e-05, "loss": 0.694, "num_input_tokens_seen": 53843512, "step": 92735 }, { "epoch": 13.812928209711052, "grad_norm": 1.3096014261245728, "learning_rate": 1.3214006943145002e-05, "loss": 0.5928, "num_input_tokens_seen": 53846648, "step": 92740 }, { "epoch": 13.81367292225201, "grad_norm": 1.8039281368255615, "learning_rate": 1.3211141383494856e-05, "loss": 0.6255, "num_input_tokens_seen": 53849496, "step": 92745 }, { "epoch": 13.81441763479297, "grad_norm": 1.6050320863723755, "learning_rate": 1.32082760230055e-05, "loss": 0.4404, "num_input_tokens_seen": 53852216, "step": 92750 }, { "epoch": 13.81516234733393, "grad_norm": 1.3273704051971436, "learning_rate": 1.3205410861725331e-05, "loss": 0.6151, "num_input_tokens_seen": 53855160, "step": 92755 }, { "epoch": 13.815907059874888, "grad_norm": 3.1900477409362793, "learning_rate": 1.3202545899702768e-05, "loss": 0.6847, "num_input_tokens_seen": 53857784, "step": 92760 }, { "epoch": 13.816651772415847, "grad_norm": 1.158642053604126, "learning_rate": 1.3199681136986186e-05, "loss": 0.4644, "num_input_tokens_seen": 53860760, "step": 92765 }, { "epoch": 13.817396484956806, "grad_norm": 2.754246234893799, "learning_rate": 1.3196816573624013e-05, "loss": 0.576, "num_input_tokens_seen": 53864088, "step": 92770 }, { "epoch": 13.818141197497766, "grad_norm": 2.4276607036590576, "learning_rate": 1.3193952209664625e-05, "loss": 0.6364, "num_input_tokens_seen": 53867096, "step": 92775 }, { "epoch": 13.818885910038725, "grad_norm": 2.9495670795440674, "learning_rate": 1.319108804515642e-05, "loss": 0.6764, "num_input_tokens_seen": 53869752, "step": 92780 }, { "epoch": 13.819630622579684, "grad_norm": 3.5765633583068848, "learning_rate": 1.3188224080147776e-05, "loss": 0.5966, "num_input_tokens_seen": 53872760, "step": 92785 }, { "epoch": 13.820375335120643, "grad_norm": 2.0063493251800537, "learning_rate": 1.318536031468707e-05, "loss": 0.6228, "num_input_tokens_seen": 53875544, "step": 92790 }, { "epoch": 13.821120047661603, "grad_norm": 1.1604348421096802, "learning_rate": 1.3182496748822706e-05, "loss": 0.5964, "num_input_tokens_seen": 53878424, "step": 92795 }, { "epoch": 13.821864760202562, "grad_norm": 0.9595988988876343, "learning_rate": 1.3179633382603041e-05, "loss": 0.6996, "num_input_tokens_seen": 53881240, "step": 92800 }, { "epoch": 13.82260947274352, "grad_norm": 2.1948583126068115, "learning_rate": 1.3176770216076462e-05, "loss": 0.5371, "num_input_tokens_seen": 53884216, "step": 92805 }, { "epoch": 13.82335418528448, "grad_norm": 1.8356539011001587, "learning_rate": 1.3173907249291326e-05, "loss": 0.5998, "num_input_tokens_seen": 53887160, "step": 92810 }, { "epoch": 13.82409889782544, "grad_norm": 1.5945814847946167, "learning_rate": 1.3171044482296017e-05, "loss": 0.7767, "num_input_tokens_seen": 53890680, "step": 92815 }, { "epoch": 13.824843610366399, "grad_norm": 1.0263547897338867, "learning_rate": 1.3168181915138889e-05, "loss": 0.4365, "num_input_tokens_seen": 53893688, "step": 92820 }, { "epoch": 13.825588322907358, "grad_norm": 1.6542439460754395, "learning_rate": 1.316531954786829e-05, "loss": 0.5897, "num_input_tokens_seen": 53896920, "step": 92825 }, { "epoch": 13.826333035448316, "grad_norm": 1.2830199003219604, "learning_rate": 1.31624573805326e-05, "loss": 0.5702, "num_input_tokens_seen": 53899896, "step": 92830 }, { "epoch": 13.827077747989277, "grad_norm": 1.6006897687911987, "learning_rate": 1.3159595413180164e-05, "loss": 0.6025, "num_input_tokens_seen": 53902744, "step": 92835 }, { "epoch": 13.827822460530236, "grad_norm": 1.3345884084701538, "learning_rate": 1.3156733645859328e-05, "loss": 0.6195, "num_input_tokens_seen": 53905784, "step": 92840 }, { "epoch": 13.828567173071194, "grad_norm": 1.3298629522323608, "learning_rate": 1.3153872078618428e-05, "loss": 0.8554, "num_input_tokens_seen": 53908760, "step": 92845 }, { "epoch": 13.829311885612153, "grad_norm": 2.178875207901001, "learning_rate": 1.3151010711505835e-05, "loss": 0.7808, "num_input_tokens_seen": 53911736, "step": 92850 }, { "epoch": 13.830056598153114, "grad_norm": 1.8728233575820923, "learning_rate": 1.3148149544569868e-05, "loss": 0.5884, "num_input_tokens_seen": 53914552, "step": 92855 }, { "epoch": 13.830801310694072, "grad_norm": 4.775575637817383, "learning_rate": 1.3145288577858861e-05, "loss": 0.8021, "num_input_tokens_seen": 53917400, "step": 92860 }, { "epoch": 13.831546023235031, "grad_norm": 1.892401933670044, "learning_rate": 1.3142427811421165e-05, "loss": 0.7156, "num_input_tokens_seen": 53920440, "step": 92865 }, { "epoch": 13.83229073577599, "grad_norm": 1.6147347688674927, "learning_rate": 1.313956724530509e-05, "loss": 0.6951, "num_input_tokens_seen": 53923640, "step": 92870 }, { "epoch": 13.83303544831695, "grad_norm": 3.4828641414642334, "learning_rate": 1.3136706879558979e-05, "loss": 0.9222, "num_input_tokens_seen": 53926584, "step": 92875 }, { "epoch": 13.83378016085791, "grad_norm": 2.5643651485443115, "learning_rate": 1.3133846714231141e-05, "loss": 0.7385, "num_input_tokens_seen": 53929432, "step": 92880 }, { "epoch": 13.834524873398868, "grad_norm": 0.8846163749694824, "learning_rate": 1.3130986749369911e-05, "loss": 0.5788, "num_input_tokens_seen": 53932248, "step": 92885 }, { "epoch": 13.835269585939827, "grad_norm": 1.7653053998947144, "learning_rate": 1.3128126985023586e-05, "loss": 0.6353, "num_input_tokens_seen": 53935064, "step": 92890 }, { "epoch": 13.836014298480787, "grad_norm": 0.9462071061134338, "learning_rate": 1.3125267421240504e-05, "loss": 0.5491, "num_input_tokens_seen": 53938232, "step": 92895 }, { "epoch": 13.836759011021746, "grad_norm": 1.66511869430542, "learning_rate": 1.3122408058068955e-05, "loss": 0.6786, "num_input_tokens_seen": 53941176, "step": 92900 }, { "epoch": 13.837503723562705, "grad_norm": 2.1099112033843994, "learning_rate": 1.3119548895557252e-05, "loss": 0.6869, "num_input_tokens_seen": 53944120, "step": 92905 }, { "epoch": 13.838248436103664, "grad_norm": 1.2229900360107422, "learning_rate": 1.3116689933753696e-05, "loss": 0.7282, "num_input_tokens_seen": 53946968, "step": 92910 }, { "epoch": 13.838993148644622, "grad_norm": 1.5855770111083984, "learning_rate": 1.3113831172706575e-05, "loss": 0.4983, "num_input_tokens_seen": 53950008, "step": 92915 }, { "epoch": 13.839737861185583, "grad_norm": 2.2433574199676514, "learning_rate": 1.3110972612464207e-05, "loss": 0.6269, "num_input_tokens_seen": 53953144, "step": 92920 }, { "epoch": 13.840482573726542, "grad_norm": 2.9344875812530518, "learning_rate": 1.310811425307486e-05, "loss": 0.7134, "num_input_tokens_seen": 53956312, "step": 92925 }, { "epoch": 13.8412272862675, "grad_norm": 1.1198220252990723, "learning_rate": 1.3105256094586849e-05, "loss": 0.576, "num_input_tokens_seen": 53959448, "step": 92930 }, { "epoch": 13.84197199880846, "grad_norm": 1.412773847579956, "learning_rate": 1.310239813704845e-05, "loss": 0.7037, "num_input_tokens_seen": 53962488, "step": 92935 }, { "epoch": 13.84271671134942, "grad_norm": 1.4007813930511475, "learning_rate": 1.3099540380507927e-05, "loss": 0.6187, "num_input_tokens_seen": 53965592, "step": 92940 }, { "epoch": 13.843461423890378, "grad_norm": 2.0930025577545166, "learning_rate": 1.3096682825013584e-05, "loss": 0.6113, "num_input_tokens_seen": 53968312, "step": 92945 }, { "epoch": 13.844206136431337, "grad_norm": 1.2361189126968384, "learning_rate": 1.309382547061368e-05, "loss": 0.606, "num_input_tokens_seen": 53971192, "step": 92950 }, { "epoch": 13.844950848972296, "grad_norm": 1.2822529077529907, "learning_rate": 1.3090968317356502e-05, "loss": 0.5472, "num_input_tokens_seen": 53974008, "step": 92955 }, { "epoch": 13.845695561513256, "grad_norm": 1.5672439336776733, "learning_rate": 1.3088111365290302e-05, "loss": 0.6444, "num_input_tokens_seen": 53976696, "step": 92960 }, { "epoch": 13.846440274054215, "grad_norm": 1.3697528839111328, "learning_rate": 1.3085254614463362e-05, "loss": 0.7598, "num_input_tokens_seen": 53979480, "step": 92965 }, { "epoch": 13.847184986595174, "grad_norm": 1.8560644388198853, "learning_rate": 1.308239806492394e-05, "loss": 0.4721, "num_input_tokens_seen": 53982360, "step": 92970 }, { "epoch": 13.847929699136133, "grad_norm": 3.2937445640563965, "learning_rate": 1.3079541716720284e-05, "loss": 0.7704, "num_input_tokens_seen": 53984952, "step": 92975 }, { "epoch": 13.848674411677093, "grad_norm": 1.2869693040847778, "learning_rate": 1.307668556990066e-05, "loss": 0.5685, "num_input_tokens_seen": 53987576, "step": 92980 }, { "epoch": 13.849419124218052, "grad_norm": 1.87758207321167, "learning_rate": 1.30738296245133e-05, "loss": 0.7131, "num_input_tokens_seen": 53990136, "step": 92985 }, { "epoch": 13.85016383675901, "grad_norm": 1.3176060914993286, "learning_rate": 1.3070973880606482e-05, "loss": 0.5612, "num_input_tokens_seen": 53993304, "step": 92990 }, { "epoch": 13.85090854929997, "grad_norm": 1.7524958848953247, "learning_rate": 1.3068118338228425e-05, "loss": 0.5846, "num_input_tokens_seen": 53996440, "step": 92995 }, { "epoch": 13.85165326184093, "grad_norm": 0.9191597104072571, "learning_rate": 1.306526299742739e-05, "loss": 0.6111, "num_input_tokens_seen": 53999224, "step": 93000 }, { "epoch": 13.852397974381889, "grad_norm": 1.3028541803359985, "learning_rate": 1.3062407858251598e-05, "loss": 0.6062, "num_input_tokens_seen": 54002264, "step": 93005 }, { "epoch": 13.853142686922848, "grad_norm": 2.1284677982330322, "learning_rate": 1.3059552920749301e-05, "loss": 0.486, "num_input_tokens_seen": 54004920, "step": 93010 }, { "epoch": 13.853887399463806, "grad_norm": 1.498666763305664, "learning_rate": 1.3056698184968714e-05, "loss": 0.6427, "num_input_tokens_seen": 54007704, "step": 93015 }, { "epoch": 13.854632112004767, "grad_norm": 1.5592812299728394, "learning_rate": 1.305384365095808e-05, "loss": 0.5373, "num_input_tokens_seen": 54010552, "step": 93020 }, { "epoch": 13.855376824545726, "grad_norm": 0.9047430753707886, "learning_rate": 1.305098931876562e-05, "loss": 0.5628, "num_input_tokens_seen": 54013304, "step": 93025 }, { "epoch": 13.856121537086684, "grad_norm": 0.9106345772743225, "learning_rate": 1.3048135188439537e-05, "loss": 0.5425, "num_input_tokens_seen": 54016184, "step": 93030 }, { "epoch": 13.856866249627643, "grad_norm": 1.6479089260101318, "learning_rate": 1.3045281260028075e-05, "loss": 0.6459, "num_input_tokens_seen": 54019128, "step": 93035 }, { "epoch": 13.857610962168604, "grad_norm": 1.6786117553710938, "learning_rate": 1.3042427533579435e-05, "loss": 0.818, "num_input_tokens_seen": 54022072, "step": 93040 }, { "epoch": 13.858355674709562, "grad_norm": 1.1482518911361694, "learning_rate": 1.303957400914183e-05, "loss": 0.5305, "num_input_tokens_seen": 54025048, "step": 93045 }, { "epoch": 13.859100387250521, "grad_norm": 1.0094720125198364, "learning_rate": 1.3036720686763454e-05, "loss": 0.6368, "num_input_tokens_seen": 54027928, "step": 93050 }, { "epoch": 13.85984509979148, "grad_norm": 1.159371256828308, "learning_rate": 1.3033867566492534e-05, "loss": 0.5755, "num_input_tokens_seen": 54030936, "step": 93055 }, { "epoch": 13.86058981233244, "grad_norm": 1.5869404077529907, "learning_rate": 1.303101464837726e-05, "loss": 0.6, "num_input_tokens_seen": 54034072, "step": 93060 }, { "epoch": 13.8613345248734, "grad_norm": 2.2351937294006348, "learning_rate": 1.3028161932465815e-05, "loss": 0.6013, "num_input_tokens_seen": 54037144, "step": 93065 }, { "epoch": 13.862079237414358, "grad_norm": 0.803976833820343, "learning_rate": 1.3025309418806422e-05, "loss": 0.5841, "num_input_tokens_seen": 54040056, "step": 93070 }, { "epoch": 13.862823949955317, "grad_norm": 1.2631659507751465, "learning_rate": 1.3022457107447244e-05, "loss": 0.6286, "num_input_tokens_seen": 54042904, "step": 93075 }, { "epoch": 13.863568662496277, "grad_norm": 2.7877097129821777, "learning_rate": 1.3019604998436491e-05, "loss": 0.624, "num_input_tokens_seen": 54045816, "step": 93080 }, { "epoch": 13.864313375037236, "grad_norm": 0.9273370504379272, "learning_rate": 1.301675309182232e-05, "loss": 0.6712, "num_input_tokens_seen": 54048280, "step": 93085 }, { "epoch": 13.865058087578195, "grad_norm": 1.046212077140808, "learning_rate": 1.3013901387652941e-05, "loss": 0.668, "num_input_tokens_seen": 54051224, "step": 93090 }, { "epoch": 13.865802800119154, "grad_norm": 1.8355517387390137, "learning_rate": 1.3011049885976505e-05, "loss": 0.6976, "num_input_tokens_seen": 54054200, "step": 93095 }, { "epoch": 13.866547512660112, "grad_norm": 2.316434144973755, "learning_rate": 1.3008198586841209e-05, "loss": 0.5494, "num_input_tokens_seen": 54056984, "step": 93100 }, { "epoch": 13.867292225201073, "grad_norm": 1.232043981552124, "learning_rate": 1.3005347490295205e-05, "loss": 0.4995, "num_input_tokens_seen": 54059640, "step": 93105 }, { "epoch": 13.868036937742032, "grad_norm": 1.3692182302474976, "learning_rate": 1.3002496596386666e-05, "loss": 0.4529, "num_input_tokens_seen": 54062264, "step": 93110 }, { "epoch": 13.86878165028299, "grad_norm": 3.613938093185425, "learning_rate": 1.2999645905163754e-05, "loss": 0.7342, "num_input_tokens_seen": 54065016, "step": 93115 }, { "epoch": 13.86952636282395, "grad_norm": 1.1366366147994995, "learning_rate": 1.2996795416674618e-05, "loss": 0.5538, "num_input_tokens_seen": 54067928, "step": 93120 }, { "epoch": 13.87027107536491, "grad_norm": 1.5806025266647339, "learning_rate": 1.2993945130967434e-05, "loss": 0.6261, "num_input_tokens_seen": 54070840, "step": 93125 }, { "epoch": 13.871015787905868, "grad_norm": 3.2076337337493896, "learning_rate": 1.2991095048090333e-05, "loss": 0.5676, "num_input_tokens_seen": 54073496, "step": 93130 }, { "epoch": 13.871760500446827, "grad_norm": 2.061556339263916, "learning_rate": 1.2988245168091485e-05, "loss": 0.4919, "num_input_tokens_seen": 54076408, "step": 93135 }, { "epoch": 13.872505212987786, "grad_norm": 2.286775827407837, "learning_rate": 1.2985395491019029e-05, "loss": 0.7713, "num_input_tokens_seen": 54079096, "step": 93140 }, { "epoch": 13.873249925528746, "grad_norm": 1.5569851398468018, "learning_rate": 1.2982546016921093e-05, "loss": 0.6624, "num_input_tokens_seen": 54081848, "step": 93145 }, { "epoch": 13.873994638069705, "grad_norm": 1.6184709072113037, "learning_rate": 1.297969674584584e-05, "loss": 0.5331, "num_input_tokens_seen": 54084984, "step": 93150 }, { "epoch": 13.874739350610664, "grad_norm": 1.830675721168518, "learning_rate": 1.2976847677841383e-05, "loss": 0.7071, "num_input_tokens_seen": 54087736, "step": 93155 }, { "epoch": 13.875484063151623, "grad_norm": 1.854125738143921, "learning_rate": 1.2973998812955876e-05, "loss": 0.6735, "num_input_tokens_seen": 54090456, "step": 93160 }, { "epoch": 13.876228775692583, "grad_norm": 2.2103679180145264, "learning_rate": 1.2971150151237435e-05, "loss": 0.5412, "num_input_tokens_seen": 54093336, "step": 93165 }, { "epoch": 13.876973488233542, "grad_norm": 1.1192851066589355, "learning_rate": 1.2968301692734187e-05, "loss": 0.6182, "num_input_tokens_seen": 54096376, "step": 93170 }, { "epoch": 13.8777182007745, "grad_norm": 1.1961158514022827, "learning_rate": 1.2965453437494243e-05, "loss": 0.5111, "num_input_tokens_seen": 54099128, "step": 93175 }, { "epoch": 13.87846291331546, "grad_norm": 1.0095051527023315, "learning_rate": 1.296260538556574e-05, "loss": 0.5278, "num_input_tokens_seen": 54102200, "step": 93180 }, { "epoch": 13.87920762585642, "grad_norm": 1.0344127416610718, "learning_rate": 1.295975753699679e-05, "loss": 0.5696, "num_input_tokens_seen": 54104984, "step": 93185 }, { "epoch": 13.879952338397379, "grad_norm": 0.8454002737998962, "learning_rate": 1.2956909891835484e-05, "loss": 0.5169, "num_input_tokens_seen": 54107704, "step": 93190 }, { "epoch": 13.880697050938338, "grad_norm": 1.6542898416519165, "learning_rate": 1.2954062450129959e-05, "loss": 0.5416, "num_input_tokens_seen": 54110680, "step": 93195 }, { "epoch": 13.881441763479296, "grad_norm": 1.8453963994979858, "learning_rate": 1.2951215211928292e-05, "loss": 0.6712, "num_input_tokens_seen": 54113656, "step": 93200 }, { "epoch": 13.882186476020257, "grad_norm": 1.616041660308838, "learning_rate": 1.2948368177278614e-05, "loss": 0.6365, "num_input_tokens_seen": 54116760, "step": 93205 }, { "epoch": 13.882931188561216, "grad_norm": 1.014564871788025, "learning_rate": 1.2945521346228989e-05, "loss": 0.5302, "num_input_tokens_seen": 54119768, "step": 93210 }, { "epoch": 13.883675901102174, "grad_norm": 1.36509108543396, "learning_rate": 1.2942674718827546e-05, "loss": 0.4812, "num_input_tokens_seen": 54122712, "step": 93215 }, { "epoch": 13.884420613643133, "grad_norm": 2.135901927947998, "learning_rate": 1.2939828295122358e-05, "loss": 0.4392, "num_input_tokens_seen": 54125464, "step": 93220 }, { "epoch": 13.885165326184094, "grad_norm": 1.0883188247680664, "learning_rate": 1.2936982075161502e-05, "loss": 0.3721, "num_input_tokens_seen": 54128376, "step": 93225 }, { "epoch": 13.885910038725052, "grad_norm": 1.9813309907913208, "learning_rate": 1.2934136058993082e-05, "loss": 0.5762, "num_input_tokens_seen": 54131512, "step": 93230 }, { "epoch": 13.886654751266011, "grad_norm": 1.6619317531585693, "learning_rate": 1.2931290246665173e-05, "loss": 0.5082, "num_input_tokens_seen": 54134648, "step": 93235 }, { "epoch": 13.88739946380697, "grad_norm": 1.6867340803146362, "learning_rate": 1.2928444638225848e-05, "loss": 0.655, "num_input_tokens_seen": 54137848, "step": 93240 }, { "epoch": 13.88814417634793, "grad_norm": 1.2306618690490723, "learning_rate": 1.2925599233723174e-05, "loss": 0.5658, "num_input_tokens_seen": 54140568, "step": 93245 }, { "epoch": 13.88888888888889, "grad_norm": 1.0783002376556396, "learning_rate": 1.2922754033205237e-05, "loss": 0.8069, "num_input_tokens_seen": 54143736, "step": 93250 }, { "epoch": 13.889633601429848, "grad_norm": 1.1486490964889526, "learning_rate": 1.2919909036720085e-05, "loss": 0.5921, "num_input_tokens_seen": 54146488, "step": 93255 }, { "epoch": 13.890378313970807, "grad_norm": 1.3119770288467407, "learning_rate": 1.2917064244315802e-05, "loss": 0.5448, "num_input_tokens_seen": 54149368, "step": 93260 }, { "epoch": 13.891123026511767, "grad_norm": 1.5242823362350464, "learning_rate": 1.2914219656040437e-05, "loss": 0.5376, "num_input_tokens_seen": 54152312, "step": 93265 }, { "epoch": 13.891867739052726, "grad_norm": 1.341362476348877, "learning_rate": 1.2911375271942042e-05, "loss": 0.6073, "num_input_tokens_seen": 54155032, "step": 93270 }, { "epoch": 13.892612451593685, "grad_norm": 2.831634998321533, "learning_rate": 1.2908531092068682e-05, "loss": 0.7539, "num_input_tokens_seen": 54158104, "step": 93275 }, { "epoch": 13.893357164134644, "grad_norm": 2.909205913543701, "learning_rate": 1.290568711646839e-05, "loss": 0.7032, "num_input_tokens_seen": 54160728, "step": 93280 }, { "epoch": 13.894101876675602, "grad_norm": 2.1225826740264893, "learning_rate": 1.2902843345189237e-05, "loss": 0.6771, "num_input_tokens_seen": 54163672, "step": 93285 }, { "epoch": 13.894846589216563, "grad_norm": 1.076459527015686, "learning_rate": 1.2899999778279235e-05, "loss": 0.5605, "num_input_tokens_seen": 54166520, "step": 93290 }, { "epoch": 13.895591301757522, "grad_norm": 2.3498787879943848, "learning_rate": 1.289715641578645e-05, "loss": 0.7785, "num_input_tokens_seen": 54169432, "step": 93295 }, { "epoch": 13.89633601429848, "grad_norm": 1.1781126260757446, "learning_rate": 1.2894313257758906e-05, "loss": 0.5687, "num_input_tokens_seen": 54172472, "step": 93300 }, { "epoch": 13.89708072683944, "grad_norm": 2.5864455699920654, "learning_rate": 1.2891470304244638e-05, "loss": 0.654, "num_input_tokens_seen": 54175736, "step": 93305 }, { "epoch": 13.8978254393804, "grad_norm": 1.6805256605148315, "learning_rate": 1.288862755529167e-05, "loss": 0.6441, "num_input_tokens_seen": 54178680, "step": 93310 }, { "epoch": 13.898570151921358, "grad_norm": 1.306048035621643, "learning_rate": 1.2885785010948023e-05, "loss": 0.719, "num_input_tokens_seen": 54181784, "step": 93315 }, { "epoch": 13.899314864462317, "grad_norm": 1.6095218658447266, "learning_rate": 1.2882942671261733e-05, "loss": 0.5667, "num_input_tokens_seen": 54184664, "step": 93320 }, { "epoch": 13.900059577003276, "grad_norm": 2.587549924850464, "learning_rate": 1.2880100536280803e-05, "loss": 0.5456, "num_input_tokens_seen": 54187480, "step": 93325 }, { "epoch": 13.900804289544237, "grad_norm": 0.9844385385513306, "learning_rate": 1.2877258606053266e-05, "loss": 0.4466, "num_input_tokens_seen": 54190520, "step": 93330 }, { "epoch": 13.901549002085195, "grad_norm": 1.7839343547821045, "learning_rate": 1.2874416880627116e-05, "loss": 0.5893, "num_input_tokens_seen": 54193304, "step": 93335 }, { "epoch": 13.902293714626154, "grad_norm": 1.6345112323760986, "learning_rate": 1.2871575360050376e-05, "loss": 0.6106, "num_input_tokens_seen": 54196088, "step": 93340 }, { "epoch": 13.903038427167113, "grad_norm": 1.4657036066055298, "learning_rate": 1.2868734044371044e-05, "loss": 0.5638, "num_input_tokens_seen": 54198904, "step": 93345 }, { "epoch": 13.903783139708073, "grad_norm": 0.8774659037590027, "learning_rate": 1.2865892933637114e-05, "loss": 0.6052, "num_input_tokens_seen": 54201752, "step": 93350 }, { "epoch": 13.904527852249032, "grad_norm": 1.2906376123428345, "learning_rate": 1.2863052027896597e-05, "loss": 0.6156, "num_input_tokens_seen": 54204696, "step": 93355 }, { "epoch": 13.90527256478999, "grad_norm": 2.0776937007904053, "learning_rate": 1.2860211327197468e-05, "loss": 0.5118, "num_input_tokens_seen": 54207640, "step": 93360 }, { "epoch": 13.90601727733095, "grad_norm": 1.1609280109405518, "learning_rate": 1.2857370831587745e-05, "loss": 0.5804, "num_input_tokens_seen": 54210584, "step": 93365 }, { "epoch": 13.90676198987191, "grad_norm": 3.4776206016540527, "learning_rate": 1.28545305411154e-05, "loss": 0.7194, "num_input_tokens_seen": 54213432, "step": 93370 }, { "epoch": 13.907506702412869, "grad_norm": 1.3329936265945435, "learning_rate": 1.2851690455828414e-05, "loss": 0.4765, "num_input_tokens_seen": 54215992, "step": 93375 }, { "epoch": 13.908251414953828, "grad_norm": 1.9655190706253052, "learning_rate": 1.2848850575774774e-05, "loss": 0.5746, "num_input_tokens_seen": 54218712, "step": 93380 }, { "epoch": 13.908996127494786, "grad_norm": 2.4215097427368164, "learning_rate": 1.2846010901002442e-05, "loss": 0.6445, "num_input_tokens_seen": 54221592, "step": 93385 }, { "epoch": 13.909740840035747, "grad_norm": 3.7439799308776855, "learning_rate": 1.2843171431559414e-05, "loss": 0.7338, "num_input_tokens_seen": 54224248, "step": 93390 }, { "epoch": 13.910485552576706, "grad_norm": 1.7242218255996704, "learning_rate": 1.284033216749364e-05, "loss": 0.6905, "num_input_tokens_seen": 54227352, "step": 93395 }, { "epoch": 13.911230265117664, "grad_norm": 2.2916481494903564, "learning_rate": 1.2837493108853105e-05, "loss": 0.5328, "num_input_tokens_seen": 54230680, "step": 93400 }, { "epoch": 13.911974977658623, "grad_norm": 1.8247325420379639, "learning_rate": 1.2834654255685752e-05, "loss": 0.7436, "num_input_tokens_seen": 54233400, "step": 93405 }, { "epoch": 13.912719690199584, "grad_norm": 0.9829142093658447, "learning_rate": 1.283181560803956e-05, "loss": 0.6029, "num_input_tokens_seen": 54236120, "step": 93410 }, { "epoch": 13.913464402740543, "grad_norm": 2.136756420135498, "learning_rate": 1.282897716596247e-05, "loss": 0.5805, "num_input_tokens_seen": 54239000, "step": 93415 }, { "epoch": 13.914209115281501, "grad_norm": 2.001190423965454, "learning_rate": 1.2826138929502446e-05, "loss": 0.7101, "num_input_tokens_seen": 54241976, "step": 93420 }, { "epoch": 13.91495382782246, "grad_norm": 2.1873257160186768, "learning_rate": 1.2823300898707432e-05, "loss": 0.5258, "num_input_tokens_seen": 54245208, "step": 93425 }, { "epoch": 13.915698540363419, "grad_norm": 2.5360753536224365, "learning_rate": 1.2820463073625367e-05, "loss": 0.627, "num_input_tokens_seen": 54248056, "step": 93430 }, { "epoch": 13.91644325290438, "grad_norm": 1.231781244277954, "learning_rate": 1.2817625454304204e-05, "loss": 0.6452, "num_input_tokens_seen": 54250840, "step": 93435 }, { "epoch": 13.917187965445338, "grad_norm": 2.0883371829986572, "learning_rate": 1.281478804079188e-05, "loss": 0.6688, "num_input_tokens_seen": 54253912, "step": 93440 }, { "epoch": 13.917932677986297, "grad_norm": 1.8366378545761108, "learning_rate": 1.2811950833136332e-05, "loss": 0.6367, "num_input_tokens_seen": 54256728, "step": 93445 }, { "epoch": 13.918677390527257, "grad_norm": 1.6615455150604248, "learning_rate": 1.2809113831385472e-05, "loss": 0.8547, "num_input_tokens_seen": 54259416, "step": 93450 }, { "epoch": 13.919422103068216, "grad_norm": 1.7258529663085938, "learning_rate": 1.2806277035587256e-05, "loss": 0.6244, "num_input_tokens_seen": 54262232, "step": 93455 }, { "epoch": 13.920166815609175, "grad_norm": 1.5445681810379028, "learning_rate": 1.2803440445789594e-05, "loss": 0.5587, "num_input_tokens_seen": 54265272, "step": 93460 }, { "epoch": 13.920911528150134, "grad_norm": 2.5877037048339844, "learning_rate": 1.2800604062040403e-05, "loss": 0.5318, "num_input_tokens_seen": 54268408, "step": 93465 }, { "epoch": 13.921656240691092, "grad_norm": 1.5966367721557617, "learning_rate": 1.2797767884387615e-05, "loss": 0.4208, "num_input_tokens_seen": 54271288, "step": 93470 }, { "epoch": 13.922400953232053, "grad_norm": 1.6896610260009766, "learning_rate": 1.2794931912879127e-05, "loss": 0.7587, "num_input_tokens_seen": 54274104, "step": 93475 }, { "epoch": 13.923145665773012, "grad_norm": 2.7955381870269775, "learning_rate": 1.2792096147562872e-05, "loss": 0.7393, "num_input_tokens_seen": 54276920, "step": 93480 }, { "epoch": 13.92389037831397, "grad_norm": 1.2731341123580933, "learning_rate": 1.2789260588486735e-05, "loss": 0.5712, "num_input_tokens_seen": 54279704, "step": 93485 }, { "epoch": 13.92463509085493, "grad_norm": 2.248046636581421, "learning_rate": 1.2786425235698634e-05, "loss": 0.5335, "num_input_tokens_seen": 54282616, "step": 93490 }, { "epoch": 13.92537980339589, "grad_norm": 1.327226996421814, "learning_rate": 1.2783590089246473e-05, "loss": 0.4097, "num_input_tokens_seen": 54285336, "step": 93495 }, { "epoch": 13.926124515936849, "grad_norm": 1.2422153949737549, "learning_rate": 1.2780755149178136e-05, "loss": 0.552, "num_input_tokens_seen": 54288312, "step": 93500 }, { "epoch": 13.926869228477807, "grad_norm": 1.5318107604980469, "learning_rate": 1.2777920415541514e-05, "loss": 0.6658, "num_input_tokens_seen": 54291512, "step": 93505 }, { "epoch": 13.927613941018766, "grad_norm": 1.12339186668396, "learning_rate": 1.2775085888384514e-05, "loss": 0.5188, "num_input_tokens_seen": 54294200, "step": 93510 }, { "epoch": 13.928358653559727, "grad_norm": 1.005289077758789, "learning_rate": 1.2772251567755011e-05, "loss": 0.4035, "num_input_tokens_seen": 54296600, "step": 93515 }, { "epoch": 13.929103366100685, "grad_norm": 1.4077401161193848, "learning_rate": 1.2769417453700882e-05, "loss": 0.5977, "num_input_tokens_seen": 54299320, "step": 93520 }, { "epoch": 13.929848078641644, "grad_norm": 2.1016898155212402, "learning_rate": 1.2766583546270027e-05, "loss": 0.7188, "num_input_tokens_seen": 54302136, "step": 93525 }, { "epoch": 13.930592791182603, "grad_norm": 1.719275951385498, "learning_rate": 1.2763749845510297e-05, "loss": 0.4557, "num_input_tokens_seen": 54305304, "step": 93530 }, { "epoch": 13.931337503723563, "grad_norm": 0.8230704069137573, "learning_rate": 1.2760916351469588e-05, "loss": 0.6799, "num_input_tokens_seen": 54308280, "step": 93535 }, { "epoch": 13.932082216264522, "grad_norm": 2.050567150115967, "learning_rate": 1.2758083064195756e-05, "loss": 0.5105, "num_input_tokens_seen": 54311160, "step": 93540 }, { "epoch": 13.932826928805481, "grad_norm": 1.5080393552780151, "learning_rate": 1.2755249983736662e-05, "loss": 0.6454, "num_input_tokens_seen": 54313944, "step": 93545 }, { "epoch": 13.93357164134644, "grad_norm": 1.35525381565094, "learning_rate": 1.275241711014018e-05, "loss": 0.5722, "num_input_tokens_seen": 54316792, "step": 93550 }, { "epoch": 13.9343163538874, "grad_norm": 1.2754957675933838, "learning_rate": 1.2749584443454154e-05, "loss": 0.5843, "num_input_tokens_seen": 54319992, "step": 93555 }, { "epoch": 13.935061066428359, "grad_norm": 1.3187072277069092, "learning_rate": 1.2746751983726459e-05, "loss": 0.579, "num_input_tokens_seen": 54322840, "step": 93560 }, { "epoch": 13.935805778969318, "grad_norm": 1.1371654272079468, "learning_rate": 1.2743919731004938e-05, "loss": 0.7006, "num_input_tokens_seen": 54325848, "step": 93565 }, { "epoch": 13.936550491510276, "grad_norm": 1.7758865356445312, "learning_rate": 1.2741087685337432e-05, "loss": 0.5441, "num_input_tokens_seen": 54328600, "step": 93570 }, { "epoch": 13.937295204051237, "grad_norm": 1.936741828918457, "learning_rate": 1.2738255846771785e-05, "loss": 0.6167, "num_input_tokens_seen": 54331224, "step": 93575 }, { "epoch": 13.938039916592196, "grad_norm": 1.9784871339797974, "learning_rate": 1.273542421535585e-05, "loss": 0.5275, "num_input_tokens_seen": 54334200, "step": 93580 }, { "epoch": 13.938784629133155, "grad_norm": 1.6976556777954102, "learning_rate": 1.273259279113746e-05, "loss": 0.5968, "num_input_tokens_seen": 54337016, "step": 93585 }, { "epoch": 13.939529341674113, "grad_norm": 1.178265929222107, "learning_rate": 1.2729761574164434e-05, "loss": 0.5195, "num_input_tokens_seen": 54339928, "step": 93590 }, { "epoch": 13.940274054215074, "grad_norm": 1.3212400674819946, "learning_rate": 1.2726930564484627e-05, "loss": 0.5714, "num_input_tokens_seen": 54342776, "step": 93595 }, { "epoch": 13.941018766756033, "grad_norm": 1.234636664390564, "learning_rate": 1.2724099762145841e-05, "loss": 0.6799, "num_input_tokens_seen": 54345752, "step": 93600 }, { "epoch": 13.941763479296991, "grad_norm": 1.2111479043960571, "learning_rate": 1.2721269167195926e-05, "loss": 0.6247, "num_input_tokens_seen": 54348376, "step": 93605 }, { "epoch": 13.94250819183795, "grad_norm": 1.8132660388946533, "learning_rate": 1.2718438779682678e-05, "loss": 0.3946, "num_input_tokens_seen": 54351448, "step": 93610 }, { "epoch": 13.943252904378909, "grad_norm": 1.0843974351882935, "learning_rate": 1.2715608599653938e-05, "loss": 0.546, "num_input_tokens_seen": 54354456, "step": 93615 }, { "epoch": 13.94399761691987, "grad_norm": 1.2327046394348145, "learning_rate": 1.271277862715749e-05, "loss": 0.6888, "num_input_tokens_seen": 54357432, "step": 93620 }, { "epoch": 13.944742329460828, "grad_norm": 1.6425600051879883, "learning_rate": 1.2709948862241173e-05, "loss": 0.6305, "num_input_tokens_seen": 54360536, "step": 93625 }, { "epoch": 13.945487042001787, "grad_norm": 1.1767070293426514, "learning_rate": 1.2707119304952777e-05, "loss": 0.5817, "num_input_tokens_seen": 54363352, "step": 93630 }, { "epoch": 13.946231754542747, "grad_norm": 2.039457082748413, "learning_rate": 1.2704289955340107e-05, "loss": 0.5986, "num_input_tokens_seen": 54366200, "step": 93635 }, { "epoch": 13.946976467083706, "grad_norm": 2.384244680404663, "learning_rate": 1.270146081345096e-05, "loss": 0.5286, "num_input_tokens_seen": 54368792, "step": 93640 }, { "epoch": 13.947721179624665, "grad_norm": 4.169332504272461, "learning_rate": 1.2698631879333126e-05, "loss": 0.6123, "num_input_tokens_seen": 54371480, "step": 93645 }, { "epoch": 13.948465892165624, "grad_norm": 1.3825503587722778, "learning_rate": 1.2695803153034411e-05, "loss": 0.682, "num_input_tokens_seen": 54374488, "step": 93650 }, { "epoch": 13.949210604706582, "grad_norm": 1.2416187524795532, "learning_rate": 1.2692974634602586e-05, "loss": 0.6447, "num_input_tokens_seen": 54377208, "step": 93655 }, { "epoch": 13.949955317247543, "grad_norm": 2.203605890274048, "learning_rate": 1.2690146324085458e-05, "loss": 0.6761, "num_input_tokens_seen": 54380088, "step": 93660 }, { "epoch": 13.950700029788502, "grad_norm": 1.3652150630950928, "learning_rate": 1.2687318221530797e-05, "loss": 0.6919, "num_input_tokens_seen": 54383064, "step": 93665 }, { "epoch": 13.95144474232946, "grad_norm": 1.4925397634506226, "learning_rate": 1.268449032698637e-05, "loss": 0.4863, "num_input_tokens_seen": 54385912, "step": 93670 }, { "epoch": 13.95218945487042, "grad_norm": 1.3583983182907104, "learning_rate": 1.2681662640499969e-05, "loss": 0.4489, "num_input_tokens_seen": 54388728, "step": 93675 }, { "epoch": 13.95293416741138, "grad_norm": 1.2982325553894043, "learning_rate": 1.2678835162119352e-05, "loss": 0.586, "num_input_tokens_seen": 54391640, "step": 93680 }, { "epoch": 13.953678879952339, "grad_norm": 1.3977735042572021, "learning_rate": 1.26760078918923e-05, "loss": 0.5892, "num_input_tokens_seen": 54394680, "step": 93685 }, { "epoch": 13.954423592493297, "grad_norm": 0.4186042249202728, "learning_rate": 1.267318082986656e-05, "loss": 0.5489, "num_input_tokens_seen": 54397560, "step": 93690 }, { "epoch": 13.955168305034256, "grad_norm": 1.0436909198760986, "learning_rate": 1.267035397608991e-05, "loss": 0.6509, "num_input_tokens_seen": 54400408, "step": 93695 }, { "epoch": 13.955913017575217, "grad_norm": 1.4203810691833496, "learning_rate": 1.2667527330610101e-05, "loss": 0.7253, "num_input_tokens_seen": 54403256, "step": 93700 }, { "epoch": 13.956657730116175, "grad_norm": 1.8234059810638428, "learning_rate": 1.2664700893474884e-05, "loss": 0.5144, "num_input_tokens_seen": 54406040, "step": 93705 }, { "epoch": 13.957402442657134, "grad_norm": 1.3114970922470093, "learning_rate": 1.2661874664732004e-05, "loss": 0.618, "num_input_tokens_seen": 54408856, "step": 93710 }, { "epoch": 13.958147155198093, "grad_norm": 2.0945048332214355, "learning_rate": 1.2659048644429205e-05, "loss": 0.6338, "num_input_tokens_seen": 54411544, "step": 93715 }, { "epoch": 13.958891867739053, "grad_norm": 1.481331467628479, "learning_rate": 1.2656222832614245e-05, "loss": 0.5369, "num_input_tokens_seen": 54414296, "step": 93720 }, { "epoch": 13.959636580280012, "grad_norm": 1.4554264545440674, "learning_rate": 1.2653397229334846e-05, "loss": 0.6032, "num_input_tokens_seen": 54416984, "step": 93725 }, { "epoch": 13.960381292820971, "grad_norm": 1.164549708366394, "learning_rate": 1.2650571834638764e-05, "loss": 0.559, "num_input_tokens_seen": 54419864, "step": 93730 }, { "epoch": 13.96112600536193, "grad_norm": 1.5938303470611572, "learning_rate": 1.2647746648573705e-05, "loss": 0.6972, "num_input_tokens_seen": 54422808, "step": 93735 }, { "epoch": 13.96187071790289, "grad_norm": 1.1783688068389893, "learning_rate": 1.2644921671187424e-05, "loss": 0.4828, "num_input_tokens_seen": 54426136, "step": 93740 }, { "epoch": 13.962615430443849, "grad_norm": 1.015498161315918, "learning_rate": 1.2642096902527633e-05, "loss": 0.5188, "num_input_tokens_seen": 54428984, "step": 93745 }, { "epoch": 13.963360142984808, "grad_norm": 1.0090816020965576, "learning_rate": 1.2639272342642047e-05, "loss": 0.613, "num_input_tokens_seen": 54432088, "step": 93750 }, { "epoch": 13.964104855525767, "grad_norm": 1.515710711479187, "learning_rate": 1.2636447991578401e-05, "loss": 0.6221, "num_input_tokens_seen": 54434904, "step": 93755 }, { "epoch": 13.964849568066727, "grad_norm": 1.405872106552124, "learning_rate": 1.263362384938439e-05, "loss": 0.5801, "num_input_tokens_seen": 54437720, "step": 93760 }, { "epoch": 13.965594280607686, "grad_norm": 1.2821869850158691, "learning_rate": 1.2630799916107747e-05, "loss": 0.6057, "num_input_tokens_seen": 54440792, "step": 93765 }, { "epoch": 13.966338993148645, "grad_norm": 1.2549346685409546, "learning_rate": 1.2627976191796165e-05, "loss": 0.6047, "num_input_tokens_seen": 54443704, "step": 93770 }, { "epoch": 13.967083705689603, "grad_norm": 1.9454411268234253, "learning_rate": 1.2625152676497354e-05, "loss": 0.5069, "num_input_tokens_seen": 54446392, "step": 93775 }, { "epoch": 13.967828418230564, "grad_norm": 2.6821401119232178, "learning_rate": 1.2622329370259001e-05, "loss": 0.6646, "num_input_tokens_seen": 54449304, "step": 93780 }, { "epoch": 13.968573130771523, "grad_norm": 1.9415843486785889, "learning_rate": 1.261950627312882e-05, "loss": 0.6409, "num_input_tokens_seen": 54452056, "step": 93785 }, { "epoch": 13.969317843312481, "grad_norm": 2.4828245639801025, "learning_rate": 1.2616683385154498e-05, "loss": 0.6026, "num_input_tokens_seen": 54454968, "step": 93790 }, { "epoch": 13.97006255585344, "grad_norm": 1.8729255199432373, "learning_rate": 1.2613860706383718e-05, "loss": 0.4579, "num_input_tokens_seen": 54457752, "step": 93795 }, { "epoch": 13.970807268394399, "grad_norm": 1.412544846534729, "learning_rate": 1.261103823686418e-05, "loss": 0.6369, "num_input_tokens_seen": 54460440, "step": 93800 }, { "epoch": 13.97155198093536, "grad_norm": 2.204883098602295, "learning_rate": 1.260821597664355e-05, "loss": 0.4635, "num_input_tokens_seen": 54463256, "step": 93805 }, { "epoch": 13.972296693476318, "grad_norm": 1.3891041278839111, "learning_rate": 1.2605393925769526e-05, "loss": 0.7914, "num_input_tokens_seen": 54465912, "step": 93810 }, { "epoch": 13.973041406017277, "grad_norm": 2.385145425796509, "learning_rate": 1.2602572084289765e-05, "loss": 0.6315, "num_input_tokens_seen": 54468568, "step": 93815 }, { "epoch": 13.973786118558236, "grad_norm": 1.4867359399795532, "learning_rate": 1.259975045225196e-05, "loss": 0.606, "num_input_tokens_seen": 54471192, "step": 93820 }, { "epoch": 13.974530831099196, "grad_norm": 1.1809722185134888, "learning_rate": 1.2596929029703766e-05, "loss": 0.6936, "num_input_tokens_seen": 54474072, "step": 93825 }, { "epoch": 13.975275543640155, "grad_norm": 1.3883429765701294, "learning_rate": 1.2594107816692852e-05, "loss": 0.5604, "num_input_tokens_seen": 54477080, "step": 93830 }, { "epoch": 13.976020256181114, "grad_norm": 1.6199959516525269, "learning_rate": 1.2591286813266867e-05, "loss": 0.532, "num_input_tokens_seen": 54480152, "step": 93835 }, { "epoch": 13.976764968722073, "grad_norm": 2.217618942260742, "learning_rate": 1.2588466019473488e-05, "loss": 0.6839, "num_input_tokens_seen": 54483288, "step": 93840 }, { "epoch": 13.977509681263033, "grad_norm": 2.635481357574463, "learning_rate": 1.2585645435360361e-05, "loss": 0.7426, "num_input_tokens_seen": 54486456, "step": 93845 }, { "epoch": 13.978254393803992, "grad_norm": 1.307848334312439, "learning_rate": 1.2582825060975128e-05, "loss": 0.4923, "num_input_tokens_seen": 54489208, "step": 93850 }, { "epoch": 13.97899910634495, "grad_norm": 1.2434358596801758, "learning_rate": 1.2580004896365455e-05, "loss": 0.5997, "num_input_tokens_seen": 54492408, "step": 93855 }, { "epoch": 13.97974381888591, "grad_norm": 1.215861439704895, "learning_rate": 1.2577184941578968e-05, "loss": 0.6924, "num_input_tokens_seen": 54495352, "step": 93860 }, { "epoch": 13.98048853142687, "grad_norm": 1.903029441833496, "learning_rate": 1.2574365196663324e-05, "loss": 0.5245, "num_input_tokens_seen": 54498232, "step": 93865 }, { "epoch": 13.981233243967829, "grad_norm": 1.6619149446487427, "learning_rate": 1.2571545661666151e-05, "loss": 0.5378, "num_input_tokens_seen": 54501304, "step": 93870 }, { "epoch": 13.981977956508787, "grad_norm": 1.6297192573547363, "learning_rate": 1.2568726336635073e-05, "loss": 0.6096, "num_input_tokens_seen": 54503960, "step": 93875 }, { "epoch": 13.982722669049746, "grad_norm": 1.2325960397720337, "learning_rate": 1.2565907221617738e-05, "loss": 0.6564, "num_input_tokens_seen": 54506872, "step": 93880 }, { "epoch": 13.983467381590707, "grad_norm": 1.723360538482666, "learning_rate": 1.2563088316661753e-05, "loss": 0.7089, "num_input_tokens_seen": 54510008, "step": 93885 }, { "epoch": 13.984212094131665, "grad_norm": 2.0667130947113037, "learning_rate": 1.256026962181476e-05, "loss": 0.4481, "num_input_tokens_seen": 54513048, "step": 93890 }, { "epoch": 13.984956806672624, "grad_norm": 1.5274536609649658, "learning_rate": 1.255745113712437e-05, "loss": 0.6334, "num_input_tokens_seen": 54515768, "step": 93895 }, { "epoch": 13.985701519213583, "grad_norm": 1.4230623245239258, "learning_rate": 1.2554632862638197e-05, "loss": 0.4731, "num_input_tokens_seen": 54518712, "step": 93900 }, { "epoch": 13.986446231754543, "grad_norm": 1.4985753297805786, "learning_rate": 1.2551814798403851e-05, "loss": 0.6361, "num_input_tokens_seen": 54521944, "step": 93905 }, { "epoch": 13.987190944295502, "grad_norm": 1.0767894983291626, "learning_rate": 1.2548996944468935e-05, "loss": 0.6441, "num_input_tokens_seen": 54524696, "step": 93910 }, { "epoch": 13.987935656836461, "grad_norm": 1.3746533393859863, "learning_rate": 1.254617930088107e-05, "loss": 0.5527, "num_input_tokens_seen": 54527448, "step": 93915 }, { "epoch": 13.98868036937742, "grad_norm": 1.697393536567688, "learning_rate": 1.2543361867687836e-05, "loss": 0.7604, "num_input_tokens_seen": 54530456, "step": 93920 }, { "epoch": 13.98942508191838, "grad_norm": 2.3383841514587402, "learning_rate": 1.2540544644936858e-05, "loss": 0.5591, "num_input_tokens_seen": 54533240, "step": 93925 }, { "epoch": 13.990169794459339, "grad_norm": 2.094233751296997, "learning_rate": 1.2537727632675699e-05, "loss": 0.7663, "num_input_tokens_seen": 54535864, "step": 93930 }, { "epoch": 13.990914507000298, "grad_norm": 2.4296083450317383, "learning_rate": 1.253491083095198e-05, "loss": 0.5725, "num_input_tokens_seen": 54538616, "step": 93935 }, { "epoch": 13.991659219541257, "grad_norm": 0.9869616031646729, "learning_rate": 1.253209423981326e-05, "loss": 0.453, "num_input_tokens_seen": 54541304, "step": 93940 }, { "epoch": 13.992403932082215, "grad_norm": 2.2806925773620605, "learning_rate": 1.2529277859307148e-05, "loss": 0.652, "num_input_tokens_seen": 54544152, "step": 93945 }, { "epoch": 13.993148644623176, "grad_norm": 1.3974504470825195, "learning_rate": 1.2526461689481212e-05, "loss": 0.5313, "num_input_tokens_seen": 54547032, "step": 93950 }, { "epoch": 13.993893357164135, "grad_norm": 1.8641107082366943, "learning_rate": 1.2523645730383018e-05, "loss": 0.5437, "num_input_tokens_seen": 54549816, "step": 93955 }, { "epoch": 13.994638069705093, "grad_norm": 0.8700082302093506, "learning_rate": 1.2520829982060162e-05, "loss": 0.5829, "num_input_tokens_seen": 54552760, "step": 93960 }, { "epoch": 13.995382782246054, "grad_norm": 1.9905592203140259, "learning_rate": 1.2518014444560195e-05, "loss": 0.6925, "num_input_tokens_seen": 54555736, "step": 93965 }, { "epoch": 13.996127494787013, "grad_norm": 1.3248445987701416, "learning_rate": 1.251519911793069e-05, "loss": 0.5958, "num_input_tokens_seen": 54558680, "step": 93970 }, { "epoch": 13.996872207327971, "grad_norm": 1.3263708353042603, "learning_rate": 1.2512384002219196e-05, "loss": 0.6959, "num_input_tokens_seen": 54561656, "step": 93975 }, { "epoch": 13.99761691986893, "grad_norm": 1.5801883935928345, "learning_rate": 1.2509569097473295e-05, "loss": 0.5788, "num_input_tokens_seen": 54564408, "step": 93980 }, { "epoch": 13.998361632409889, "grad_norm": 1.939935564994812, "learning_rate": 1.2506754403740529e-05, "loss": 0.7724, "num_input_tokens_seen": 54567736, "step": 93985 }, { "epoch": 13.99910634495085, "grad_norm": 2.041090250015259, "learning_rate": 1.2503939921068435e-05, "loss": 0.6125, "num_input_tokens_seen": 54570552, "step": 93990 }, { "epoch": 13.999851057491808, "grad_norm": 1.6058927774429321, "learning_rate": 1.2501125649504591e-05, "loss": 0.5635, "num_input_tokens_seen": 54573752, "step": 93995 }, { "epoch": 14.0, "eval_loss": 0.6551037430763245, "eval_runtime": 47.0229, "eval_samples_per_second": 63.459, "eval_steps_per_second": 15.865, "num_input_tokens_seen": 54573896, "step": 93996 }, { "epoch": 14.000595770032767, "grad_norm": 1.267048954963684, "learning_rate": 1.2498311589096514e-05, "loss": 0.4794, "num_input_tokens_seen": 54576136, "step": 94000 }, { "epoch": 14.001340482573726, "grad_norm": 1.2422558069229126, "learning_rate": 1.2495497739891764e-05, "loss": 0.4534, "num_input_tokens_seen": 54579176, "step": 94005 }, { "epoch": 14.002085195114686, "grad_norm": 1.5293431282043457, "learning_rate": 1.2492684101937865e-05, "loss": 0.3443, "num_input_tokens_seen": 54581896, "step": 94010 }, { "epoch": 14.002829907655645, "grad_norm": 1.2388592958450317, "learning_rate": 1.2489870675282364e-05, "loss": 0.6011, "num_input_tokens_seen": 54584648, "step": 94015 }, { "epoch": 14.003574620196604, "grad_norm": 1.7350190877914429, "learning_rate": 1.2487057459972775e-05, "loss": 0.5466, "num_input_tokens_seen": 54587400, "step": 94020 }, { "epoch": 14.004319332737563, "grad_norm": 1.2494398355484009, "learning_rate": 1.248424445605664e-05, "loss": 0.7702, "num_input_tokens_seen": 54590728, "step": 94025 }, { "epoch": 14.005064045278523, "grad_norm": 0.958795964717865, "learning_rate": 1.2481431663581474e-05, "loss": 0.5799, "num_input_tokens_seen": 54593704, "step": 94030 }, { "epoch": 14.005808757819482, "grad_norm": 2.191622018814087, "learning_rate": 1.24786190825948e-05, "loss": 0.6405, "num_input_tokens_seen": 54596584, "step": 94035 }, { "epoch": 14.00655347036044, "grad_norm": 1.3008219003677368, "learning_rate": 1.2475806713144128e-05, "loss": 0.6778, "num_input_tokens_seen": 54599208, "step": 94040 }, { "epoch": 14.0072981829014, "grad_norm": 3.9067020416259766, "learning_rate": 1.2472994555276964e-05, "loss": 0.8292, "num_input_tokens_seen": 54602152, "step": 94045 }, { "epoch": 14.00804289544236, "grad_norm": 0.875518262386322, "learning_rate": 1.2470182609040833e-05, "loss": 0.4175, "num_input_tokens_seen": 54604776, "step": 94050 }, { "epoch": 14.008787607983319, "grad_norm": 1.555317997932434, "learning_rate": 1.2467370874483225e-05, "loss": 0.5698, "num_input_tokens_seen": 54607528, "step": 94055 }, { "epoch": 14.009532320524277, "grad_norm": 1.1831679344177246, "learning_rate": 1.2464559351651658e-05, "loss": 0.5985, "num_input_tokens_seen": 54610664, "step": 94060 }, { "epoch": 14.010277033065236, "grad_norm": 1.6274096965789795, "learning_rate": 1.2461748040593621e-05, "loss": 0.6532, "num_input_tokens_seen": 54613672, "step": 94065 }, { "epoch": 14.011021745606197, "grad_norm": 1.8450103998184204, "learning_rate": 1.2458936941356594e-05, "loss": 0.668, "num_input_tokens_seen": 54616520, "step": 94070 }, { "epoch": 14.011766458147155, "grad_norm": 1.2674624919891357, "learning_rate": 1.2456126053988093e-05, "loss": 0.5073, "num_input_tokens_seen": 54619208, "step": 94075 }, { "epoch": 14.012511170688114, "grad_norm": 1.1778370141983032, "learning_rate": 1.2453315378535584e-05, "loss": 0.5531, "num_input_tokens_seen": 54622152, "step": 94080 }, { "epoch": 14.013255883229073, "grad_norm": 1.0312460660934448, "learning_rate": 1.245050491504657e-05, "loss": 0.5122, "num_input_tokens_seen": 54625096, "step": 94085 }, { "epoch": 14.014000595770034, "grad_norm": 3.2989606857299805, "learning_rate": 1.2447694663568509e-05, "loss": 0.74, "num_input_tokens_seen": 54627880, "step": 94090 }, { "epoch": 14.014745308310992, "grad_norm": 1.2830281257629395, "learning_rate": 1.24448846241489e-05, "loss": 0.704, "num_input_tokens_seen": 54630696, "step": 94095 }, { "epoch": 14.015490020851951, "grad_norm": 1.5643552541732788, "learning_rate": 1.2442074796835206e-05, "loss": 0.6475, "num_input_tokens_seen": 54633768, "step": 94100 }, { "epoch": 14.01623473339291, "grad_norm": 2.738349437713623, "learning_rate": 1.2439265181674895e-05, "loss": 0.6947, "num_input_tokens_seen": 54636616, "step": 94105 }, { "epoch": 14.01697944593387, "grad_norm": 1.341670274734497, "learning_rate": 1.2436455778715431e-05, "loss": 0.6505, "num_input_tokens_seen": 54639464, "step": 94110 }, { "epoch": 14.017724158474829, "grad_norm": 2.8435003757476807, "learning_rate": 1.2433646588004266e-05, "loss": 0.5144, "num_input_tokens_seen": 54642600, "step": 94115 }, { "epoch": 14.018468871015788, "grad_norm": 0.9510325193405151, "learning_rate": 1.2430837609588883e-05, "loss": 0.5671, "num_input_tokens_seen": 54645480, "step": 94120 }, { "epoch": 14.019213583556747, "grad_norm": 0.9871028661727905, "learning_rate": 1.2428028843516715e-05, "loss": 0.6638, "num_input_tokens_seen": 54648776, "step": 94125 }, { "epoch": 14.019958296097707, "grad_norm": 1.8948405981063843, "learning_rate": 1.2425220289835229e-05, "loss": 0.5559, "num_input_tokens_seen": 54651432, "step": 94130 }, { "epoch": 14.020703008638666, "grad_norm": 1.479057788848877, "learning_rate": 1.2422411948591855e-05, "loss": 0.7599, "num_input_tokens_seen": 54654504, "step": 94135 }, { "epoch": 14.021447721179625, "grad_norm": 1.1638988256454468, "learning_rate": 1.241960381983406e-05, "loss": 0.6648, "num_input_tokens_seen": 54657256, "step": 94140 }, { "epoch": 14.022192433720583, "grad_norm": 1.57688307762146, "learning_rate": 1.2416795903609274e-05, "loss": 0.5753, "num_input_tokens_seen": 54660424, "step": 94145 }, { "epoch": 14.022937146261542, "grad_norm": 0.9338730573654175, "learning_rate": 1.2413988199964918e-05, "loss": 0.5761, "num_input_tokens_seen": 54663592, "step": 94150 }, { "epoch": 14.023681858802503, "grad_norm": 1.9944918155670166, "learning_rate": 1.2411180708948453e-05, "loss": 0.6962, "num_input_tokens_seen": 54666472, "step": 94155 }, { "epoch": 14.024426571343461, "grad_norm": 1.3186919689178467, "learning_rate": 1.2408373430607296e-05, "loss": 0.6586, "num_input_tokens_seen": 54669704, "step": 94160 }, { "epoch": 14.02517128388442, "grad_norm": 1.2233437299728394, "learning_rate": 1.2405566364988857e-05, "loss": 0.6916, "num_input_tokens_seen": 54672904, "step": 94165 }, { "epoch": 14.025915996425379, "grad_norm": 1.176613211631775, "learning_rate": 1.2402759512140588e-05, "loss": 0.3802, "num_input_tokens_seen": 54675976, "step": 94170 }, { "epoch": 14.02666070896634, "grad_norm": 1.230481505393982, "learning_rate": 1.2399952872109893e-05, "loss": 0.6374, "num_input_tokens_seen": 54678952, "step": 94175 }, { "epoch": 14.027405421507298, "grad_norm": 2.279108762741089, "learning_rate": 1.239714644494418e-05, "loss": 0.7189, "num_input_tokens_seen": 54681672, "step": 94180 }, { "epoch": 14.028150134048257, "grad_norm": 1.3909852504730225, "learning_rate": 1.2394340230690877e-05, "loss": 0.5061, "num_input_tokens_seen": 54684520, "step": 94185 }, { "epoch": 14.028894846589216, "grad_norm": 1.968762993812561, "learning_rate": 1.2391534229397384e-05, "loss": 0.6389, "num_input_tokens_seen": 54687208, "step": 94190 }, { "epoch": 14.029639559130176, "grad_norm": 1.6297794580459595, "learning_rate": 1.2388728441111095e-05, "loss": 0.712, "num_input_tokens_seen": 54689768, "step": 94195 }, { "epoch": 14.030384271671135, "grad_norm": 1.5998117923736572, "learning_rate": 1.2385922865879432e-05, "loss": 0.7334, "num_input_tokens_seen": 54692776, "step": 94200 }, { "epoch": 14.031128984212094, "grad_norm": 2.958400249481201, "learning_rate": 1.2383117503749769e-05, "loss": 0.5182, "num_input_tokens_seen": 54695464, "step": 94205 }, { "epoch": 14.031873696753053, "grad_norm": 1.3893917798995972, "learning_rate": 1.2380312354769526e-05, "loss": 0.6727, "num_input_tokens_seen": 54698312, "step": 94210 }, { "epoch": 14.032618409294013, "grad_norm": 1.4876623153686523, "learning_rate": 1.2377507418986071e-05, "loss": 0.4504, "num_input_tokens_seen": 54701480, "step": 94215 }, { "epoch": 14.033363121834972, "grad_norm": 0.7291302680969238, "learning_rate": 1.2374702696446806e-05, "loss": 0.5312, "num_input_tokens_seen": 54704456, "step": 94220 }, { "epoch": 14.03410783437593, "grad_norm": 2.7857768535614014, "learning_rate": 1.2371898187199108e-05, "loss": 0.5861, "num_input_tokens_seen": 54707240, "step": 94225 }, { "epoch": 14.03485254691689, "grad_norm": 0.9703319668769836, "learning_rate": 1.2369093891290357e-05, "loss": 0.4804, "num_input_tokens_seen": 54710120, "step": 94230 }, { "epoch": 14.03559725945785, "grad_norm": 1.8559997081756592, "learning_rate": 1.2366289808767926e-05, "loss": 0.8114, "num_input_tokens_seen": 54712808, "step": 94235 }, { "epoch": 14.036341971998809, "grad_norm": 2.026994466781616, "learning_rate": 1.2363485939679175e-05, "loss": 0.5889, "num_input_tokens_seen": 54715848, "step": 94240 }, { "epoch": 14.037086684539767, "grad_norm": 1.4089590311050415, "learning_rate": 1.23606822840715e-05, "loss": 0.5522, "num_input_tokens_seen": 54718920, "step": 94245 }, { "epoch": 14.037831397080726, "grad_norm": 1.5683947801589966, "learning_rate": 1.2357878841992243e-05, "loss": 0.6614, "num_input_tokens_seen": 54721928, "step": 94250 }, { "epoch": 14.038576109621687, "grad_norm": 1.5405547618865967, "learning_rate": 1.2355075613488782e-05, "loss": 0.7021, "num_input_tokens_seen": 54724712, "step": 94255 }, { "epoch": 14.039320822162646, "grad_norm": 3.1638920307159424, "learning_rate": 1.2352272598608455e-05, "loss": 0.5909, "num_input_tokens_seen": 54727496, "step": 94260 }, { "epoch": 14.040065534703604, "grad_norm": 1.8795021772384644, "learning_rate": 1.234946979739864e-05, "loss": 0.6955, "num_input_tokens_seen": 54730312, "step": 94265 }, { "epoch": 14.040810247244563, "grad_norm": 1.745862364768982, "learning_rate": 1.2346667209906677e-05, "loss": 0.657, "num_input_tokens_seen": 54733512, "step": 94270 }, { "epoch": 14.041554959785524, "grad_norm": 1.2044613361358643, "learning_rate": 1.23438648361799e-05, "loss": 0.5767, "num_input_tokens_seen": 54736488, "step": 94275 }, { "epoch": 14.042299672326482, "grad_norm": 1.2985739707946777, "learning_rate": 1.2341062676265671e-05, "loss": 0.6709, "num_input_tokens_seen": 54739368, "step": 94280 }, { "epoch": 14.043044384867441, "grad_norm": 2.1275689601898193, "learning_rate": 1.2338260730211316e-05, "loss": 0.6206, "num_input_tokens_seen": 54742152, "step": 94285 }, { "epoch": 14.0437890974084, "grad_norm": 1.278432011604309, "learning_rate": 1.2335458998064184e-05, "loss": 0.5089, "num_input_tokens_seen": 54745224, "step": 94290 }, { "epoch": 14.04453380994936, "grad_norm": 1.187258243560791, "learning_rate": 1.23326574798716e-05, "loss": 0.5388, "num_input_tokens_seen": 54748008, "step": 94295 }, { "epoch": 14.04527852249032, "grad_norm": 2.3158700466156006, "learning_rate": 1.2329856175680896e-05, "loss": 0.416, "num_input_tokens_seen": 54750888, "step": 94300 }, { "epoch": 14.046023235031278, "grad_norm": 2.231717109680176, "learning_rate": 1.2327055085539382e-05, "loss": 0.5701, "num_input_tokens_seen": 54753928, "step": 94305 }, { "epoch": 14.046767947572237, "grad_norm": 1.023863673210144, "learning_rate": 1.2324254209494405e-05, "loss": 0.7028, "num_input_tokens_seen": 54757160, "step": 94310 }, { "epoch": 14.047512660113195, "grad_norm": 1.5124619007110596, "learning_rate": 1.2321453547593267e-05, "loss": 0.5504, "num_input_tokens_seen": 54760040, "step": 94315 }, { "epoch": 14.048257372654156, "grad_norm": 0.9971659183502197, "learning_rate": 1.2318653099883278e-05, "loss": 0.6242, "num_input_tokens_seen": 54763304, "step": 94320 }, { "epoch": 14.049002085195115, "grad_norm": 1.83246910572052, "learning_rate": 1.2315852866411767e-05, "loss": 0.5856, "num_input_tokens_seen": 54766248, "step": 94325 }, { "epoch": 14.049746797736073, "grad_norm": 1.373368263244629, "learning_rate": 1.2313052847226018e-05, "loss": 0.5647, "num_input_tokens_seen": 54769000, "step": 94330 }, { "epoch": 14.050491510277032, "grad_norm": 1.3063745498657227, "learning_rate": 1.2310253042373356e-05, "loss": 0.7258, "num_input_tokens_seen": 54771816, "step": 94335 }, { "epoch": 14.051236222817993, "grad_norm": 1.016156792640686, "learning_rate": 1.2307453451901063e-05, "loss": 0.7488, "num_input_tokens_seen": 54774536, "step": 94340 }, { "epoch": 14.051980935358952, "grad_norm": 2.0984835624694824, "learning_rate": 1.2304654075856452e-05, "loss": 0.546, "num_input_tokens_seen": 54777416, "step": 94345 }, { "epoch": 14.05272564789991, "grad_norm": 2.1972873210906982, "learning_rate": 1.2301854914286812e-05, "loss": 0.4949, "num_input_tokens_seen": 54780168, "step": 94350 }, { "epoch": 14.053470360440869, "grad_norm": 1.564921259880066, "learning_rate": 1.2299055967239415e-05, "loss": 0.6646, "num_input_tokens_seen": 54783144, "step": 94355 }, { "epoch": 14.05421507298183, "grad_norm": 1.5718189477920532, "learning_rate": 1.2296257234761566e-05, "loss": 0.7372, "num_input_tokens_seen": 54786024, "step": 94360 }, { "epoch": 14.054959785522788, "grad_norm": 1.5668336153030396, "learning_rate": 1.2293458716900543e-05, "loss": 0.4481, "num_input_tokens_seen": 54789000, "step": 94365 }, { "epoch": 14.055704498063747, "grad_norm": 1.1911802291870117, "learning_rate": 1.229066041370362e-05, "loss": 0.8235, "num_input_tokens_seen": 54791912, "step": 94370 }, { "epoch": 14.056449210604706, "grad_norm": 1.7993427515029907, "learning_rate": 1.228786232521806e-05, "loss": 0.75, "num_input_tokens_seen": 54794856, "step": 94375 }, { "epoch": 14.057193923145666, "grad_norm": 1.6972299814224243, "learning_rate": 1.2285064451491157e-05, "loss": 0.7851, "num_input_tokens_seen": 54797768, "step": 94380 }, { "epoch": 14.057938635686625, "grad_norm": 3.3755037784576416, "learning_rate": 1.2282266792570158e-05, "loss": 0.5677, "num_input_tokens_seen": 54800712, "step": 94385 }, { "epoch": 14.058683348227584, "grad_norm": 1.411289095878601, "learning_rate": 1.2279469348502345e-05, "loss": 0.5492, "num_input_tokens_seen": 54803400, "step": 94390 }, { "epoch": 14.059428060768543, "grad_norm": 1.4970279932022095, "learning_rate": 1.227667211933497e-05, "loss": 0.5091, "num_input_tokens_seen": 54806312, "step": 94395 }, { "epoch": 14.060172773309503, "grad_norm": 2.0690417289733887, "learning_rate": 1.2273875105115275e-05, "loss": 0.565, "num_input_tokens_seen": 54809480, "step": 94400 }, { "epoch": 14.060917485850462, "grad_norm": 1.5739545822143555, "learning_rate": 1.227107830589054e-05, "loss": 0.5559, "num_input_tokens_seen": 54812392, "step": 94405 }, { "epoch": 14.06166219839142, "grad_norm": 2.0982468128204346, "learning_rate": 1.2268281721707989e-05, "loss": 0.5262, "num_input_tokens_seen": 54815304, "step": 94410 }, { "epoch": 14.06240691093238, "grad_norm": 1.8852108716964722, "learning_rate": 1.2265485352614887e-05, "loss": 0.5859, "num_input_tokens_seen": 54818216, "step": 94415 }, { "epoch": 14.06315162347334, "grad_norm": 1.6313955783843994, "learning_rate": 1.226268919865846e-05, "loss": 0.6542, "num_input_tokens_seen": 54820872, "step": 94420 }, { "epoch": 14.063896336014299, "grad_norm": 0.9447634220123291, "learning_rate": 1.225989325988596e-05, "loss": 0.5601, "num_input_tokens_seen": 54823720, "step": 94425 }, { "epoch": 14.064641048555258, "grad_norm": 1.277021884918213, "learning_rate": 1.2257097536344613e-05, "loss": 0.4443, "num_input_tokens_seen": 54826536, "step": 94430 }, { "epoch": 14.065385761096216, "grad_norm": 1.4484063386917114, "learning_rate": 1.2254302028081657e-05, "loss": 0.6935, "num_input_tokens_seen": 54829352, "step": 94435 }, { "epoch": 14.066130473637177, "grad_norm": 2.2047035694122314, "learning_rate": 1.225150673514431e-05, "loss": 0.7333, "num_input_tokens_seen": 54832200, "step": 94440 }, { "epoch": 14.066875186178136, "grad_norm": 1.5471374988555908, "learning_rate": 1.2248711657579792e-05, "loss": 0.6881, "num_input_tokens_seen": 54835144, "step": 94445 }, { "epoch": 14.067619898719094, "grad_norm": 1.486323356628418, "learning_rate": 1.2245916795435342e-05, "loss": 0.4572, "num_input_tokens_seen": 54838120, "step": 94450 }, { "epoch": 14.068364611260053, "grad_norm": 1.84419846534729, "learning_rate": 1.2243122148758152e-05, "loss": 0.5745, "num_input_tokens_seen": 54840840, "step": 94455 }, { "epoch": 14.069109323801014, "grad_norm": 0.8779679536819458, "learning_rate": 1.224032771759546e-05, "loss": 0.575, "num_input_tokens_seen": 54843848, "step": 94460 }, { "epoch": 14.069854036341972, "grad_norm": 1.8296931982040405, "learning_rate": 1.2237533501994452e-05, "loss": 0.7808, "num_input_tokens_seen": 54846728, "step": 94465 }, { "epoch": 14.070598748882931, "grad_norm": 1.2456713914871216, "learning_rate": 1.2234739502002353e-05, "loss": 0.4516, "num_input_tokens_seen": 54849768, "step": 94470 }, { "epoch": 14.07134346142389, "grad_norm": 2.258964776992798, "learning_rate": 1.2231945717666358e-05, "loss": 0.5231, "num_input_tokens_seen": 54852680, "step": 94475 }, { "epoch": 14.07208817396485, "grad_norm": 0.9805372357368469, "learning_rate": 1.2229152149033655e-05, "loss": 0.5792, "num_input_tokens_seen": 54856040, "step": 94480 }, { "epoch": 14.07283288650581, "grad_norm": 1.1770143508911133, "learning_rate": 1.2226358796151452e-05, "loss": 0.4268, "num_input_tokens_seen": 54859208, "step": 94485 }, { "epoch": 14.073577599046768, "grad_norm": 1.4134645462036133, "learning_rate": 1.2223565659066938e-05, "loss": 0.5728, "num_input_tokens_seen": 54862056, "step": 94490 }, { "epoch": 14.074322311587727, "grad_norm": 1.596293330192566, "learning_rate": 1.2220772737827285e-05, "loss": 0.6499, "num_input_tokens_seen": 54865128, "step": 94495 }, { "epoch": 14.075067024128685, "grad_norm": 1.3124475479125977, "learning_rate": 1.2217980032479701e-05, "loss": 0.5942, "num_input_tokens_seen": 54867976, "step": 94500 }, { "epoch": 14.075811736669646, "grad_norm": 1.134738802909851, "learning_rate": 1.221518754307135e-05, "loss": 0.6293, "num_input_tokens_seen": 54870824, "step": 94505 }, { "epoch": 14.076556449210605, "grad_norm": 1.0575475692749023, "learning_rate": 1.2212395269649413e-05, "loss": 0.6613, "num_input_tokens_seen": 54874056, "step": 94510 }, { "epoch": 14.077301161751564, "grad_norm": 1.3543874025344849, "learning_rate": 1.220960321226105e-05, "loss": 0.7527, "num_input_tokens_seen": 54876744, "step": 94515 }, { "epoch": 14.078045874292522, "grad_norm": 1.124672532081604, "learning_rate": 1.2206811370953453e-05, "loss": 0.6438, "num_input_tokens_seen": 54879656, "step": 94520 }, { "epoch": 14.078790586833483, "grad_norm": 1.8301681280136108, "learning_rate": 1.2204019745773764e-05, "loss": 0.6402, "num_input_tokens_seen": 54882600, "step": 94525 }, { "epoch": 14.079535299374442, "grad_norm": 1.2265551090240479, "learning_rate": 1.2201228336769169e-05, "loss": 0.6788, "num_input_tokens_seen": 54885352, "step": 94530 }, { "epoch": 14.0802800119154, "grad_norm": 2.022709846496582, "learning_rate": 1.2198437143986798e-05, "loss": 0.6031, "num_input_tokens_seen": 54888264, "step": 94535 }, { "epoch": 14.081024724456359, "grad_norm": 1.5867407321929932, "learning_rate": 1.2195646167473835e-05, "loss": 0.4372, "num_input_tokens_seen": 54891336, "step": 94540 }, { "epoch": 14.08176943699732, "grad_norm": 1.359142541885376, "learning_rate": 1.2192855407277407e-05, "loss": 0.5521, "num_input_tokens_seen": 54894280, "step": 94545 }, { "epoch": 14.082514149538278, "grad_norm": 1.7132487297058105, "learning_rate": 1.2190064863444675e-05, "loss": 0.5341, "num_input_tokens_seen": 54897608, "step": 94550 }, { "epoch": 14.083258862079237, "grad_norm": 3.421954870223999, "learning_rate": 1.2187274536022783e-05, "loss": 0.7779, "num_input_tokens_seen": 54900296, "step": 94555 }, { "epoch": 14.084003574620196, "grad_norm": 2.0042197704315186, "learning_rate": 1.2184484425058863e-05, "loss": 0.5658, "num_input_tokens_seen": 54903112, "step": 94560 }, { "epoch": 14.084748287161156, "grad_norm": 0.6719555854797363, "learning_rate": 1.2181694530600052e-05, "loss": 0.4093, "num_input_tokens_seen": 54906184, "step": 94565 }, { "epoch": 14.085492999702115, "grad_norm": 1.323555827140808, "learning_rate": 1.2178904852693476e-05, "loss": 0.4583, "num_input_tokens_seen": 54908904, "step": 94570 }, { "epoch": 14.086237712243074, "grad_norm": 1.3800276517868042, "learning_rate": 1.217611539138628e-05, "loss": 0.5133, "num_input_tokens_seen": 54911816, "step": 94575 }, { "epoch": 14.086982424784033, "grad_norm": 1.568495512008667, "learning_rate": 1.2173326146725575e-05, "loss": 0.5718, "num_input_tokens_seen": 54914600, "step": 94580 }, { "epoch": 14.087727137324993, "grad_norm": 1.1573978662490845, "learning_rate": 1.2170537118758496e-05, "loss": 0.5744, "num_input_tokens_seen": 54917640, "step": 94585 }, { "epoch": 14.088471849865952, "grad_norm": 2.0024733543395996, "learning_rate": 1.216774830753215e-05, "loss": 0.4907, "num_input_tokens_seen": 54920392, "step": 94590 }, { "epoch": 14.08921656240691, "grad_norm": 3.2976059913635254, "learning_rate": 1.2164959713093649e-05, "loss": 0.4748, "num_input_tokens_seen": 54923080, "step": 94595 }, { "epoch": 14.08996127494787, "grad_norm": 1.6628824472427368, "learning_rate": 1.2162171335490115e-05, "loss": 0.735, "num_input_tokens_seen": 54926024, "step": 94600 }, { "epoch": 14.09070598748883, "grad_norm": 1.5602132081985474, "learning_rate": 1.2159383174768641e-05, "loss": 0.5705, "num_input_tokens_seen": 54929064, "step": 94605 }, { "epoch": 14.091450700029789, "grad_norm": 1.951608657836914, "learning_rate": 1.2156595230976348e-05, "loss": 0.6909, "num_input_tokens_seen": 54932104, "step": 94610 }, { "epoch": 14.092195412570748, "grad_norm": 0.9503153562545776, "learning_rate": 1.2153807504160313e-05, "loss": 0.6912, "num_input_tokens_seen": 54934824, "step": 94615 }, { "epoch": 14.092940125111706, "grad_norm": 0.7581939101219177, "learning_rate": 1.2151019994367655e-05, "loss": 0.5787, "num_input_tokens_seen": 54937768, "step": 94620 }, { "epoch": 14.093684837652667, "grad_norm": 0.8304468989372253, "learning_rate": 1.2148232701645453e-05, "loss": 0.3775, "num_input_tokens_seen": 54940712, "step": 94625 }, { "epoch": 14.094429550193626, "grad_norm": 1.857037901878357, "learning_rate": 1.2145445626040801e-05, "loss": 0.6129, "num_input_tokens_seen": 54943464, "step": 94630 }, { "epoch": 14.095174262734584, "grad_norm": 1.1710516214370728, "learning_rate": 1.2142658767600779e-05, "loss": 0.5487, "num_input_tokens_seen": 54946184, "step": 94635 }, { "epoch": 14.095918975275543, "grad_norm": 1.3406261205673218, "learning_rate": 1.213987212637246e-05, "loss": 0.5737, "num_input_tokens_seen": 54948968, "step": 94640 }, { "epoch": 14.096663687816504, "grad_norm": 1.1231025457382202, "learning_rate": 1.2137085702402939e-05, "loss": 0.571, "num_input_tokens_seen": 54952040, "step": 94645 }, { "epoch": 14.097408400357462, "grad_norm": 2.2070908546447754, "learning_rate": 1.2134299495739274e-05, "loss": 0.6007, "num_input_tokens_seen": 54954984, "step": 94650 }, { "epoch": 14.098153112898421, "grad_norm": 1.8658231496810913, "learning_rate": 1.2131513506428552e-05, "loss": 0.6564, "num_input_tokens_seen": 54958184, "step": 94655 }, { "epoch": 14.09889782543938, "grad_norm": 1.5987282991409302, "learning_rate": 1.2128727734517819e-05, "loss": 0.5101, "num_input_tokens_seen": 54961384, "step": 94660 }, { "epoch": 14.099642537980339, "grad_norm": 1.2728488445281982, "learning_rate": 1.2125942180054161e-05, "loss": 0.6162, "num_input_tokens_seen": 54964040, "step": 94665 }, { "epoch": 14.1003872505213, "grad_norm": 2.7880852222442627, "learning_rate": 1.2123156843084624e-05, "loss": 0.6926, "num_input_tokens_seen": 54966824, "step": 94670 }, { "epoch": 14.101131963062258, "grad_norm": 1.72451651096344, "learning_rate": 1.2120371723656257e-05, "loss": 0.5316, "num_input_tokens_seen": 54969704, "step": 94675 }, { "epoch": 14.101876675603217, "grad_norm": 2.4802517890930176, "learning_rate": 1.2117586821816127e-05, "loss": 0.6943, "num_input_tokens_seen": 54972296, "step": 94680 }, { "epoch": 14.102621388144176, "grad_norm": 1.6375523805618286, "learning_rate": 1.2114802137611266e-05, "loss": 0.6909, "num_input_tokens_seen": 54975208, "step": 94685 }, { "epoch": 14.103366100685136, "grad_norm": 1.446577548980713, "learning_rate": 1.2112017671088737e-05, "loss": 0.5281, "num_input_tokens_seen": 54977864, "step": 94690 }, { "epoch": 14.104110813226095, "grad_norm": 2.400608539581299, "learning_rate": 1.2109233422295568e-05, "loss": 0.6568, "num_input_tokens_seen": 54980712, "step": 94695 }, { "epoch": 14.104855525767054, "grad_norm": 1.0845460891723633, "learning_rate": 1.2106449391278802e-05, "loss": 0.4842, "num_input_tokens_seen": 54983400, "step": 94700 }, { "epoch": 14.105600238308012, "grad_norm": 1.6207005977630615, "learning_rate": 1.2103665578085458e-05, "loss": 0.6213, "num_input_tokens_seen": 54986248, "step": 94705 }, { "epoch": 14.106344950848973, "grad_norm": 1.666038990020752, "learning_rate": 1.2100881982762589e-05, "loss": 0.581, "num_input_tokens_seen": 54989704, "step": 94710 }, { "epoch": 14.107089663389932, "grad_norm": 1.4855741262435913, "learning_rate": 1.2098098605357205e-05, "loss": 0.6003, "num_input_tokens_seen": 54992776, "step": 94715 }, { "epoch": 14.10783437593089, "grad_norm": 2.416745901107788, "learning_rate": 1.2095315445916323e-05, "loss": 0.6234, "num_input_tokens_seen": 54995880, "step": 94720 }, { "epoch": 14.10857908847185, "grad_norm": 0.6150317192077637, "learning_rate": 1.2092532504486981e-05, "loss": 0.57, "num_input_tokens_seen": 54998472, "step": 94725 }, { "epoch": 14.10932380101281, "grad_norm": 1.6844617128372192, "learning_rate": 1.2089749781116175e-05, "loss": 0.576, "num_input_tokens_seen": 55001384, "step": 94730 }, { "epoch": 14.110068513553768, "grad_norm": 1.6629055738449097, "learning_rate": 1.2086967275850936e-05, "loss": 0.6806, "num_input_tokens_seen": 55004264, "step": 94735 }, { "epoch": 14.110813226094727, "grad_norm": 1.0681021213531494, "learning_rate": 1.2084184988738247e-05, "loss": 0.5281, "num_input_tokens_seen": 55007368, "step": 94740 }, { "epoch": 14.111557938635686, "grad_norm": 1.294558048248291, "learning_rate": 1.2081402919825139e-05, "loss": 0.7198, "num_input_tokens_seen": 55010024, "step": 94745 }, { "epoch": 14.112302651176647, "grad_norm": 1.8305330276489258, "learning_rate": 1.2078621069158596e-05, "loss": 0.8554, "num_input_tokens_seen": 55012872, "step": 94750 }, { "epoch": 14.113047363717605, "grad_norm": 1.0824899673461914, "learning_rate": 1.2075839436785611e-05, "loss": 0.6767, "num_input_tokens_seen": 55015784, "step": 94755 }, { "epoch": 14.113792076258564, "grad_norm": 1.4107962846755981, "learning_rate": 1.2073058022753189e-05, "loss": 0.4856, "num_input_tokens_seen": 55018600, "step": 94760 }, { "epoch": 14.114536788799523, "grad_norm": 1.3545057773590088, "learning_rate": 1.2070276827108315e-05, "loss": 0.6134, "num_input_tokens_seen": 55021576, "step": 94765 }, { "epoch": 14.115281501340483, "grad_norm": 1.0321458578109741, "learning_rate": 1.2067495849897972e-05, "loss": 0.4798, "num_input_tokens_seen": 55024136, "step": 94770 }, { "epoch": 14.116026213881442, "grad_norm": 1.255862832069397, "learning_rate": 1.2064715091169135e-05, "loss": 0.4991, "num_input_tokens_seen": 55027112, "step": 94775 }, { "epoch": 14.1167709264224, "grad_norm": 1.6773624420166016, "learning_rate": 1.2061934550968798e-05, "loss": 0.5929, "num_input_tokens_seen": 55030088, "step": 94780 }, { "epoch": 14.11751563896336, "grad_norm": 1.7978734970092773, "learning_rate": 1.2059154229343919e-05, "loss": 0.4681, "num_input_tokens_seen": 55032872, "step": 94785 }, { "epoch": 14.11826035150432, "grad_norm": 1.6524388790130615, "learning_rate": 1.2056374126341485e-05, "loss": 0.5803, "num_input_tokens_seen": 55036168, "step": 94790 }, { "epoch": 14.119005064045279, "grad_norm": 1.4094796180725098, "learning_rate": 1.2053594242008453e-05, "loss": 0.4194, "num_input_tokens_seen": 55039112, "step": 94795 }, { "epoch": 14.119749776586238, "grad_norm": 1.8192291259765625, "learning_rate": 1.205081457639178e-05, "loss": 0.6225, "num_input_tokens_seen": 55042024, "step": 94800 }, { "epoch": 14.120494489127196, "grad_norm": 1.7311806678771973, "learning_rate": 1.2048035129538446e-05, "loss": 0.6469, "num_input_tokens_seen": 55044968, "step": 94805 }, { "epoch": 14.121239201668157, "grad_norm": 1.8758572340011597, "learning_rate": 1.2045255901495384e-05, "loss": 0.9202, "num_input_tokens_seen": 55047912, "step": 94810 }, { "epoch": 14.121983914209116, "grad_norm": 1.3904415369033813, "learning_rate": 1.2042476892309565e-05, "loss": 0.4503, "num_input_tokens_seen": 55050984, "step": 94815 }, { "epoch": 14.122728626750074, "grad_norm": 2.0735931396484375, "learning_rate": 1.203969810202793e-05, "loss": 0.5345, "num_input_tokens_seen": 55053832, "step": 94820 }, { "epoch": 14.123473339291033, "grad_norm": 2.098463535308838, "learning_rate": 1.2036919530697412e-05, "loss": 0.6468, "num_input_tokens_seen": 55056712, "step": 94825 }, { "epoch": 14.124218051831992, "grad_norm": 2.06247878074646, "learning_rate": 1.2034141178364974e-05, "loss": 0.6323, "num_input_tokens_seen": 55059432, "step": 94830 }, { "epoch": 14.124962764372953, "grad_norm": 1.2928614616394043, "learning_rate": 1.2031363045077545e-05, "loss": 0.7301, "num_input_tokens_seen": 55062600, "step": 94835 }, { "epoch": 14.125707476913911, "grad_norm": 3.171009063720703, "learning_rate": 1.2028585130882056e-05, "loss": 0.7981, "num_input_tokens_seen": 55065416, "step": 94840 }, { "epoch": 14.12645218945487, "grad_norm": 1.1922825574874878, "learning_rate": 1.2025807435825426e-05, "loss": 0.5273, "num_input_tokens_seen": 55068104, "step": 94845 }, { "epoch": 14.127196901995829, "grad_norm": 1.8599371910095215, "learning_rate": 1.2023029959954603e-05, "loss": 0.5502, "num_input_tokens_seen": 55070920, "step": 94850 }, { "epoch": 14.12794161453679, "grad_norm": 1.456338882446289, "learning_rate": 1.2020252703316492e-05, "loss": 0.6513, "num_input_tokens_seen": 55073896, "step": 94855 }, { "epoch": 14.128686327077748, "grad_norm": 1.6380478143692017, "learning_rate": 1.2017475665958028e-05, "loss": 0.5794, "num_input_tokens_seen": 55076776, "step": 94860 }, { "epoch": 14.129431039618707, "grad_norm": 1.590394377708435, "learning_rate": 1.201469884792611e-05, "loss": 0.5911, "num_input_tokens_seen": 55079592, "step": 94865 }, { "epoch": 14.130175752159666, "grad_norm": 1.1404527425765991, "learning_rate": 1.2011922249267662e-05, "loss": 0.4399, "num_input_tokens_seen": 55082376, "step": 94870 }, { "epoch": 14.130920464700626, "grad_norm": 3.924842119216919, "learning_rate": 1.2009145870029592e-05, "loss": 0.93, "num_input_tokens_seen": 55084936, "step": 94875 }, { "epoch": 14.131665177241585, "grad_norm": 1.7810723781585693, "learning_rate": 1.200636971025879e-05, "loss": 0.5506, "num_input_tokens_seen": 55087688, "step": 94880 }, { "epoch": 14.132409889782544, "grad_norm": 1.6918165683746338, "learning_rate": 1.2003593770002169e-05, "loss": 0.5383, "num_input_tokens_seen": 55090632, "step": 94885 }, { "epoch": 14.133154602323502, "grad_norm": 0.8262726664543152, "learning_rate": 1.2000818049306628e-05, "loss": 0.5675, "num_input_tokens_seen": 55093928, "step": 94890 }, { "epoch": 14.133899314864463, "grad_norm": 2.1452488899230957, "learning_rate": 1.1998042548219052e-05, "loss": 0.707, "num_input_tokens_seen": 55096712, "step": 94895 }, { "epoch": 14.134644027405422, "grad_norm": 1.0336769819259644, "learning_rate": 1.1995267266786325e-05, "loss": 0.4852, "num_input_tokens_seen": 55099624, "step": 94900 }, { "epoch": 14.13538873994638, "grad_norm": 1.7168601751327515, "learning_rate": 1.1992492205055347e-05, "loss": 0.5295, "num_input_tokens_seen": 55102760, "step": 94905 }, { "epoch": 14.13613345248734, "grad_norm": 0.6144278645515442, "learning_rate": 1.1989717363072986e-05, "loss": 0.4689, "num_input_tokens_seen": 55105640, "step": 94910 }, { "epoch": 14.1368781650283, "grad_norm": 1.329433798789978, "learning_rate": 1.1986942740886135e-05, "loss": 0.6228, "num_input_tokens_seen": 55108200, "step": 94915 }, { "epoch": 14.137622877569259, "grad_norm": 1.281314730644226, "learning_rate": 1.198416833854166e-05, "loss": 0.6318, "num_input_tokens_seen": 55111176, "step": 94920 }, { "epoch": 14.138367590110217, "grad_norm": 4.162695407867432, "learning_rate": 1.1981394156086423e-05, "loss": 0.8237, "num_input_tokens_seen": 55114088, "step": 94925 }, { "epoch": 14.139112302651176, "grad_norm": 2.183384656906128, "learning_rate": 1.197862019356731e-05, "loss": 0.6135, "num_input_tokens_seen": 55117128, "step": 94930 }, { "epoch": 14.139857015192137, "grad_norm": 0.614247739315033, "learning_rate": 1.1975846451031167e-05, "loss": 0.4903, "num_input_tokens_seen": 55120360, "step": 94935 }, { "epoch": 14.140601727733095, "grad_norm": 2.158259153366089, "learning_rate": 1.1973072928524868e-05, "loss": 0.6862, "num_input_tokens_seen": 55123240, "step": 94940 }, { "epoch": 14.141346440274054, "grad_norm": 1.3216617107391357, "learning_rate": 1.1970299626095252e-05, "loss": 0.4783, "num_input_tokens_seen": 55126120, "step": 94945 }, { "epoch": 14.142091152815013, "grad_norm": 1.308974266052246, "learning_rate": 1.1967526543789192e-05, "loss": 0.7023, "num_input_tokens_seen": 55129128, "step": 94950 }, { "epoch": 14.142835865355973, "grad_norm": 2.0952162742614746, "learning_rate": 1.1964753681653526e-05, "loss": 0.6929, "num_input_tokens_seen": 55132232, "step": 94955 }, { "epoch": 14.143580577896932, "grad_norm": 0.9562191963195801, "learning_rate": 1.1961981039735096e-05, "loss": 0.5387, "num_input_tokens_seen": 55134920, "step": 94960 }, { "epoch": 14.14432529043789, "grad_norm": 1.8230922222137451, "learning_rate": 1.1959208618080747e-05, "loss": 0.598, "num_input_tokens_seen": 55137832, "step": 94965 }, { "epoch": 14.14507000297885, "grad_norm": 2.1797187328338623, "learning_rate": 1.1956436416737304e-05, "loss": 0.5078, "num_input_tokens_seen": 55140616, "step": 94970 }, { "epoch": 14.14581471551981, "grad_norm": 2.280991554260254, "learning_rate": 1.1953664435751621e-05, "loss": 0.723, "num_input_tokens_seen": 55143528, "step": 94975 }, { "epoch": 14.146559428060769, "grad_norm": 1.0297651290893555, "learning_rate": 1.1950892675170509e-05, "loss": 0.7158, "num_input_tokens_seen": 55146568, "step": 94980 }, { "epoch": 14.147304140601728, "grad_norm": 2.2706263065338135, "learning_rate": 1.194812113504081e-05, "loss": 0.3706, "num_input_tokens_seen": 55149256, "step": 94985 }, { "epoch": 14.148048853142686, "grad_norm": 1.6978312730789185, "learning_rate": 1.194534981540933e-05, "loss": 0.6705, "num_input_tokens_seen": 55152392, "step": 94990 }, { "epoch": 14.148793565683647, "grad_norm": 1.866043210029602, "learning_rate": 1.1942578716322905e-05, "loss": 0.5591, "num_input_tokens_seen": 55154952, "step": 94995 }, { "epoch": 14.149538278224606, "grad_norm": 2.6867928504943848, "learning_rate": 1.1939807837828345e-05, "loss": 0.712, "num_input_tokens_seen": 55157736, "step": 95000 }, { "epoch": 14.150282990765565, "grad_norm": 1.8004176616668701, "learning_rate": 1.1937037179972447e-05, "loss": 0.5694, "num_input_tokens_seen": 55160808, "step": 95005 }, { "epoch": 14.151027703306523, "grad_norm": 1.7217456102371216, "learning_rate": 1.1934266742802039e-05, "loss": 0.5437, "num_input_tokens_seen": 55163368, "step": 95010 }, { "epoch": 14.151772415847482, "grad_norm": 2.285087823867798, "learning_rate": 1.1931496526363903e-05, "loss": 0.5171, "num_input_tokens_seen": 55166088, "step": 95015 }, { "epoch": 14.152517128388443, "grad_norm": 1.4483801126480103, "learning_rate": 1.1928726530704862e-05, "loss": 0.5568, "num_input_tokens_seen": 55168776, "step": 95020 }, { "epoch": 14.153261840929401, "grad_norm": 1.1515793800354004, "learning_rate": 1.1925956755871703e-05, "loss": 0.4897, "num_input_tokens_seen": 55171560, "step": 95025 }, { "epoch": 14.15400655347036, "grad_norm": 1.2453570365905762, "learning_rate": 1.1923187201911215e-05, "loss": 0.3039, "num_input_tokens_seen": 55174536, "step": 95030 }, { "epoch": 14.154751266011319, "grad_norm": 2.163743734359741, "learning_rate": 1.1920417868870187e-05, "loss": 0.6568, "num_input_tokens_seen": 55177608, "step": 95035 }, { "epoch": 14.15549597855228, "grad_norm": 1.6281417608261108, "learning_rate": 1.1917648756795399e-05, "loss": 0.5336, "num_input_tokens_seen": 55180264, "step": 95040 }, { "epoch": 14.156240691093238, "grad_norm": 1.3160855770111084, "learning_rate": 1.1914879865733647e-05, "loss": 0.4942, "num_input_tokens_seen": 55183240, "step": 95045 }, { "epoch": 14.156985403634197, "grad_norm": 0.9436250925064087, "learning_rate": 1.1912111195731693e-05, "loss": 0.6259, "num_input_tokens_seen": 55186056, "step": 95050 }, { "epoch": 14.157730116175156, "grad_norm": 1.0026977062225342, "learning_rate": 1.1909342746836325e-05, "loss": 0.5109, "num_input_tokens_seen": 55189064, "step": 95055 }, { "epoch": 14.158474828716116, "grad_norm": 2.3881704807281494, "learning_rate": 1.1906574519094299e-05, "loss": 0.8065, "num_input_tokens_seen": 55191624, "step": 95060 }, { "epoch": 14.159219541257075, "grad_norm": 1.4594275951385498, "learning_rate": 1.1903806512552395e-05, "loss": 0.5766, "num_input_tokens_seen": 55194696, "step": 95065 }, { "epoch": 14.159964253798034, "grad_norm": 2.885185718536377, "learning_rate": 1.1901038727257366e-05, "loss": 0.5442, "num_input_tokens_seen": 55197800, "step": 95070 }, { "epoch": 14.160708966338992, "grad_norm": 1.076954960823059, "learning_rate": 1.189827116325598e-05, "loss": 0.3896, "num_input_tokens_seen": 55200616, "step": 95075 }, { "epoch": 14.161453678879953, "grad_norm": 0.7491856217384338, "learning_rate": 1.1895503820594985e-05, "loss": 0.4903, "num_input_tokens_seen": 55203688, "step": 95080 }, { "epoch": 14.162198391420912, "grad_norm": 1.7264710664749146, "learning_rate": 1.189273669932113e-05, "loss": 0.7102, "num_input_tokens_seen": 55206504, "step": 95085 }, { "epoch": 14.16294310396187, "grad_norm": 0.9427440762519836, "learning_rate": 1.1889969799481173e-05, "loss": 0.5035, "num_input_tokens_seen": 55209384, "step": 95090 }, { "epoch": 14.16368781650283, "grad_norm": 1.6315786838531494, "learning_rate": 1.1887203121121851e-05, "loss": 0.4823, "num_input_tokens_seen": 55212168, "step": 95095 }, { "epoch": 14.16443252904379, "grad_norm": 2.4165585041046143, "learning_rate": 1.1884436664289908e-05, "loss": 0.6195, "num_input_tokens_seen": 55215048, "step": 95100 }, { "epoch": 14.165177241584749, "grad_norm": 1.1541330814361572, "learning_rate": 1.1881670429032066e-05, "loss": 0.4798, "num_input_tokens_seen": 55217896, "step": 95105 }, { "epoch": 14.165921954125707, "grad_norm": 1.3405442237854004, "learning_rate": 1.1878904415395078e-05, "loss": 0.5314, "num_input_tokens_seen": 55220904, "step": 95110 }, { "epoch": 14.166666666666666, "grad_norm": 2.307105541229248, "learning_rate": 1.1876138623425667e-05, "loss": 0.6011, "num_input_tokens_seen": 55223784, "step": 95115 }, { "epoch": 14.167411379207627, "grad_norm": 1.5293552875518799, "learning_rate": 1.1873373053170545e-05, "loss": 0.6394, "num_input_tokens_seen": 55226600, "step": 95120 }, { "epoch": 14.168156091748585, "grad_norm": 0.9016022086143494, "learning_rate": 1.187060770467645e-05, "loss": 0.4506, "num_input_tokens_seen": 55229640, "step": 95125 }, { "epoch": 14.168900804289544, "grad_norm": 1.72409987449646, "learning_rate": 1.1867842577990087e-05, "loss": 0.7596, "num_input_tokens_seen": 55232680, "step": 95130 }, { "epoch": 14.169645516830503, "grad_norm": 3.659506320953369, "learning_rate": 1.1865077673158188e-05, "loss": 0.8391, "num_input_tokens_seen": 55235624, "step": 95135 }, { "epoch": 14.170390229371463, "grad_norm": 1.4235343933105469, "learning_rate": 1.186231299022744e-05, "loss": 0.4988, "num_input_tokens_seen": 55238536, "step": 95140 }, { "epoch": 14.171134941912422, "grad_norm": 1.9309223890304565, "learning_rate": 1.1859548529244571e-05, "loss": 0.8113, "num_input_tokens_seen": 55242088, "step": 95145 }, { "epoch": 14.171879654453381, "grad_norm": 2.0399763584136963, "learning_rate": 1.1856784290256276e-05, "loss": 0.5101, "num_input_tokens_seen": 55245128, "step": 95150 }, { "epoch": 14.17262436699434, "grad_norm": 3.1226749420166016, "learning_rate": 1.1854020273309241e-05, "loss": 0.5839, "num_input_tokens_seen": 55247880, "step": 95155 }, { "epoch": 14.1733690795353, "grad_norm": 2.435173511505127, "learning_rate": 1.1851256478450181e-05, "loss": 0.4853, "num_input_tokens_seen": 55250952, "step": 95160 }, { "epoch": 14.174113792076259, "grad_norm": 1.3143693208694458, "learning_rate": 1.1848492905725781e-05, "loss": 0.5537, "num_input_tokens_seen": 55253800, "step": 95165 }, { "epoch": 14.174858504617218, "grad_norm": 1.6512593030929565, "learning_rate": 1.1845729555182728e-05, "loss": 0.7156, "num_input_tokens_seen": 55256712, "step": 95170 }, { "epoch": 14.175603217158177, "grad_norm": 1.9943650960922241, "learning_rate": 1.1842966426867694e-05, "loss": 0.5906, "num_input_tokens_seen": 55259528, "step": 95175 }, { "epoch": 14.176347929699135, "grad_norm": 1.6469913721084595, "learning_rate": 1.1840203520827378e-05, "loss": 0.5522, "num_input_tokens_seen": 55262440, "step": 95180 }, { "epoch": 14.177092642240096, "grad_norm": 2.214393377304077, "learning_rate": 1.183744083710844e-05, "loss": 0.5652, "num_input_tokens_seen": 55265128, "step": 95185 }, { "epoch": 14.177837354781055, "grad_norm": 2.048906087875366, "learning_rate": 1.1834678375757571e-05, "loss": 0.5254, "num_input_tokens_seen": 55268040, "step": 95190 }, { "epoch": 14.178582067322013, "grad_norm": 1.4262560606002808, "learning_rate": 1.183191613682143e-05, "loss": 0.4363, "num_input_tokens_seen": 55270888, "step": 95195 }, { "epoch": 14.179326779862972, "grad_norm": 1.0354583263397217, "learning_rate": 1.1829154120346673e-05, "loss": 0.4837, "num_input_tokens_seen": 55273608, "step": 95200 }, { "epoch": 14.180071492403933, "grad_norm": 1.244509220123291, "learning_rate": 1.1826392326379981e-05, "loss": 0.6033, "num_input_tokens_seen": 55276840, "step": 95205 }, { "epoch": 14.180816204944891, "grad_norm": 1.4074656963348389, "learning_rate": 1.1823630754967991e-05, "loss": 0.4417, "num_input_tokens_seen": 55279688, "step": 95210 }, { "epoch": 14.18156091748585, "grad_norm": 2.2909140586853027, "learning_rate": 1.1820869406157378e-05, "loss": 0.6929, "num_input_tokens_seen": 55282664, "step": 95215 }, { "epoch": 14.182305630026809, "grad_norm": 1.2786189317703247, "learning_rate": 1.181810827999478e-05, "loss": 0.5253, "num_input_tokens_seen": 55285640, "step": 95220 }, { "epoch": 14.18305034256777, "grad_norm": 2.4651482105255127, "learning_rate": 1.1815347376526847e-05, "loss": 0.6875, "num_input_tokens_seen": 55288648, "step": 95225 }, { "epoch": 14.183795055108728, "grad_norm": 3.051565408706665, "learning_rate": 1.181258669580021e-05, "loss": 0.5574, "num_input_tokens_seen": 55291880, "step": 95230 }, { "epoch": 14.184539767649687, "grad_norm": 0.9931440949440002, "learning_rate": 1.1809826237861527e-05, "loss": 0.5866, "num_input_tokens_seen": 55294984, "step": 95235 }, { "epoch": 14.185284480190646, "grad_norm": 1.8721792697906494, "learning_rate": 1.1807066002757422e-05, "loss": 0.5954, "num_input_tokens_seen": 55297992, "step": 95240 }, { "epoch": 14.186029192731606, "grad_norm": 1.3124498128890991, "learning_rate": 1.180430599053452e-05, "loss": 0.4053, "num_input_tokens_seen": 55301096, "step": 95245 }, { "epoch": 14.186773905272565, "grad_norm": 1.0672438144683838, "learning_rate": 1.1801546201239466e-05, "loss": 0.6319, "num_input_tokens_seen": 55304072, "step": 95250 }, { "epoch": 14.187518617813524, "grad_norm": 0.9422134757041931, "learning_rate": 1.1798786634918868e-05, "loss": 0.4833, "num_input_tokens_seen": 55307272, "step": 95255 }, { "epoch": 14.188263330354483, "grad_norm": 2.6282474994659424, "learning_rate": 1.1796027291619358e-05, "loss": 0.7196, "num_input_tokens_seen": 55310216, "step": 95260 }, { "epoch": 14.189008042895443, "grad_norm": 1.2373809814453125, "learning_rate": 1.1793268171387539e-05, "loss": 0.604, "num_input_tokens_seen": 55313000, "step": 95265 }, { "epoch": 14.189752755436402, "grad_norm": 1.6943013668060303, "learning_rate": 1.1790509274270042e-05, "loss": 0.611, "num_input_tokens_seen": 55316232, "step": 95270 }, { "epoch": 14.19049746797736, "grad_norm": 2.2361576557159424, "learning_rate": 1.1787750600313465e-05, "loss": 0.7887, "num_input_tokens_seen": 55319432, "step": 95275 }, { "epoch": 14.19124218051832, "grad_norm": 1.6340546607971191, "learning_rate": 1.1784992149564403e-05, "loss": 0.5597, "num_input_tokens_seen": 55322248, "step": 95280 }, { "epoch": 14.19198689305928, "grad_norm": 0.7860987782478333, "learning_rate": 1.1782233922069478e-05, "loss": 0.4978, "num_input_tokens_seen": 55325224, "step": 95285 }, { "epoch": 14.192731605600239, "grad_norm": 1.333762764930725, "learning_rate": 1.1779475917875278e-05, "loss": 0.51, "num_input_tokens_seen": 55328072, "step": 95290 }, { "epoch": 14.193476318141197, "grad_norm": 1.3017650842666626, "learning_rate": 1.1776718137028392e-05, "loss": 0.6675, "num_input_tokens_seen": 55330952, "step": 95295 }, { "epoch": 14.194221030682156, "grad_norm": 1.3095794916152954, "learning_rate": 1.1773960579575408e-05, "loss": 0.6723, "num_input_tokens_seen": 55333768, "step": 95300 }, { "epoch": 14.194965743223117, "grad_norm": 1.6949020624160767, "learning_rate": 1.1771203245562924e-05, "loss": 0.5432, "num_input_tokens_seen": 55336680, "step": 95305 }, { "epoch": 14.195710455764075, "grad_norm": 1.1132290363311768, "learning_rate": 1.176844613503751e-05, "loss": 0.4919, "num_input_tokens_seen": 55339496, "step": 95310 }, { "epoch": 14.196455168305034, "grad_norm": 1.3750792741775513, "learning_rate": 1.1765689248045755e-05, "loss": 0.5227, "num_input_tokens_seen": 55342440, "step": 95315 }, { "epoch": 14.197199880845993, "grad_norm": 1.3985600471496582, "learning_rate": 1.1762932584634234e-05, "loss": 0.5671, "num_input_tokens_seen": 55345384, "step": 95320 }, { "epoch": 14.197944593386953, "grad_norm": 1.425645351409912, "learning_rate": 1.1760176144849502e-05, "loss": 0.6947, "num_input_tokens_seen": 55348776, "step": 95325 }, { "epoch": 14.198689305927912, "grad_norm": 1.4146170616149902, "learning_rate": 1.1757419928738147e-05, "loss": 0.5941, "num_input_tokens_seen": 55351624, "step": 95330 }, { "epoch": 14.199434018468871, "grad_norm": 1.5847076177597046, "learning_rate": 1.1754663936346713e-05, "loss": 0.5001, "num_input_tokens_seen": 55354536, "step": 95335 }, { "epoch": 14.20017873100983, "grad_norm": 3.1546213626861572, "learning_rate": 1.1751908167721782e-05, "loss": 0.5543, "num_input_tokens_seen": 55357320, "step": 95340 }, { "epoch": 14.200923443550789, "grad_norm": 0.9658360481262207, "learning_rate": 1.1749152622909884e-05, "loss": 0.3204, "num_input_tokens_seen": 55360136, "step": 95345 }, { "epoch": 14.201668156091749, "grad_norm": 1.4058308601379395, "learning_rate": 1.1746397301957598e-05, "loss": 0.5986, "num_input_tokens_seen": 55363240, "step": 95350 }, { "epoch": 14.202412868632708, "grad_norm": 1.1633714437484741, "learning_rate": 1.174364220491146e-05, "loss": 0.6133, "num_input_tokens_seen": 55366312, "step": 95355 }, { "epoch": 14.203157581173667, "grad_norm": 2.706784963607788, "learning_rate": 1.1740887331818009e-05, "loss": 0.7159, "num_input_tokens_seen": 55369480, "step": 95360 }, { "epoch": 14.203902293714625, "grad_norm": 1.1013526916503906, "learning_rate": 1.1738132682723797e-05, "loss": 0.5557, "num_input_tokens_seen": 55372392, "step": 95365 }, { "epoch": 14.204647006255586, "grad_norm": 1.4663362503051758, "learning_rate": 1.1735378257675338e-05, "loss": 0.6449, "num_input_tokens_seen": 55375240, "step": 95370 }, { "epoch": 14.205391718796545, "grad_norm": 1.6118111610412598, "learning_rate": 1.1732624056719197e-05, "loss": 0.4894, "num_input_tokens_seen": 55377928, "step": 95375 }, { "epoch": 14.206136431337503, "grad_norm": 1.8320685625076294, "learning_rate": 1.1729870079901875e-05, "loss": 0.6459, "num_input_tokens_seen": 55380840, "step": 95380 }, { "epoch": 14.206881143878462, "grad_norm": 1.1843079328536987, "learning_rate": 1.1727116327269924e-05, "loss": 0.5152, "num_input_tokens_seen": 55383560, "step": 95385 }, { "epoch": 14.207625856419423, "grad_norm": 1.5713868141174316, "learning_rate": 1.172436279886984e-05, "loss": 0.6797, "num_input_tokens_seen": 55386216, "step": 95390 }, { "epoch": 14.208370568960381, "grad_norm": 1.935257911682129, "learning_rate": 1.1721609494748164e-05, "loss": 0.6757, "num_input_tokens_seen": 55389000, "step": 95395 }, { "epoch": 14.20911528150134, "grad_norm": 3.8072891235351562, "learning_rate": 1.1718856414951402e-05, "loss": 0.9087, "num_input_tokens_seen": 55392136, "step": 95400 }, { "epoch": 14.209859994042299, "grad_norm": 1.0775388479232788, "learning_rate": 1.1716103559526051e-05, "loss": 0.506, "num_input_tokens_seen": 55395208, "step": 95405 }, { "epoch": 14.21060470658326, "grad_norm": 1.1492736339569092, "learning_rate": 1.1713350928518639e-05, "loss": 0.5758, "num_input_tokens_seen": 55398056, "step": 95410 }, { "epoch": 14.211349419124218, "grad_norm": 3.1391141414642334, "learning_rate": 1.171059852197565e-05, "loss": 0.5001, "num_input_tokens_seen": 55401160, "step": 95415 }, { "epoch": 14.212094131665177, "grad_norm": 1.968984842300415, "learning_rate": 1.1707846339943601e-05, "loss": 0.6191, "num_input_tokens_seen": 55403880, "step": 95420 }, { "epoch": 14.212838844206136, "grad_norm": 1.7501089572906494, "learning_rate": 1.1705094382468979e-05, "loss": 0.6001, "num_input_tokens_seen": 55407112, "step": 95425 }, { "epoch": 14.213583556747096, "grad_norm": 1.678762435913086, "learning_rate": 1.1702342649598274e-05, "loss": 0.5776, "num_input_tokens_seen": 55410024, "step": 95430 }, { "epoch": 14.214328269288055, "grad_norm": 1.6322212219238281, "learning_rate": 1.1699591141377967e-05, "loss": 0.6069, "num_input_tokens_seen": 55412744, "step": 95435 }, { "epoch": 14.215072981829014, "grad_norm": 2.6349728107452393, "learning_rate": 1.1696839857854558e-05, "loss": 0.422, "num_input_tokens_seen": 55415976, "step": 95440 }, { "epoch": 14.215817694369973, "grad_norm": 3.0771727561950684, "learning_rate": 1.169408879907452e-05, "loss": 0.6382, "num_input_tokens_seen": 55419144, "step": 95445 }, { "epoch": 14.216562406910933, "grad_norm": 1.8681752681732178, "learning_rate": 1.1691337965084321e-05, "loss": 0.4193, "num_input_tokens_seen": 55422248, "step": 95450 }, { "epoch": 14.217307119451892, "grad_norm": 1.1638083457946777, "learning_rate": 1.1688587355930444e-05, "loss": 0.5975, "num_input_tokens_seen": 55425064, "step": 95455 }, { "epoch": 14.21805183199285, "grad_norm": 1.741492509841919, "learning_rate": 1.168583697165935e-05, "loss": 0.5649, "num_input_tokens_seen": 55427848, "step": 95460 }, { "epoch": 14.21879654453381, "grad_norm": 1.0625425577163696, "learning_rate": 1.1683086812317517e-05, "loss": 0.5212, "num_input_tokens_seen": 55430536, "step": 95465 }, { "epoch": 14.21954125707477, "grad_norm": 1.7134279012680054, "learning_rate": 1.1680336877951387e-05, "loss": 0.6068, "num_input_tokens_seen": 55433416, "step": 95470 }, { "epoch": 14.220285969615729, "grad_norm": 1.5653561353683472, "learning_rate": 1.1677587168607437e-05, "loss": 0.5535, "num_input_tokens_seen": 55436264, "step": 95475 }, { "epoch": 14.221030682156687, "grad_norm": 1.0711746215820312, "learning_rate": 1.1674837684332113e-05, "loss": 0.5577, "num_input_tokens_seen": 55439144, "step": 95480 }, { "epoch": 14.221775394697646, "grad_norm": 1.697623610496521, "learning_rate": 1.1672088425171854e-05, "loss": 0.546, "num_input_tokens_seen": 55442248, "step": 95485 }, { "epoch": 14.222520107238607, "grad_norm": 1.7044572830200195, "learning_rate": 1.1669339391173122e-05, "loss": 0.5855, "num_input_tokens_seen": 55445000, "step": 95490 }, { "epoch": 14.223264819779565, "grad_norm": 1.3949284553527832, "learning_rate": 1.1666590582382355e-05, "loss": 0.631, "num_input_tokens_seen": 55447944, "step": 95495 }, { "epoch": 14.224009532320524, "grad_norm": 2.6088359355926514, "learning_rate": 1.166384199884599e-05, "loss": 0.7202, "num_input_tokens_seen": 55450664, "step": 95500 }, { "epoch": 14.224754244861483, "grad_norm": 1.1697171926498413, "learning_rate": 1.1661093640610445e-05, "loss": 0.4426, "num_input_tokens_seen": 55453544, "step": 95505 }, { "epoch": 14.225498957402444, "grad_norm": 2.0415828227996826, "learning_rate": 1.1658345507722182e-05, "loss": 0.6988, "num_input_tokens_seen": 55456360, "step": 95510 }, { "epoch": 14.226243669943402, "grad_norm": 1.7690197229385376, "learning_rate": 1.1655597600227597e-05, "loss": 0.4604, "num_input_tokens_seen": 55459272, "step": 95515 }, { "epoch": 14.226988382484361, "grad_norm": 1.3846049308776855, "learning_rate": 1.1652849918173139e-05, "loss": 0.6489, "num_input_tokens_seen": 55462024, "step": 95520 }, { "epoch": 14.22773309502532, "grad_norm": 1.5189059972763062, "learning_rate": 1.165010246160522e-05, "loss": 0.6904, "num_input_tokens_seen": 55464936, "step": 95525 }, { "epoch": 14.228477807566279, "grad_norm": 3.164090156555176, "learning_rate": 1.1647355230570237e-05, "loss": 0.5018, "num_input_tokens_seen": 55468200, "step": 95530 }, { "epoch": 14.229222520107239, "grad_norm": 1.4684966802597046, "learning_rate": 1.1644608225114629e-05, "loss": 0.3437, "num_input_tokens_seen": 55470952, "step": 95535 }, { "epoch": 14.229967232648198, "grad_norm": 1.2112923860549927, "learning_rate": 1.164186144528478e-05, "loss": 0.6401, "num_input_tokens_seen": 55473896, "step": 95540 }, { "epoch": 14.230711945189157, "grad_norm": 1.0429970026016235, "learning_rate": 1.1639114891127114e-05, "loss": 0.7539, "num_input_tokens_seen": 55476744, "step": 95545 }, { "epoch": 14.231456657730115, "grad_norm": 2.075679302215576, "learning_rate": 1.1636368562688024e-05, "loss": 0.6835, "num_input_tokens_seen": 55479752, "step": 95550 }, { "epoch": 14.232201370271076, "grad_norm": 1.6393210887908936, "learning_rate": 1.1633622460013904e-05, "loss": 0.5714, "num_input_tokens_seen": 55482536, "step": 95555 }, { "epoch": 14.232946082812035, "grad_norm": 1.287261724472046, "learning_rate": 1.163087658315114e-05, "loss": 0.5484, "num_input_tokens_seen": 55485512, "step": 95560 }, { "epoch": 14.233690795352993, "grad_norm": 1.590603232383728, "learning_rate": 1.1628130932146137e-05, "loss": 0.6346, "num_input_tokens_seen": 55488072, "step": 95565 }, { "epoch": 14.234435507893952, "grad_norm": 1.0196716785430908, "learning_rate": 1.1625385507045272e-05, "loss": 0.6388, "num_input_tokens_seen": 55491016, "step": 95570 }, { "epoch": 14.235180220434913, "grad_norm": 0.8704013228416443, "learning_rate": 1.1622640307894913e-05, "loss": 0.6339, "num_input_tokens_seen": 55494248, "step": 95575 }, { "epoch": 14.235924932975871, "grad_norm": 2.6356582641601562, "learning_rate": 1.1619895334741463e-05, "loss": 0.7409, "num_input_tokens_seen": 55497032, "step": 95580 }, { "epoch": 14.23666964551683, "grad_norm": 1.363977313041687, "learning_rate": 1.161715058763127e-05, "loss": 0.5012, "num_input_tokens_seen": 55499752, "step": 95585 }, { "epoch": 14.237414358057789, "grad_norm": 1.7284424304962158, "learning_rate": 1.1614406066610728e-05, "loss": 0.7513, "num_input_tokens_seen": 55502760, "step": 95590 }, { "epoch": 14.23815907059875, "grad_norm": 1.7594459056854248, "learning_rate": 1.1611661771726181e-05, "loss": 0.6502, "num_input_tokens_seen": 55505672, "step": 95595 }, { "epoch": 14.238903783139708, "grad_norm": 1.4531511068344116, "learning_rate": 1.1608917703024009e-05, "loss": 0.3961, "num_input_tokens_seen": 55508520, "step": 95600 }, { "epoch": 14.239648495680667, "grad_norm": 2.8164265155792236, "learning_rate": 1.1606173860550562e-05, "loss": 0.6993, "num_input_tokens_seen": 55511112, "step": 95605 }, { "epoch": 14.240393208221626, "grad_norm": 1.9053220748901367, "learning_rate": 1.1603430244352187e-05, "loss": 0.5035, "num_input_tokens_seen": 55513896, "step": 95610 }, { "epoch": 14.241137920762586, "grad_norm": 1.713420033454895, "learning_rate": 1.160068685447525e-05, "loss": 0.5156, "num_input_tokens_seen": 55516744, "step": 95615 }, { "epoch": 14.241882633303545, "grad_norm": 1.232511043548584, "learning_rate": 1.1597943690966092e-05, "loss": 0.4439, "num_input_tokens_seen": 55519624, "step": 95620 }, { "epoch": 14.242627345844504, "grad_norm": 2.1046037673950195, "learning_rate": 1.1595200753871055e-05, "loss": 0.6656, "num_input_tokens_seen": 55522600, "step": 95625 }, { "epoch": 14.243372058385463, "grad_norm": 1.4841289520263672, "learning_rate": 1.1592458043236468e-05, "loss": 0.5963, "num_input_tokens_seen": 55525544, "step": 95630 }, { "epoch": 14.244116770926423, "grad_norm": 1.4828135967254639, "learning_rate": 1.1589715559108682e-05, "loss": 0.5815, "num_input_tokens_seen": 55528520, "step": 95635 }, { "epoch": 14.244861483467382, "grad_norm": 1.396789312362671, "learning_rate": 1.1586973301534024e-05, "loss": 0.8104, "num_input_tokens_seen": 55531624, "step": 95640 }, { "epoch": 14.24560619600834, "grad_norm": 1.7724076509475708, "learning_rate": 1.158423127055881e-05, "loss": 0.5513, "num_input_tokens_seen": 55534472, "step": 95645 }, { "epoch": 14.2463509085493, "grad_norm": 4.299491882324219, "learning_rate": 1.1581489466229381e-05, "loss": 0.6316, "num_input_tokens_seen": 55537608, "step": 95650 }, { "epoch": 14.24709562109026, "grad_norm": 1.2924909591674805, "learning_rate": 1.1578747888592043e-05, "loss": 0.6709, "num_input_tokens_seen": 55540520, "step": 95655 }, { "epoch": 14.247840333631219, "grad_norm": 1.5886982679367065, "learning_rate": 1.1576006537693127e-05, "loss": 0.5196, "num_input_tokens_seen": 55543240, "step": 95660 }, { "epoch": 14.248585046172177, "grad_norm": 1.71322762966156, "learning_rate": 1.1573265413578926e-05, "loss": 0.6314, "num_input_tokens_seen": 55546120, "step": 95665 }, { "epoch": 14.249329758713136, "grad_norm": 1.4639593362808228, "learning_rate": 1.1570524516295773e-05, "loss": 0.5959, "num_input_tokens_seen": 55549000, "step": 95670 }, { "epoch": 14.250074471254097, "grad_norm": 0.9808644652366638, "learning_rate": 1.1567783845889946e-05, "loss": 0.5973, "num_input_tokens_seen": 55551624, "step": 95675 }, { "epoch": 14.250819183795056, "grad_norm": 3.892733097076416, "learning_rate": 1.1565043402407768e-05, "loss": 0.783, "num_input_tokens_seen": 55554280, "step": 95680 }, { "epoch": 14.251563896336014, "grad_norm": 1.008992314338684, "learning_rate": 1.1562303185895528e-05, "loss": 0.5516, "num_input_tokens_seen": 55557288, "step": 95685 }, { "epoch": 14.252308608876973, "grad_norm": 2.0851962566375732, "learning_rate": 1.155956319639952e-05, "loss": 0.501, "num_input_tokens_seen": 55559944, "step": 95690 }, { "epoch": 14.253053321417934, "grad_norm": 1.4472897052764893, "learning_rate": 1.155682343396603e-05, "loss": 0.6623, "num_input_tokens_seen": 55563208, "step": 95695 }, { "epoch": 14.253798033958892, "grad_norm": 2.328615665435791, "learning_rate": 1.1554083898641335e-05, "loss": 0.7091, "num_input_tokens_seen": 55565896, "step": 95700 }, { "epoch": 14.254542746499851, "grad_norm": 1.7568789720535278, "learning_rate": 1.1551344590471739e-05, "loss": 0.4884, "num_input_tokens_seen": 55568776, "step": 95705 }, { "epoch": 14.25528745904081, "grad_norm": 0.8942344784736633, "learning_rate": 1.1548605509503496e-05, "loss": 0.4519, "num_input_tokens_seen": 55571688, "step": 95710 }, { "epoch": 14.256032171581769, "grad_norm": 1.056700587272644, "learning_rate": 1.15458666557829e-05, "loss": 0.6434, "num_input_tokens_seen": 55574568, "step": 95715 }, { "epoch": 14.25677688412273, "grad_norm": 1.9646540880203247, "learning_rate": 1.1543128029356215e-05, "loss": 0.6185, "num_input_tokens_seen": 55577448, "step": 95720 }, { "epoch": 14.257521596663688, "grad_norm": 1.1657425165176392, "learning_rate": 1.1540389630269693e-05, "loss": 0.6441, "num_input_tokens_seen": 55580136, "step": 95725 }, { "epoch": 14.258266309204647, "grad_norm": 1.6062973737716675, "learning_rate": 1.153765145856962e-05, "loss": 0.5367, "num_input_tokens_seen": 55583048, "step": 95730 }, { "epoch": 14.259011021745605, "grad_norm": 1.3874571323394775, "learning_rate": 1.1534913514302232e-05, "loss": 0.6226, "num_input_tokens_seen": 55585928, "step": 95735 }, { "epoch": 14.259755734286566, "grad_norm": 2.205305814743042, "learning_rate": 1.1532175797513806e-05, "loss": 0.5199, "num_input_tokens_seen": 55588872, "step": 95740 }, { "epoch": 14.260500446827525, "grad_norm": 1.3007755279541016, "learning_rate": 1.152943830825057e-05, "loss": 0.4608, "num_input_tokens_seen": 55592104, "step": 95745 }, { "epoch": 14.261245159368483, "grad_norm": 2.6077473163604736, "learning_rate": 1.1526701046558794e-05, "loss": 0.486, "num_input_tokens_seen": 55594600, "step": 95750 }, { "epoch": 14.261989871909442, "grad_norm": 2.496675968170166, "learning_rate": 1.1523964012484712e-05, "loss": 0.8124, "num_input_tokens_seen": 55597704, "step": 95755 }, { "epoch": 14.262734584450403, "grad_norm": 2.491593360900879, "learning_rate": 1.1521227206074559e-05, "loss": 0.5957, "num_input_tokens_seen": 55600424, "step": 95760 }, { "epoch": 14.263479296991362, "grad_norm": 1.7199972867965698, "learning_rate": 1.1518490627374572e-05, "loss": 0.455, "num_input_tokens_seen": 55603336, "step": 95765 }, { "epoch": 14.26422400953232, "grad_norm": 1.3764550685882568, "learning_rate": 1.151575427643098e-05, "loss": 0.7052, "num_input_tokens_seen": 55606376, "step": 95770 }, { "epoch": 14.264968722073279, "grad_norm": 2.019444704055786, "learning_rate": 1.1513018153290018e-05, "loss": 0.44, "num_input_tokens_seen": 55609256, "step": 95775 }, { "epoch": 14.26571343461424, "grad_norm": 1.3021854162216187, "learning_rate": 1.15102822579979e-05, "loss": 0.7004, "num_input_tokens_seen": 55612104, "step": 95780 }, { "epoch": 14.266458147155198, "grad_norm": 2.405012369155884, "learning_rate": 1.1507546590600862e-05, "loss": 0.7983, "num_input_tokens_seen": 55614984, "step": 95785 }, { "epoch": 14.267202859696157, "grad_norm": 1.1957696676254272, "learning_rate": 1.15048111511451e-05, "loss": 0.6407, "num_input_tokens_seen": 55618376, "step": 95790 }, { "epoch": 14.267947572237116, "grad_norm": 3.436708927154541, "learning_rate": 1.1502075939676852e-05, "loss": 0.553, "num_input_tokens_seen": 55621256, "step": 95795 }, { "epoch": 14.268692284778076, "grad_norm": 1.5073751211166382, "learning_rate": 1.1499340956242307e-05, "loss": 0.3218, "num_input_tokens_seen": 55624136, "step": 95800 }, { "epoch": 14.269436997319035, "grad_norm": 3.103426218032837, "learning_rate": 1.1496606200887669e-05, "loss": 0.6876, "num_input_tokens_seen": 55626664, "step": 95805 }, { "epoch": 14.270181709859994, "grad_norm": 1.1959433555603027, "learning_rate": 1.1493871673659155e-05, "loss": 0.6539, "num_input_tokens_seen": 55629576, "step": 95810 }, { "epoch": 14.270926422400953, "grad_norm": 1.4979894161224365, "learning_rate": 1.1491137374602939e-05, "loss": 0.6564, "num_input_tokens_seen": 55632264, "step": 95815 }, { "epoch": 14.271671134941913, "grad_norm": 2.861849784851074, "learning_rate": 1.1488403303765239e-05, "loss": 0.6283, "num_input_tokens_seen": 55634952, "step": 95820 }, { "epoch": 14.272415847482872, "grad_norm": 1.4851536750793457, "learning_rate": 1.1485669461192233e-05, "loss": 0.3967, "num_input_tokens_seen": 55637736, "step": 95825 }, { "epoch": 14.27316056002383, "grad_norm": 1.2524197101593018, "learning_rate": 1.1482935846930104e-05, "loss": 0.5692, "num_input_tokens_seen": 55640488, "step": 95830 }, { "epoch": 14.27390527256479, "grad_norm": 1.884373664855957, "learning_rate": 1.148020246102503e-05, "loss": 0.8378, "num_input_tokens_seen": 55643176, "step": 95835 }, { "epoch": 14.27464998510575, "grad_norm": 0.7111784815788269, "learning_rate": 1.14774693035232e-05, "loss": 0.8527, "num_input_tokens_seen": 55645960, "step": 95840 }, { "epoch": 14.275394697646709, "grad_norm": 1.4330992698669434, "learning_rate": 1.1474736374470785e-05, "loss": 0.6238, "num_input_tokens_seen": 55648712, "step": 95845 }, { "epoch": 14.276139410187668, "grad_norm": 1.5300084352493286, "learning_rate": 1.1472003673913942e-05, "loss": 0.6309, "num_input_tokens_seen": 55651464, "step": 95850 }, { "epoch": 14.276884122728626, "grad_norm": 1.3440722227096558, "learning_rate": 1.1469271201898857e-05, "loss": 0.6588, "num_input_tokens_seen": 55654344, "step": 95855 }, { "epoch": 14.277628835269585, "grad_norm": 1.7362498044967651, "learning_rate": 1.1466538958471673e-05, "loss": 0.5392, "num_input_tokens_seen": 55657064, "step": 95860 }, { "epoch": 14.278373547810546, "grad_norm": 1.4577653408050537, "learning_rate": 1.1463806943678571e-05, "loss": 0.6281, "num_input_tokens_seen": 55660296, "step": 95865 }, { "epoch": 14.279118260351504, "grad_norm": 3.4987006187438965, "learning_rate": 1.1461075157565681e-05, "loss": 0.6213, "num_input_tokens_seen": 55663464, "step": 95870 }, { "epoch": 14.279862972892463, "grad_norm": 1.177880883216858, "learning_rate": 1.1458343600179175e-05, "loss": 0.6919, "num_input_tokens_seen": 55666600, "step": 95875 }, { "epoch": 14.280607685433422, "grad_norm": 1.5677330493927002, "learning_rate": 1.1455612271565192e-05, "loss": 0.6638, "num_input_tokens_seen": 55669640, "step": 95880 }, { "epoch": 14.281352397974382, "grad_norm": 2.5486948490142822, "learning_rate": 1.1452881171769872e-05, "loss": 0.4217, "num_input_tokens_seen": 55672328, "step": 95885 }, { "epoch": 14.282097110515341, "grad_norm": 1.5522172451019287, "learning_rate": 1.145015030083935e-05, "loss": 0.5718, "num_input_tokens_seen": 55675112, "step": 95890 }, { "epoch": 14.2828418230563, "grad_norm": 2.2664074897766113, "learning_rate": 1.1447419658819775e-05, "loss": 0.6441, "num_input_tokens_seen": 55677992, "step": 95895 }, { "epoch": 14.283586535597259, "grad_norm": 1.507144808769226, "learning_rate": 1.1444689245757268e-05, "loss": 0.5655, "num_input_tokens_seen": 55680904, "step": 95900 }, { "epoch": 14.28433124813822, "grad_norm": 1.493414044380188, "learning_rate": 1.1441959061697952e-05, "loss": 0.5445, "num_input_tokens_seen": 55683784, "step": 95905 }, { "epoch": 14.285075960679178, "grad_norm": 1.298919677734375, "learning_rate": 1.1439229106687969e-05, "loss": 0.7638, "num_input_tokens_seen": 55686792, "step": 95910 }, { "epoch": 14.285820673220137, "grad_norm": 2.783597946166992, "learning_rate": 1.1436499380773416e-05, "loss": 0.7157, "num_input_tokens_seen": 55689800, "step": 95915 }, { "epoch": 14.286565385761095, "grad_norm": 1.8056846857070923, "learning_rate": 1.1433769884000429e-05, "loss": 0.6443, "num_input_tokens_seen": 55692616, "step": 95920 }, { "epoch": 14.287310098302056, "grad_norm": 0.9971477389335632, "learning_rate": 1.1431040616415114e-05, "loss": 0.6731, "num_input_tokens_seen": 55695816, "step": 95925 }, { "epoch": 14.288054810843015, "grad_norm": 1.7484209537506104, "learning_rate": 1.1428311578063566e-05, "loss": 0.6291, "num_input_tokens_seen": 55699080, "step": 95930 }, { "epoch": 14.288799523383974, "grad_norm": 0.7399327158927917, "learning_rate": 1.142558276899191e-05, "loss": 0.5986, "num_input_tokens_seen": 55701992, "step": 95935 }, { "epoch": 14.289544235924932, "grad_norm": 1.2497081756591797, "learning_rate": 1.142285418924623e-05, "loss": 0.5989, "num_input_tokens_seen": 55705032, "step": 95940 }, { "epoch": 14.290288948465893, "grad_norm": 2.7469165325164795, "learning_rate": 1.1420125838872633e-05, "loss": 0.6096, "num_input_tokens_seen": 55708104, "step": 95945 }, { "epoch": 14.291033661006852, "grad_norm": 2.5892066955566406, "learning_rate": 1.1417397717917213e-05, "loss": 0.7421, "num_input_tokens_seen": 55711496, "step": 95950 }, { "epoch": 14.29177837354781, "grad_norm": 1.9196254014968872, "learning_rate": 1.1414669826426053e-05, "loss": 0.4979, "num_input_tokens_seen": 55714280, "step": 95955 }, { "epoch": 14.292523086088769, "grad_norm": 1.439220666885376, "learning_rate": 1.1411942164445228e-05, "loss": 0.5655, "num_input_tokens_seen": 55717160, "step": 95960 }, { "epoch": 14.29326779862973, "grad_norm": 4.122573375701904, "learning_rate": 1.140921473202084e-05, "loss": 0.6794, "num_input_tokens_seen": 55719816, "step": 95965 }, { "epoch": 14.294012511170688, "grad_norm": 3.234187364578247, "learning_rate": 1.1406487529198956e-05, "loss": 0.8338, "num_input_tokens_seen": 55722760, "step": 95970 }, { "epoch": 14.294757223711647, "grad_norm": 1.938821792602539, "learning_rate": 1.1403760556025638e-05, "loss": 0.5729, "num_input_tokens_seen": 55725608, "step": 95975 }, { "epoch": 14.295501936252606, "grad_norm": 1.033116340637207, "learning_rate": 1.140103381254698e-05, "loss": 0.5113, "num_input_tokens_seen": 55728584, "step": 95980 }, { "epoch": 14.296246648793566, "grad_norm": 0.8551843762397766, "learning_rate": 1.1398307298809022e-05, "loss": 0.4729, "num_input_tokens_seen": 55731432, "step": 95985 }, { "epoch": 14.296991361334525, "grad_norm": 1.7067335844039917, "learning_rate": 1.1395581014857848e-05, "loss": 0.6047, "num_input_tokens_seen": 55734440, "step": 95990 }, { "epoch": 14.297736073875484, "grad_norm": 1.5594613552093506, "learning_rate": 1.1392854960739497e-05, "loss": 0.6108, "num_input_tokens_seen": 55737128, "step": 95995 }, { "epoch": 14.298480786416443, "grad_norm": 2.1215591430664062, "learning_rate": 1.1390129136500041e-05, "loss": 0.6758, "num_input_tokens_seen": 55739752, "step": 96000 }, { "epoch": 14.299225498957403, "grad_norm": 1.613815426826477, "learning_rate": 1.138740354218552e-05, "loss": 0.592, "num_input_tokens_seen": 55742792, "step": 96005 }, { "epoch": 14.299970211498362, "grad_norm": 2.4003937244415283, "learning_rate": 1.1384678177841973e-05, "loss": 0.6467, "num_input_tokens_seen": 55745864, "step": 96010 }, { "epoch": 14.30071492403932, "grad_norm": 1.1011011600494385, "learning_rate": 1.1381953043515459e-05, "loss": 0.5376, "num_input_tokens_seen": 55748648, "step": 96015 }, { "epoch": 14.30145963658028, "grad_norm": 1.168919563293457, "learning_rate": 1.1379228139252007e-05, "loss": 0.73, "num_input_tokens_seen": 55751656, "step": 96020 }, { "epoch": 14.30220434912124, "grad_norm": 1.1381126642227173, "learning_rate": 1.1376503465097651e-05, "loss": 0.5767, "num_input_tokens_seen": 55754664, "step": 96025 }, { "epoch": 14.302949061662199, "grad_norm": 1.7877964973449707, "learning_rate": 1.1373779021098415e-05, "loss": 0.6178, "num_input_tokens_seen": 55757640, "step": 96030 }, { "epoch": 14.303693774203158, "grad_norm": 2.435812473297119, "learning_rate": 1.1371054807300344e-05, "loss": 0.6152, "num_input_tokens_seen": 55760712, "step": 96035 }, { "epoch": 14.304438486744116, "grad_norm": 1.512721300125122, "learning_rate": 1.1368330823749441e-05, "loss": 0.4309, "num_input_tokens_seen": 55763848, "step": 96040 }, { "epoch": 14.305183199285075, "grad_norm": 1.9558684825897217, "learning_rate": 1.1365607070491741e-05, "loss": 0.8247, "num_input_tokens_seen": 55766856, "step": 96045 }, { "epoch": 14.305927911826036, "grad_norm": 3.1816565990448, "learning_rate": 1.1362883547573252e-05, "loss": 0.8945, "num_input_tokens_seen": 55769544, "step": 96050 }, { "epoch": 14.306672624366994, "grad_norm": 4.0875678062438965, "learning_rate": 1.1360160255039976e-05, "loss": 0.7359, "num_input_tokens_seen": 55772392, "step": 96055 }, { "epoch": 14.307417336907953, "grad_norm": 1.3436787128448486, "learning_rate": 1.1357437192937943e-05, "loss": 0.582, "num_input_tokens_seen": 55775144, "step": 96060 }, { "epoch": 14.308162049448912, "grad_norm": 4.145236015319824, "learning_rate": 1.1354714361313128e-05, "loss": 0.5563, "num_input_tokens_seen": 55777992, "step": 96065 }, { "epoch": 14.308906761989872, "grad_norm": 2.7259814739227295, "learning_rate": 1.1351991760211558e-05, "loss": 0.5563, "num_input_tokens_seen": 55780552, "step": 96070 }, { "epoch": 14.309651474530831, "grad_norm": 1.8931090831756592, "learning_rate": 1.1349269389679203e-05, "loss": 0.6428, "num_input_tokens_seen": 55783656, "step": 96075 }, { "epoch": 14.31039618707179, "grad_norm": 2.5410826206207275, "learning_rate": 1.1346547249762082e-05, "loss": 0.5747, "num_input_tokens_seen": 55786344, "step": 96080 }, { "epoch": 14.311140899612749, "grad_norm": 2.678908109664917, "learning_rate": 1.1343825340506167e-05, "loss": 0.4189, "num_input_tokens_seen": 55789288, "step": 96085 }, { "epoch": 14.31188561215371, "grad_norm": 2.252666711807251, "learning_rate": 1.1341103661957441e-05, "loss": 0.576, "num_input_tokens_seen": 55792168, "step": 96090 }, { "epoch": 14.312630324694668, "grad_norm": 1.6911522150039673, "learning_rate": 1.1338382214161888e-05, "loss": 0.4587, "num_input_tokens_seen": 55795304, "step": 96095 }, { "epoch": 14.313375037235627, "grad_norm": 1.5673631429672241, "learning_rate": 1.1335660997165473e-05, "loss": 0.666, "num_input_tokens_seen": 55797960, "step": 96100 }, { "epoch": 14.314119749776586, "grad_norm": 1.4153157472610474, "learning_rate": 1.133294001101419e-05, "loss": 0.4853, "num_input_tokens_seen": 55801032, "step": 96105 }, { "epoch": 14.314864462317546, "grad_norm": 1.4172985553741455, "learning_rate": 1.1330219255753983e-05, "loss": 0.6522, "num_input_tokens_seen": 55803816, "step": 96110 }, { "epoch": 14.315609174858505, "grad_norm": 2.354088306427002, "learning_rate": 1.1327498731430835e-05, "loss": 0.4582, "num_input_tokens_seen": 55806952, "step": 96115 }, { "epoch": 14.316353887399464, "grad_norm": 1.7804017066955566, "learning_rate": 1.1324778438090694e-05, "loss": 0.6331, "num_input_tokens_seen": 55809800, "step": 96120 }, { "epoch": 14.317098599940422, "grad_norm": 0.8731206059455872, "learning_rate": 1.132205837577953e-05, "loss": 0.3593, "num_input_tokens_seen": 55812552, "step": 96125 }, { "epoch": 14.317843312481383, "grad_norm": 3.064394235610962, "learning_rate": 1.131933854454329e-05, "loss": 0.5149, "num_input_tokens_seen": 55815464, "step": 96130 }, { "epoch": 14.318588025022342, "grad_norm": 1.0576165914535522, "learning_rate": 1.131661894442791e-05, "loss": 0.4403, "num_input_tokens_seen": 55818312, "step": 96135 }, { "epoch": 14.3193327375633, "grad_norm": 1.9583714008331299, "learning_rate": 1.1313899575479355e-05, "loss": 0.7047, "num_input_tokens_seen": 55821064, "step": 96140 }, { "epoch": 14.32007745010426, "grad_norm": 1.6079000234603882, "learning_rate": 1.1311180437743549e-05, "loss": 0.6742, "num_input_tokens_seen": 55824104, "step": 96145 }, { "epoch": 14.32082216264522, "grad_norm": 1.2801010608673096, "learning_rate": 1.1308461531266442e-05, "loss": 0.6991, "num_input_tokens_seen": 55826728, "step": 96150 }, { "epoch": 14.321566875186178, "grad_norm": 3.1179187297821045, "learning_rate": 1.1305742856093964e-05, "loss": 0.6446, "num_input_tokens_seen": 55829544, "step": 96155 }, { "epoch": 14.322311587727137, "grad_norm": 1.6240583658218384, "learning_rate": 1.1303024412272046e-05, "loss": 0.4684, "num_input_tokens_seen": 55832488, "step": 96160 }, { "epoch": 14.323056300268096, "grad_norm": 1.4880472421646118, "learning_rate": 1.1300306199846605e-05, "loss": 0.6551, "num_input_tokens_seen": 55835624, "step": 96165 }, { "epoch": 14.323801012809056, "grad_norm": 1.2316869497299194, "learning_rate": 1.1297588218863561e-05, "loss": 0.6008, "num_input_tokens_seen": 55838408, "step": 96170 }, { "epoch": 14.324545725350015, "grad_norm": 1.293782114982605, "learning_rate": 1.1294870469368846e-05, "loss": 0.589, "num_input_tokens_seen": 55841448, "step": 96175 }, { "epoch": 14.325290437890974, "grad_norm": 1.1594511270523071, "learning_rate": 1.1292152951408356e-05, "loss": 0.5671, "num_input_tokens_seen": 55844904, "step": 96180 }, { "epoch": 14.326035150431933, "grad_norm": 1.1597063541412354, "learning_rate": 1.1289435665028016e-05, "loss": 0.5251, "num_input_tokens_seen": 55848072, "step": 96185 }, { "epoch": 14.326779862972893, "grad_norm": 1.4322329759597778, "learning_rate": 1.1286718610273719e-05, "loss": 0.6608, "num_input_tokens_seen": 55850856, "step": 96190 }, { "epoch": 14.327524575513852, "grad_norm": 1.1739833354949951, "learning_rate": 1.1284001787191381e-05, "loss": 0.6391, "num_input_tokens_seen": 55854120, "step": 96195 }, { "epoch": 14.32826928805481, "grad_norm": 3.3038675785064697, "learning_rate": 1.1281285195826884e-05, "loss": 0.6404, "num_input_tokens_seen": 55857064, "step": 96200 }, { "epoch": 14.32901400059577, "grad_norm": 0.8883534073829651, "learning_rate": 1.1278568836226142e-05, "loss": 0.7097, "num_input_tokens_seen": 55860296, "step": 96205 }, { "epoch": 14.32975871313673, "grad_norm": 1.5686004161834717, "learning_rate": 1.1275852708435033e-05, "loss": 0.6634, "num_input_tokens_seen": 55863016, "step": 96210 }, { "epoch": 14.330503425677689, "grad_norm": 1.7389594316482544, "learning_rate": 1.127313681249944e-05, "loss": 0.5902, "num_input_tokens_seen": 55866088, "step": 96215 }, { "epoch": 14.331248138218648, "grad_norm": 1.4221806526184082, "learning_rate": 1.1270421148465245e-05, "loss": 0.7666, "num_input_tokens_seen": 55868904, "step": 96220 }, { "epoch": 14.331992850759606, "grad_norm": 1.6502845287322998, "learning_rate": 1.1267705716378338e-05, "loss": 0.6782, "num_input_tokens_seen": 55872008, "step": 96225 }, { "epoch": 14.332737563300565, "grad_norm": 1.1793159246444702, "learning_rate": 1.1264990516284585e-05, "loss": 0.5634, "num_input_tokens_seen": 55874888, "step": 96230 }, { "epoch": 14.333482275841526, "grad_norm": 1.5866678953170776, "learning_rate": 1.126227554822985e-05, "loss": 0.4792, "num_input_tokens_seen": 55877736, "step": 96235 }, { "epoch": 14.334226988382484, "grad_norm": 1.2740551233291626, "learning_rate": 1.1259560812260014e-05, "loss": 0.5164, "num_input_tokens_seen": 55880648, "step": 96240 }, { "epoch": 14.334971700923443, "grad_norm": 0.859180748462677, "learning_rate": 1.1256846308420935e-05, "loss": 0.5085, "num_input_tokens_seen": 55883400, "step": 96245 }, { "epoch": 14.335716413464402, "grad_norm": 2.3440029621124268, "learning_rate": 1.125413203675846e-05, "loss": 0.6232, "num_input_tokens_seen": 55886504, "step": 96250 }, { "epoch": 14.336461126005362, "grad_norm": 1.6570515632629395, "learning_rate": 1.1251417997318464e-05, "loss": 0.6688, "num_input_tokens_seen": 55889224, "step": 96255 }, { "epoch": 14.337205838546321, "grad_norm": 0.9020674228668213, "learning_rate": 1.1248704190146778e-05, "loss": 0.6092, "num_input_tokens_seen": 55892072, "step": 96260 }, { "epoch": 14.33795055108728, "grad_norm": 0.9969997406005859, "learning_rate": 1.1245990615289264e-05, "loss": 0.6332, "num_input_tokens_seen": 55895016, "step": 96265 }, { "epoch": 14.338695263628239, "grad_norm": 3.72969388961792, "learning_rate": 1.1243277272791755e-05, "loss": 0.695, "num_input_tokens_seen": 55898024, "step": 96270 }, { "epoch": 14.3394399761692, "grad_norm": 1.8223620653152466, "learning_rate": 1.1240564162700101e-05, "loss": 0.6886, "num_input_tokens_seen": 55900968, "step": 96275 }, { "epoch": 14.340184688710158, "grad_norm": 0.8638744354248047, "learning_rate": 1.1237851285060133e-05, "loss": 0.5249, "num_input_tokens_seen": 55903912, "step": 96280 }, { "epoch": 14.340929401251117, "grad_norm": 2.1763854026794434, "learning_rate": 1.123513863991768e-05, "loss": 0.621, "num_input_tokens_seen": 55906952, "step": 96285 }, { "epoch": 14.341674113792076, "grad_norm": 1.531822919845581, "learning_rate": 1.1232426227318568e-05, "loss": 0.6719, "num_input_tokens_seen": 55909768, "step": 96290 }, { "epoch": 14.342418826333036, "grad_norm": 2.0580039024353027, "learning_rate": 1.1229714047308615e-05, "loss": 0.4787, "num_input_tokens_seen": 55912744, "step": 96295 }, { "epoch": 14.343163538873995, "grad_norm": 0.8039146065711975, "learning_rate": 1.1227002099933657e-05, "loss": 0.5185, "num_input_tokens_seen": 55915656, "step": 96300 }, { "epoch": 14.343908251414954, "grad_norm": 2.246110439300537, "learning_rate": 1.1224290385239488e-05, "loss": 0.5999, "num_input_tokens_seen": 55918440, "step": 96305 }, { "epoch": 14.344652963955912, "grad_norm": 1.1590182781219482, "learning_rate": 1.1221578903271943e-05, "loss": 0.561, "num_input_tokens_seen": 55921192, "step": 96310 }, { "epoch": 14.345397676496873, "grad_norm": 2.3834757804870605, "learning_rate": 1.1218867654076812e-05, "loss": 0.6546, "num_input_tokens_seen": 55923848, "step": 96315 }, { "epoch": 14.346142389037832, "grad_norm": 2.7412869930267334, "learning_rate": 1.1216156637699909e-05, "loss": 0.7582, "num_input_tokens_seen": 55926728, "step": 96320 }, { "epoch": 14.34688710157879, "grad_norm": 1.6948819160461426, "learning_rate": 1.1213445854187035e-05, "loss": 0.5737, "num_input_tokens_seen": 55929832, "step": 96325 }, { "epoch": 14.34763181411975, "grad_norm": 1.280031442642212, "learning_rate": 1.1210735303583972e-05, "loss": 0.6077, "num_input_tokens_seen": 55933128, "step": 96330 }, { "epoch": 14.34837652666071, "grad_norm": 2.0451102256774902, "learning_rate": 1.1208024985936527e-05, "loss": 0.737, "num_input_tokens_seen": 55935816, "step": 96335 }, { "epoch": 14.349121239201668, "grad_norm": 2.620694875717163, "learning_rate": 1.1205314901290475e-05, "loss": 0.7508, "num_input_tokens_seen": 55938696, "step": 96340 }, { "epoch": 14.349865951742627, "grad_norm": 1.3126519918441772, "learning_rate": 1.120260504969162e-05, "loss": 0.6966, "num_input_tokens_seen": 55941640, "step": 96345 }, { "epoch": 14.350610664283586, "grad_norm": 1.7662837505340576, "learning_rate": 1.1199895431185726e-05, "loss": 0.5339, "num_input_tokens_seen": 55944808, "step": 96350 }, { "epoch": 14.351355376824547, "grad_norm": 3.3815762996673584, "learning_rate": 1.1197186045818572e-05, "loss": 0.6291, "num_input_tokens_seen": 55947560, "step": 96355 }, { "epoch": 14.352100089365505, "grad_norm": 2.393831729888916, "learning_rate": 1.1194476893635924e-05, "loss": 0.5245, "num_input_tokens_seen": 55950536, "step": 96360 }, { "epoch": 14.352844801906464, "grad_norm": 0.9846044182777405, "learning_rate": 1.1191767974683567e-05, "loss": 0.5921, "num_input_tokens_seen": 55953576, "step": 96365 }, { "epoch": 14.353589514447423, "grad_norm": 2.351917266845703, "learning_rate": 1.1189059289007256e-05, "loss": 0.6615, "num_input_tokens_seen": 55956456, "step": 96370 }, { "epoch": 14.354334226988382, "grad_norm": 1.24082350730896, "learning_rate": 1.118635083665274e-05, "loss": 0.6162, "num_input_tokens_seen": 55958984, "step": 96375 }, { "epoch": 14.355078939529342, "grad_norm": 0.6169220805168152, "learning_rate": 1.1183642617665799e-05, "loss": 0.5034, "num_input_tokens_seen": 55962248, "step": 96380 }, { "epoch": 14.3558236520703, "grad_norm": 0.9210550785064697, "learning_rate": 1.1180934632092163e-05, "loss": 0.5545, "num_input_tokens_seen": 55965128, "step": 96385 }, { "epoch": 14.35656836461126, "grad_norm": 1.6242977380752563, "learning_rate": 1.11782268799776e-05, "loss": 0.5983, "num_input_tokens_seen": 55968136, "step": 96390 }, { "epoch": 14.357313077152218, "grad_norm": 1.8490434885025024, "learning_rate": 1.1175519361367837e-05, "loss": 0.6668, "num_input_tokens_seen": 55971176, "step": 96395 }, { "epoch": 14.358057789693179, "grad_norm": 1.648075819015503, "learning_rate": 1.1172812076308634e-05, "loss": 0.6177, "num_input_tokens_seen": 55973992, "step": 96400 }, { "epoch": 14.358802502234138, "grad_norm": 1.521339774131775, "learning_rate": 1.1170105024845718e-05, "loss": 0.523, "num_input_tokens_seen": 55976968, "step": 96405 }, { "epoch": 14.359547214775096, "grad_norm": 2.2695090770721436, "learning_rate": 1.1167398207024812e-05, "loss": 0.6086, "num_input_tokens_seen": 55979912, "step": 96410 }, { "epoch": 14.360291927316055, "grad_norm": 1.4842652082443237, "learning_rate": 1.1164691622891662e-05, "loss": 0.6417, "num_input_tokens_seen": 55982664, "step": 96415 }, { "epoch": 14.361036639857016, "grad_norm": 1.626794695854187, "learning_rate": 1.1161985272491986e-05, "loss": 0.6464, "num_input_tokens_seen": 55985704, "step": 96420 }, { "epoch": 14.361781352397974, "grad_norm": 1.0406697988510132, "learning_rate": 1.1159279155871507e-05, "loss": 0.7207, "num_input_tokens_seen": 55988648, "step": 96425 }, { "epoch": 14.362526064938933, "grad_norm": 0.8079155683517456, "learning_rate": 1.115657327307593e-05, "loss": 0.4227, "num_input_tokens_seen": 55991240, "step": 96430 }, { "epoch": 14.363270777479892, "grad_norm": 1.7924339771270752, "learning_rate": 1.1153867624150986e-05, "loss": 0.538, "num_input_tokens_seen": 55993992, "step": 96435 }, { "epoch": 14.364015490020853, "grad_norm": 1.4907339811325073, "learning_rate": 1.1151162209142362e-05, "loss": 0.6935, "num_input_tokens_seen": 55996904, "step": 96440 }, { "epoch": 14.364760202561811, "grad_norm": 1.7193129062652588, "learning_rate": 1.1148457028095794e-05, "loss": 0.5649, "num_input_tokens_seen": 56000104, "step": 96445 }, { "epoch": 14.36550491510277, "grad_norm": 2.492413282394409, "learning_rate": 1.1145752081056961e-05, "loss": 0.6951, "num_input_tokens_seen": 56003272, "step": 96450 }, { "epoch": 14.366249627643729, "grad_norm": 0.7469350695610046, "learning_rate": 1.114304736807156e-05, "loss": 0.5302, "num_input_tokens_seen": 56005928, "step": 96455 }, { "epoch": 14.36699434018469, "grad_norm": 0.7373510003089905, "learning_rate": 1.1140342889185299e-05, "loss": 0.5595, "num_input_tokens_seen": 56008648, "step": 96460 }, { "epoch": 14.367739052725648, "grad_norm": 1.8008521795272827, "learning_rate": 1.1137638644443846e-05, "loss": 0.6145, "num_input_tokens_seen": 56011816, "step": 96465 }, { "epoch": 14.368483765266607, "grad_norm": 1.2911553382873535, "learning_rate": 1.113493463389291e-05, "loss": 0.6873, "num_input_tokens_seen": 56014856, "step": 96470 }, { "epoch": 14.369228477807566, "grad_norm": 1.5957130193710327, "learning_rate": 1.1132230857578155e-05, "loss": 0.5273, "num_input_tokens_seen": 56017448, "step": 96475 }, { "epoch": 14.369973190348526, "grad_norm": 1.8351801633834839, "learning_rate": 1.1129527315545272e-05, "loss": 0.639, "num_input_tokens_seen": 56020680, "step": 96480 }, { "epoch": 14.370717902889485, "grad_norm": 1.1128125190734863, "learning_rate": 1.1126824007839927e-05, "loss": 0.536, "num_input_tokens_seen": 56023528, "step": 96485 }, { "epoch": 14.371462615430444, "grad_norm": 1.2091253995895386, "learning_rate": 1.1124120934507792e-05, "loss": 0.6524, "num_input_tokens_seen": 56026888, "step": 96490 }, { "epoch": 14.372207327971402, "grad_norm": 2.0820553302764893, "learning_rate": 1.112141809559453e-05, "loss": 0.5772, "num_input_tokens_seen": 56029672, "step": 96495 }, { "epoch": 14.372952040512363, "grad_norm": 1.786631464958191, "learning_rate": 1.1118715491145795e-05, "loss": 0.5348, "num_input_tokens_seen": 56032744, "step": 96500 }, { "epoch": 14.373696753053322, "grad_norm": 1.1052056550979614, "learning_rate": 1.1116013121207261e-05, "loss": 0.5621, "num_input_tokens_seen": 56035432, "step": 96505 }, { "epoch": 14.37444146559428, "grad_norm": 1.509537935256958, "learning_rate": 1.1113310985824566e-05, "loss": 0.5509, "num_input_tokens_seen": 56038056, "step": 96510 }, { "epoch": 14.37518617813524, "grad_norm": 1.6730268001556396, "learning_rate": 1.1110609085043378e-05, "loss": 0.6604, "num_input_tokens_seen": 56041128, "step": 96515 }, { "epoch": 14.3759308906762, "grad_norm": 1.9906766414642334, "learning_rate": 1.1107907418909324e-05, "loss": 0.4948, "num_input_tokens_seen": 56044040, "step": 96520 }, { "epoch": 14.376675603217159, "grad_norm": 1.1339963674545288, "learning_rate": 1.1105205987468064e-05, "loss": 0.6276, "num_input_tokens_seen": 56046952, "step": 96525 }, { "epoch": 14.377420315758117, "grad_norm": 0.805366575717926, "learning_rate": 1.1102504790765225e-05, "loss": 0.4866, "num_input_tokens_seen": 56049768, "step": 96530 }, { "epoch": 14.378165028299076, "grad_norm": 1.6368416547775269, "learning_rate": 1.1099803828846437e-05, "loss": 0.6011, "num_input_tokens_seen": 56052712, "step": 96535 }, { "epoch": 14.378909740840037, "grad_norm": 1.3910025358200073, "learning_rate": 1.1097103101757342e-05, "loss": 0.5579, "num_input_tokens_seen": 56055752, "step": 96540 }, { "epoch": 14.379654453380995, "grad_norm": 1.737115740776062, "learning_rate": 1.1094402609543561e-05, "loss": 0.4171, "num_input_tokens_seen": 56058600, "step": 96545 }, { "epoch": 14.380399165921954, "grad_norm": 1.211268663406372, "learning_rate": 1.1091702352250704e-05, "loss": 0.3471, "num_input_tokens_seen": 56061512, "step": 96550 }, { "epoch": 14.381143878462913, "grad_norm": 1.9976909160614014, "learning_rate": 1.108900232992441e-05, "loss": 0.553, "num_input_tokens_seen": 56064360, "step": 96555 }, { "epoch": 14.381888591003872, "grad_norm": 0.6852928996086121, "learning_rate": 1.1086302542610285e-05, "loss": 0.5035, "num_input_tokens_seen": 56067528, "step": 96560 }, { "epoch": 14.382633303544832, "grad_norm": 1.1401432752609253, "learning_rate": 1.1083602990353928e-05, "loss": 0.5595, "num_input_tokens_seen": 56070536, "step": 96565 }, { "epoch": 14.383378016085791, "grad_norm": 0.9184494614601135, "learning_rate": 1.1080903673200962e-05, "loss": 0.7378, "num_input_tokens_seen": 56073448, "step": 96570 }, { "epoch": 14.38412272862675, "grad_norm": 0.8073840737342834, "learning_rate": 1.107820459119698e-05, "loss": 0.5311, "num_input_tokens_seen": 56075976, "step": 96575 }, { "epoch": 14.384867441167708, "grad_norm": 1.4047635793685913, "learning_rate": 1.1075505744387577e-05, "loss": 0.5015, "num_input_tokens_seen": 56078568, "step": 96580 }, { "epoch": 14.385612153708669, "grad_norm": 1.7336007356643677, "learning_rate": 1.1072807132818358e-05, "loss": 0.6616, "num_input_tokens_seen": 56081832, "step": 96585 }, { "epoch": 14.386356866249628, "grad_norm": 0.9993895292282104, "learning_rate": 1.10701087565349e-05, "loss": 0.668, "num_input_tokens_seen": 56084488, "step": 96590 }, { "epoch": 14.387101578790586, "grad_norm": 1.4330676794052124, "learning_rate": 1.1067410615582808e-05, "loss": 0.7029, "num_input_tokens_seen": 56087560, "step": 96595 }, { "epoch": 14.387846291331545, "grad_norm": 2.099764823913574, "learning_rate": 1.106471271000764e-05, "loss": 0.5569, "num_input_tokens_seen": 56090472, "step": 96600 }, { "epoch": 14.388591003872506, "grad_norm": 1.6820446252822876, "learning_rate": 1.1062015039854997e-05, "loss": 0.8231, "num_input_tokens_seen": 56093416, "step": 96605 }, { "epoch": 14.389335716413465, "grad_norm": 1.7569341659545898, "learning_rate": 1.1059317605170447e-05, "loss": 0.64, "num_input_tokens_seen": 56096296, "step": 96610 }, { "epoch": 14.390080428954423, "grad_norm": 1.6959668397903442, "learning_rate": 1.1056620405999558e-05, "loss": 0.5754, "num_input_tokens_seen": 56099176, "step": 96615 }, { "epoch": 14.390825141495382, "grad_norm": 2.2743923664093018, "learning_rate": 1.1053923442387892e-05, "loss": 0.591, "num_input_tokens_seen": 56101768, "step": 96620 }, { "epoch": 14.391569854036343, "grad_norm": 2.0776171684265137, "learning_rate": 1.1051226714381008e-05, "loss": 0.5383, "num_input_tokens_seen": 56104648, "step": 96625 }, { "epoch": 14.392314566577301, "grad_norm": 0.8504213094711304, "learning_rate": 1.1048530222024481e-05, "loss": 0.6293, "num_input_tokens_seen": 56107656, "step": 96630 }, { "epoch": 14.39305927911826, "grad_norm": 1.4399654865264893, "learning_rate": 1.1045833965363847e-05, "loss": 0.6422, "num_input_tokens_seen": 56110376, "step": 96635 }, { "epoch": 14.393803991659219, "grad_norm": 2.4834704399108887, "learning_rate": 1.1043137944444673e-05, "loss": 0.5526, "num_input_tokens_seen": 56113352, "step": 96640 }, { "epoch": 14.39454870420018, "grad_norm": 1.4292980432510376, "learning_rate": 1.1040442159312491e-05, "loss": 0.6249, "num_input_tokens_seen": 56117672, "step": 96645 }, { "epoch": 14.395293416741138, "grad_norm": 0.9861571192741394, "learning_rate": 1.1037746610012861e-05, "loss": 0.3186, "num_input_tokens_seen": 56120776, "step": 96650 }, { "epoch": 14.396038129282097, "grad_norm": 1.6940573453903198, "learning_rate": 1.1035051296591309e-05, "loss": 0.7047, "num_input_tokens_seen": 56123464, "step": 96655 }, { "epoch": 14.396782841823056, "grad_norm": 1.523566722869873, "learning_rate": 1.1032356219093365e-05, "loss": 0.7059, "num_input_tokens_seen": 56126376, "step": 96660 }, { "epoch": 14.397527554364016, "grad_norm": 1.503231167793274, "learning_rate": 1.1029661377564576e-05, "loss": 0.4895, "num_input_tokens_seen": 56129320, "step": 96665 }, { "epoch": 14.398272266904975, "grad_norm": 3.005352258682251, "learning_rate": 1.1026966772050448e-05, "loss": 0.6517, "num_input_tokens_seen": 56132072, "step": 96670 }, { "epoch": 14.399016979445934, "grad_norm": 1.630634069442749, "learning_rate": 1.1024272402596526e-05, "loss": 0.674, "num_input_tokens_seen": 56134952, "step": 96675 }, { "epoch": 14.399761691986892, "grad_norm": 2.585268020629883, "learning_rate": 1.1021578269248314e-05, "loss": 0.7767, "num_input_tokens_seen": 56138152, "step": 96680 }, { "epoch": 14.400506404527853, "grad_norm": 4.202330112457275, "learning_rate": 1.1018884372051333e-05, "loss": 0.956, "num_input_tokens_seen": 56141352, "step": 96685 }, { "epoch": 14.401251117068812, "grad_norm": 1.3917510509490967, "learning_rate": 1.1016190711051092e-05, "loss": 0.4904, "num_input_tokens_seen": 56143976, "step": 96690 }, { "epoch": 14.40199582960977, "grad_norm": 2.260026216506958, "learning_rate": 1.1013497286293085e-05, "loss": 0.7662, "num_input_tokens_seen": 56146792, "step": 96695 }, { "epoch": 14.40274054215073, "grad_norm": 1.980429768562317, "learning_rate": 1.1010804097822836e-05, "loss": 0.5141, "num_input_tokens_seen": 56149928, "step": 96700 }, { "epoch": 14.40348525469169, "grad_norm": 3.037478446960449, "learning_rate": 1.1008111145685824e-05, "loss": 0.6465, "num_input_tokens_seen": 56152904, "step": 96705 }, { "epoch": 14.404229967232649, "grad_norm": 1.4111359119415283, "learning_rate": 1.1005418429927563e-05, "loss": 0.6627, "num_input_tokens_seen": 56155912, "step": 96710 }, { "epoch": 14.404974679773607, "grad_norm": 2.6491777896881104, "learning_rate": 1.1002725950593525e-05, "loss": 0.6796, "num_input_tokens_seen": 56158856, "step": 96715 }, { "epoch": 14.405719392314566, "grad_norm": 1.9931416511535645, "learning_rate": 1.1000033707729216e-05, "loss": 0.5954, "num_input_tokens_seen": 56161672, "step": 96720 }, { "epoch": 14.406464104855527, "grad_norm": 2.898268461227417, "learning_rate": 1.0997341701380099e-05, "loss": 0.8348, "num_input_tokens_seen": 56164648, "step": 96725 }, { "epoch": 14.407208817396485, "grad_norm": 1.8144344091415405, "learning_rate": 1.0994649931591669e-05, "loss": 0.6435, "num_input_tokens_seen": 56167880, "step": 96730 }, { "epoch": 14.407953529937444, "grad_norm": 0.857525646686554, "learning_rate": 1.0991958398409396e-05, "loss": 0.6935, "num_input_tokens_seen": 56170728, "step": 96735 }, { "epoch": 14.408698242478403, "grad_norm": 1.0617237091064453, "learning_rate": 1.0989267101878742e-05, "loss": 0.5019, "num_input_tokens_seen": 56173736, "step": 96740 }, { "epoch": 14.409442955019362, "grad_norm": 1.7479190826416016, "learning_rate": 1.0986576042045186e-05, "loss": 0.6011, "num_input_tokens_seen": 56176776, "step": 96745 }, { "epoch": 14.410187667560322, "grad_norm": 1.1374064683914185, "learning_rate": 1.0983885218954187e-05, "loss": 0.7067, "num_input_tokens_seen": 56179624, "step": 96750 }, { "epoch": 14.410932380101281, "grad_norm": 1.860649585723877, "learning_rate": 1.0981194632651201e-05, "loss": 0.6357, "num_input_tokens_seen": 56182280, "step": 96755 }, { "epoch": 14.41167709264224, "grad_norm": 1.5987321138381958, "learning_rate": 1.0978504283181674e-05, "loss": 0.4934, "num_input_tokens_seen": 56184936, "step": 96760 }, { "epoch": 14.412421805183198, "grad_norm": 0.872173011302948, "learning_rate": 1.0975814170591076e-05, "loss": 0.5596, "num_input_tokens_seen": 56187784, "step": 96765 }, { "epoch": 14.413166517724159, "grad_norm": 1.5075414180755615, "learning_rate": 1.0973124294924843e-05, "loss": 0.4946, "num_input_tokens_seen": 56190472, "step": 96770 }, { "epoch": 14.413911230265118, "grad_norm": 1.260451078414917, "learning_rate": 1.0970434656228412e-05, "loss": 0.6816, "num_input_tokens_seen": 56193576, "step": 96775 }, { "epoch": 14.414655942806077, "grad_norm": 1.3601044416427612, "learning_rate": 1.0967745254547238e-05, "loss": 0.704, "num_input_tokens_seen": 56196584, "step": 96780 }, { "epoch": 14.415400655347035, "grad_norm": 2.007591724395752, "learning_rate": 1.0965056089926734e-05, "loss": 0.532, "num_input_tokens_seen": 56199880, "step": 96785 }, { "epoch": 14.416145367887996, "grad_norm": 1.5348671674728394, "learning_rate": 1.0962367162412354e-05, "loss": 0.4034, "num_input_tokens_seen": 56203144, "step": 96790 }, { "epoch": 14.416890080428955, "grad_norm": 1.5977860689163208, "learning_rate": 1.0959678472049502e-05, "loss": 0.6169, "num_input_tokens_seen": 56206088, "step": 96795 }, { "epoch": 14.417634792969913, "grad_norm": 1.9356458187103271, "learning_rate": 1.0956990018883625e-05, "loss": 0.5325, "num_input_tokens_seen": 56208968, "step": 96800 }, { "epoch": 14.418379505510872, "grad_norm": 2.321417808532715, "learning_rate": 1.0954301802960118e-05, "loss": 0.6146, "num_input_tokens_seen": 56211624, "step": 96805 }, { "epoch": 14.419124218051833, "grad_norm": 1.4753732681274414, "learning_rate": 1.0951613824324417e-05, "loss": 0.5868, "num_input_tokens_seen": 56214344, "step": 96810 }, { "epoch": 14.419868930592791, "grad_norm": 1.819395661354065, "learning_rate": 1.0948926083021921e-05, "loss": 0.6485, "num_input_tokens_seen": 56217064, "step": 96815 }, { "epoch": 14.42061364313375, "grad_norm": 1.6153134107589722, "learning_rate": 1.0946238579098036e-05, "loss": 0.6684, "num_input_tokens_seen": 56220168, "step": 96820 }, { "epoch": 14.421358355674709, "grad_norm": 1.9202362298965454, "learning_rate": 1.0943551312598172e-05, "loss": 0.5758, "num_input_tokens_seen": 56222792, "step": 96825 }, { "epoch": 14.42210306821567, "grad_norm": 2.948071002960205, "learning_rate": 1.0940864283567708e-05, "loss": 0.3697, "num_input_tokens_seen": 56225480, "step": 96830 }, { "epoch": 14.422847780756628, "grad_norm": 1.6144616603851318, "learning_rate": 1.0938177492052064e-05, "loss": 0.4867, "num_input_tokens_seen": 56228296, "step": 96835 }, { "epoch": 14.423592493297587, "grad_norm": 1.7030932903289795, "learning_rate": 1.093549093809661e-05, "loss": 0.6056, "num_input_tokens_seen": 56231336, "step": 96840 }, { "epoch": 14.424337205838546, "grad_norm": 1.720703363418579, "learning_rate": 1.0932804621746751e-05, "loss": 0.7604, "num_input_tokens_seen": 56234248, "step": 96845 }, { "epoch": 14.425081918379506, "grad_norm": 1.2741148471832275, "learning_rate": 1.0930118543047862e-05, "loss": 0.544, "num_input_tokens_seen": 56237224, "step": 96850 }, { "epoch": 14.425826630920465, "grad_norm": 1.5894100666046143, "learning_rate": 1.0927432702045309e-05, "loss": 0.6933, "num_input_tokens_seen": 56240552, "step": 96855 }, { "epoch": 14.426571343461424, "grad_norm": 1.8893122673034668, "learning_rate": 1.0924747098784488e-05, "loss": 0.6582, "num_input_tokens_seen": 56243528, "step": 96860 }, { "epoch": 14.427316056002383, "grad_norm": 1.8749479055404663, "learning_rate": 1.0922061733310751e-05, "loss": 0.5468, "num_input_tokens_seen": 56246184, "step": 96865 }, { "epoch": 14.428060768543343, "grad_norm": 0.9358572959899902, "learning_rate": 1.0919376605669481e-05, "loss": 0.5565, "num_input_tokens_seen": 56248968, "step": 96870 }, { "epoch": 14.428805481084302, "grad_norm": 2.4467616081237793, "learning_rate": 1.0916691715906034e-05, "loss": 0.6483, "num_input_tokens_seen": 56252296, "step": 96875 }, { "epoch": 14.42955019362526, "grad_norm": 1.1846258640289307, "learning_rate": 1.0914007064065754e-05, "loss": 0.6471, "num_input_tokens_seen": 56255048, "step": 96880 }, { "epoch": 14.43029490616622, "grad_norm": 1.5830682516098022, "learning_rate": 1.091132265019402e-05, "loss": 0.6302, "num_input_tokens_seen": 56257960, "step": 96885 }, { "epoch": 14.43103961870718, "grad_norm": 1.85404372215271, "learning_rate": 1.0908638474336172e-05, "loss": 0.7686, "num_input_tokens_seen": 56260840, "step": 96890 }, { "epoch": 14.431784331248139, "grad_norm": 1.3737430572509766, "learning_rate": 1.0905954536537551e-05, "loss": 0.5681, "num_input_tokens_seen": 56263592, "step": 96895 }, { "epoch": 14.432529043789097, "grad_norm": 1.3655544519424438, "learning_rate": 1.0903270836843499e-05, "loss": 0.5084, "num_input_tokens_seen": 56266440, "step": 96900 }, { "epoch": 14.433273756330056, "grad_norm": 1.120810627937317, "learning_rate": 1.0900587375299365e-05, "loss": 0.6361, "num_input_tokens_seen": 56269448, "step": 96905 }, { "epoch": 14.434018468871017, "grad_norm": 1.3315025568008423, "learning_rate": 1.0897904151950469e-05, "loss": 0.6908, "num_input_tokens_seen": 56272392, "step": 96910 }, { "epoch": 14.434763181411975, "grad_norm": 1.3522433042526245, "learning_rate": 1.089522116684216e-05, "loss": 0.6214, "num_input_tokens_seen": 56274920, "step": 96915 }, { "epoch": 14.435507893952934, "grad_norm": 1.1851938962936401, "learning_rate": 1.0892538420019744e-05, "loss": 0.7573, "num_input_tokens_seen": 56277928, "step": 96920 }, { "epoch": 14.436252606493893, "grad_norm": 0.8861302733421326, "learning_rate": 1.0889855911528562e-05, "loss": 0.392, "num_input_tokens_seen": 56280552, "step": 96925 }, { "epoch": 14.436997319034852, "grad_norm": 2.0265755653381348, "learning_rate": 1.0887173641413923e-05, "loss": 0.5028, "num_input_tokens_seen": 56283592, "step": 96930 }, { "epoch": 14.437742031575812, "grad_norm": 1.0356062650680542, "learning_rate": 1.0884491609721133e-05, "loss": 0.5111, "num_input_tokens_seen": 56286600, "step": 96935 }, { "epoch": 14.438486744116771, "grad_norm": 1.605484962463379, "learning_rate": 1.088180981649552e-05, "loss": 0.5176, "num_input_tokens_seen": 56289096, "step": 96940 }, { "epoch": 14.43923145665773, "grad_norm": 0.722908616065979, "learning_rate": 1.0879128261782382e-05, "loss": 0.6505, "num_input_tokens_seen": 56292072, "step": 96945 }, { "epoch": 14.439976169198689, "grad_norm": 2.7434661388397217, "learning_rate": 1.0876446945627019e-05, "loss": 0.687, "num_input_tokens_seen": 56294920, "step": 96950 }, { "epoch": 14.440720881739649, "grad_norm": 1.3582379817962646, "learning_rate": 1.0873765868074723e-05, "loss": 0.7022, "num_input_tokens_seen": 56298184, "step": 96955 }, { "epoch": 14.441465594280608, "grad_norm": 2.2270467281341553, "learning_rate": 1.0871085029170802e-05, "loss": 0.6725, "num_input_tokens_seen": 56301256, "step": 96960 }, { "epoch": 14.442210306821567, "grad_norm": 0.9540914297103882, "learning_rate": 1.0868404428960532e-05, "loss": 0.4887, "num_input_tokens_seen": 56303944, "step": 96965 }, { "epoch": 14.442955019362525, "grad_norm": 1.4450006484985352, "learning_rate": 1.0865724067489214e-05, "loss": 0.5614, "num_input_tokens_seen": 56306920, "step": 96970 }, { "epoch": 14.443699731903486, "grad_norm": 2.1795763969421387, "learning_rate": 1.0863043944802123e-05, "loss": 0.6728, "num_input_tokens_seen": 56309736, "step": 96975 }, { "epoch": 14.444444444444445, "grad_norm": 1.7266048192977905, "learning_rate": 1.0860364060944527e-05, "loss": 0.4394, "num_input_tokens_seen": 56312616, "step": 96980 }, { "epoch": 14.445189156985403, "grad_norm": 3.315971851348877, "learning_rate": 1.0857684415961721e-05, "loss": 0.7309, "num_input_tokens_seen": 56315752, "step": 96985 }, { "epoch": 14.445933869526362, "grad_norm": 1.3905912637710571, "learning_rate": 1.0855005009898953e-05, "loss": 0.4808, "num_input_tokens_seen": 56318536, "step": 96990 }, { "epoch": 14.446678582067323, "grad_norm": 1.2618075609207153, "learning_rate": 1.0852325842801506e-05, "loss": 0.4699, "num_input_tokens_seen": 56321160, "step": 96995 }, { "epoch": 14.447423294608281, "grad_norm": 2.8085005283355713, "learning_rate": 1.0849646914714628e-05, "loss": 0.8594, "num_input_tokens_seen": 56324392, "step": 97000 }, { "epoch": 14.44816800714924, "grad_norm": 1.2744848728179932, "learning_rate": 1.0846968225683591e-05, "loss": 0.76, "num_input_tokens_seen": 56327272, "step": 97005 }, { "epoch": 14.448912719690199, "grad_norm": 1.5973844528198242, "learning_rate": 1.0844289775753645e-05, "loss": 0.5877, "num_input_tokens_seen": 56330216, "step": 97010 }, { "epoch": 14.44965743223116, "grad_norm": 0.7294389009475708, "learning_rate": 1.084161156497003e-05, "loss": 0.5032, "num_input_tokens_seen": 56333000, "step": 97015 }, { "epoch": 14.450402144772118, "grad_norm": 2.1260855197906494, "learning_rate": 1.0838933593378e-05, "loss": 0.6292, "num_input_tokens_seen": 56335944, "step": 97020 }, { "epoch": 14.451146857313077, "grad_norm": 2.177976131439209, "learning_rate": 1.0836255861022788e-05, "loss": 0.5136, "num_input_tokens_seen": 56339048, "step": 97025 }, { "epoch": 14.451891569854036, "grad_norm": 1.3431211709976196, "learning_rate": 1.0833578367949646e-05, "loss": 0.5495, "num_input_tokens_seen": 56341800, "step": 97030 }, { "epoch": 14.452636282394996, "grad_norm": 1.8167976140975952, "learning_rate": 1.0830901114203786e-05, "loss": 0.4706, "num_input_tokens_seen": 56344520, "step": 97035 }, { "epoch": 14.453380994935955, "grad_norm": 1.9979265928268433, "learning_rate": 1.0828224099830464e-05, "loss": 0.6316, "num_input_tokens_seen": 56347816, "step": 97040 }, { "epoch": 14.454125707476914, "grad_norm": 1.3086546659469604, "learning_rate": 1.0825547324874883e-05, "loss": 0.5477, "num_input_tokens_seen": 56350888, "step": 97045 }, { "epoch": 14.454870420017873, "grad_norm": 1.2587151527404785, "learning_rate": 1.0822870789382283e-05, "loss": 0.5578, "num_input_tokens_seen": 56353800, "step": 97050 }, { "epoch": 14.455615132558833, "grad_norm": 1.14518141746521, "learning_rate": 1.082019449339787e-05, "loss": 0.3766, "num_input_tokens_seen": 56356680, "step": 97055 }, { "epoch": 14.456359845099792, "grad_norm": 1.3140826225280762, "learning_rate": 1.0817518436966852e-05, "loss": 0.5342, "num_input_tokens_seen": 56359624, "step": 97060 }, { "epoch": 14.45710455764075, "grad_norm": 1.1834685802459717, "learning_rate": 1.0814842620134456e-05, "loss": 0.6331, "num_input_tokens_seen": 56362632, "step": 97065 }, { "epoch": 14.45784927018171, "grad_norm": 3.1425282955169678, "learning_rate": 1.0812167042945864e-05, "loss": 0.6428, "num_input_tokens_seen": 56365352, "step": 97070 }, { "epoch": 14.458593982722668, "grad_norm": 1.266772747039795, "learning_rate": 1.08094917054463e-05, "loss": 0.4692, "num_input_tokens_seen": 56368200, "step": 97075 }, { "epoch": 14.459338695263629, "grad_norm": 2.01350998878479, "learning_rate": 1.0806816607680954e-05, "loss": 0.5105, "num_input_tokens_seen": 56370888, "step": 97080 }, { "epoch": 14.460083407804587, "grad_norm": 1.7109414339065552, "learning_rate": 1.0804141749695012e-05, "loss": 0.591, "num_input_tokens_seen": 56373928, "step": 97085 }, { "epoch": 14.460828120345546, "grad_norm": 2.2607429027557373, "learning_rate": 1.0801467131533669e-05, "loss": 0.5976, "num_input_tokens_seen": 56376552, "step": 97090 }, { "epoch": 14.461572832886505, "grad_norm": 2.0635387897491455, "learning_rate": 1.0798792753242099e-05, "loss": 0.6043, "num_input_tokens_seen": 56379400, "step": 97095 }, { "epoch": 14.462317545427466, "grad_norm": 1.4642685651779175, "learning_rate": 1.0796118614865503e-05, "loss": 0.7175, "num_input_tokens_seen": 56382408, "step": 97100 }, { "epoch": 14.463062257968424, "grad_norm": 1.5776416063308716, "learning_rate": 1.0793444716449033e-05, "loss": 0.5425, "num_input_tokens_seen": 56385000, "step": 97105 }, { "epoch": 14.463806970509383, "grad_norm": 2.7422471046447754, "learning_rate": 1.0790771058037889e-05, "loss": 0.6745, "num_input_tokens_seen": 56388200, "step": 97110 }, { "epoch": 14.464551683050342, "grad_norm": 1.4940567016601562, "learning_rate": 1.0788097639677216e-05, "loss": 0.6463, "num_input_tokens_seen": 56390984, "step": 97115 }, { "epoch": 14.465296395591302, "grad_norm": 3.1120707988739014, "learning_rate": 1.0785424461412197e-05, "loss": 0.5787, "num_input_tokens_seen": 56393704, "step": 97120 }, { "epoch": 14.466041108132261, "grad_norm": 2.081193447113037, "learning_rate": 1.0782751523287977e-05, "loss": 0.6853, "num_input_tokens_seen": 56396616, "step": 97125 }, { "epoch": 14.46678582067322, "grad_norm": 2.023292064666748, "learning_rate": 1.0780078825349729e-05, "loss": 0.5642, "num_input_tokens_seen": 56399528, "step": 97130 }, { "epoch": 14.467530533214179, "grad_norm": 1.16190505027771, "learning_rate": 1.0777406367642595e-05, "loss": 0.4004, "num_input_tokens_seen": 56402312, "step": 97135 }, { "epoch": 14.46827524575514, "grad_norm": 2.8941545486450195, "learning_rate": 1.0774734150211718e-05, "loss": 0.5419, "num_input_tokens_seen": 56405032, "step": 97140 }, { "epoch": 14.469019958296098, "grad_norm": 0.9981409907341003, "learning_rate": 1.077206217310226e-05, "loss": 0.6227, "num_input_tokens_seen": 56407624, "step": 97145 }, { "epoch": 14.469764670837057, "grad_norm": 0.8596842885017395, "learning_rate": 1.0769390436359348e-05, "loss": 0.5283, "num_input_tokens_seen": 56410568, "step": 97150 }, { "epoch": 14.470509383378015, "grad_norm": 1.5894683599472046, "learning_rate": 1.0766718940028123e-05, "loss": 0.4289, "num_input_tokens_seen": 56413736, "step": 97155 }, { "epoch": 14.471254095918976, "grad_norm": 1.9079235792160034, "learning_rate": 1.0764047684153705e-05, "loss": 0.5065, "num_input_tokens_seen": 56416936, "step": 97160 }, { "epoch": 14.471998808459935, "grad_norm": 2.012469530105591, "learning_rate": 1.0761376668781244e-05, "loss": 0.5762, "num_input_tokens_seen": 56420008, "step": 97165 }, { "epoch": 14.472743521000893, "grad_norm": 1.7118027210235596, "learning_rate": 1.0758705893955843e-05, "loss": 0.612, "num_input_tokens_seen": 56422760, "step": 97170 }, { "epoch": 14.473488233541852, "grad_norm": 2.499028444290161, "learning_rate": 1.0756035359722639e-05, "loss": 0.5407, "num_input_tokens_seen": 56425832, "step": 97175 }, { "epoch": 14.474232946082813, "grad_norm": 1.9511996507644653, "learning_rate": 1.0753365066126741e-05, "loss": 0.5106, "num_input_tokens_seen": 56428552, "step": 97180 }, { "epoch": 14.474977658623772, "grad_norm": 1.1725703477859497, "learning_rate": 1.0750695013213251e-05, "loss": 0.7411, "num_input_tokens_seen": 56431272, "step": 97185 }, { "epoch": 14.47572237116473, "grad_norm": 1.4333758354187012, "learning_rate": 1.0748025201027298e-05, "loss": 0.5177, "num_input_tokens_seen": 56434056, "step": 97190 }, { "epoch": 14.476467083705689, "grad_norm": 1.9389499425888062, "learning_rate": 1.0745355629613965e-05, "loss": 0.7043, "num_input_tokens_seen": 56437064, "step": 97195 }, { "epoch": 14.47721179624665, "grad_norm": 1.6405104398727417, "learning_rate": 1.0742686299018368e-05, "loss": 0.4622, "num_input_tokens_seen": 56439944, "step": 97200 }, { "epoch": 14.477956508787608, "grad_norm": 2.939011573791504, "learning_rate": 1.0740017209285597e-05, "loss": 0.6231, "num_input_tokens_seen": 56442856, "step": 97205 }, { "epoch": 14.478701221328567, "grad_norm": 2.3417134284973145, "learning_rate": 1.0737348360460733e-05, "loss": 0.5971, "num_input_tokens_seen": 56445864, "step": 97210 }, { "epoch": 14.479445933869526, "grad_norm": 1.4436237812042236, "learning_rate": 1.073467975258888e-05, "loss": 0.6155, "num_input_tokens_seen": 56448648, "step": 97215 }, { "epoch": 14.480190646410486, "grad_norm": 1.215097188949585, "learning_rate": 1.0732011385715116e-05, "loss": 0.6716, "num_input_tokens_seen": 56451304, "step": 97220 }, { "epoch": 14.480935358951445, "grad_norm": 1.28305983543396, "learning_rate": 1.0729343259884516e-05, "loss": 0.6156, "num_input_tokens_seen": 56454312, "step": 97225 }, { "epoch": 14.481680071492404, "grad_norm": 1.5759861469268799, "learning_rate": 1.0726675375142151e-05, "loss": 0.6715, "num_input_tokens_seen": 56457608, "step": 97230 }, { "epoch": 14.482424784033363, "grad_norm": 2.2249393463134766, "learning_rate": 1.0724007731533107e-05, "loss": 0.6746, "num_input_tokens_seen": 56460584, "step": 97235 }, { "epoch": 14.483169496574323, "grad_norm": 2.6401519775390625, "learning_rate": 1.072134032910243e-05, "loss": 0.6634, "num_input_tokens_seen": 56463624, "step": 97240 }, { "epoch": 14.483914209115282, "grad_norm": 1.1626008749008179, "learning_rate": 1.071867316789521e-05, "loss": 0.3824, "num_input_tokens_seen": 56466920, "step": 97245 }, { "epoch": 14.48465892165624, "grad_norm": 1.5245895385742188, "learning_rate": 1.0716006247956481e-05, "loss": 0.6041, "num_input_tokens_seen": 56470056, "step": 97250 }, { "epoch": 14.4854036341972, "grad_norm": 1.29891836643219, "learning_rate": 1.0713339569331318e-05, "loss": 0.5592, "num_input_tokens_seen": 56473224, "step": 97255 }, { "epoch": 14.486148346738158, "grad_norm": 3.0837643146514893, "learning_rate": 1.0710673132064764e-05, "loss": 0.7048, "num_input_tokens_seen": 56476104, "step": 97260 }, { "epoch": 14.486893059279119, "grad_norm": 1.3174771070480347, "learning_rate": 1.0708006936201853e-05, "loss": 0.6452, "num_input_tokens_seen": 56479272, "step": 97265 }, { "epoch": 14.487637771820078, "grad_norm": 1.5028362274169922, "learning_rate": 1.0705340981787648e-05, "loss": 0.7662, "num_input_tokens_seen": 56482280, "step": 97270 }, { "epoch": 14.488382484361036, "grad_norm": 2.028752565383911, "learning_rate": 1.070267526886718e-05, "loss": 0.641, "num_input_tokens_seen": 56485032, "step": 97275 }, { "epoch": 14.489127196901995, "grad_norm": 1.7487887144088745, "learning_rate": 1.0700009797485483e-05, "loss": 0.6594, "num_input_tokens_seen": 56488072, "step": 97280 }, { "epoch": 14.489871909442956, "grad_norm": 1.8520054817199707, "learning_rate": 1.0697344567687575e-05, "loss": 0.6389, "num_input_tokens_seen": 56490920, "step": 97285 }, { "epoch": 14.490616621983914, "grad_norm": 2.838735580444336, "learning_rate": 1.0694679579518508e-05, "loss": 0.7126, "num_input_tokens_seen": 56493672, "step": 97290 }, { "epoch": 14.491361334524873, "grad_norm": 1.718475103378296, "learning_rate": 1.0692014833023283e-05, "loss": 0.6507, "num_input_tokens_seen": 56496424, "step": 97295 }, { "epoch": 14.492106047065832, "grad_norm": 0.5149000883102417, "learning_rate": 1.0689350328246922e-05, "loss": 0.3028, "num_input_tokens_seen": 56499144, "step": 97300 }, { "epoch": 14.492850759606792, "grad_norm": 1.5684243440628052, "learning_rate": 1.068668606523445e-05, "loss": 0.5024, "num_input_tokens_seen": 56502056, "step": 97305 }, { "epoch": 14.493595472147751, "grad_norm": 2.3348212242126465, "learning_rate": 1.0684022044030861e-05, "loss": 0.5534, "num_input_tokens_seen": 56504648, "step": 97310 }, { "epoch": 14.49434018468871, "grad_norm": 0.8051902055740356, "learning_rate": 1.0681358264681176e-05, "loss": 0.7512, "num_input_tokens_seen": 56507496, "step": 97315 }, { "epoch": 14.495084897229669, "grad_norm": 2.352097749710083, "learning_rate": 1.0678694727230384e-05, "loss": 0.7359, "num_input_tokens_seen": 56510312, "step": 97320 }, { "epoch": 14.49582960977063, "grad_norm": 2.1826601028442383, "learning_rate": 1.0676031431723497e-05, "loss": 0.4883, "num_input_tokens_seen": 56513288, "step": 97325 }, { "epoch": 14.496574322311588, "grad_norm": 1.7404701709747314, "learning_rate": 1.0673368378205492e-05, "loss": 0.5125, "num_input_tokens_seen": 56515944, "step": 97330 }, { "epoch": 14.497319034852547, "grad_norm": 0.9336320757865906, "learning_rate": 1.0670705566721376e-05, "loss": 0.7204, "num_input_tokens_seen": 56518888, "step": 97335 }, { "epoch": 14.498063747393505, "grad_norm": 1.4862918853759766, "learning_rate": 1.0668042997316126e-05, "loss": 0.5055, "num_input_tokens_seen": 56521736, "step": 97340 }, { "epoch": 14.498808459934466, "grad_norm": 2.994311809539795, "learning_rate": 1.0665380670034725e-05, "loss": 0.6397, "num_input_tokens_seen": 56524328, "step": 97345 }, { "epoch": 14.499553172475425, "grad_norm": 3.0165512561798096, "learning_rate": 1.0662718584922145e-05, "loss": 0.6372, "num_input_tokens_seen": 56527336, "step": 97350 }, { "epoch": 14.500297885016384, "grad_norm": 1.8226698637008667, "learning_rate": 1.0660056742023355e-05, "loss": 0.6117, "num_input_tokens_seen": 56530088, "step": 97355 }, { "epoch": 14.501042597557342, "grad_norm": 1.0942578315734863, "learning_rate": 1.0657395141383342e-05, "loss": 0.4125, "num_input_tokens_seen": 56532904, "step": 97360 }, { "epoch": 14.501787310098303, "grad_norm": 1.6679234504699707, "learning_rate": 1.0654733783047052e-05, "loss": 0.5258, "num_input_tokens_seen": 56536040, "step": 97365 }, { "epoch": 14.502532022639262, "grad_norm": 2.982822895050049, "learning_rate": 1.0652072667059462e-05, "loss": 0.7622, "num_input_tokens_seen": 56538952, "step": 97370 }, { "epoch": 14.50327673518022, "grad_norm": 1.7925209999084473, "learning_rate": 1.0649411793465525e-05, "loss": 0.674, "num_input_tokens_seen": 56541928, "step": 97375 }, { "epoch": 14.504021447721179, "grad_norm": 1.2452905178070068, "learning_rate": 1.0646751162310178e-05, "loss": 0.5789, "num_input_tokens_seen": 56544808, "step": 97380 }, { "epoch": 14.50476616026214, "grad_norm": 1.6568996906280518, "learning_rate": 1.0644090773638394e-05, "loss": 0.5313, "num_input_tokens_seen": 56547400, "step": 97385 }, { "epoch": 14.505510872803098, "grad_norm": 0.8164774179458618, "learning_rate": 1.0641430627495094e-05, "loss": 0.523, "num_input_tokens_seen": 56550056, "step": 97390 }, { "epoch": 14.506255585344057, "grad_norm": 0.8640784621238708, "learning_rate": 1.063877072392524e-05, "loss": 0.5485, "num_input_tokens_seen": 56552872, "step": 97395 }, { "epoch": 14.507000297885016, "grad_norm": 1.4267935752868652, "learning_rate": 1.063611106297375e-05, "loss": 0.4596, "num_input_tokens_seen": 56555848, "step": 97400 }, { "epoch": 14.507745010425975, "grad_norm": 1.1973457336425781, "learning_rate": 1.0633451644685572e-05, "loss": 0.428, "num_input_tokens_seen": 56558952, "step": 97405 }, { "epoch": 14.508489722966935, "grad_norm": 1.872606873512268, "learning_rate": 1.063079246910563e-05, "loss": 0.653, "num_input_tokens_seen": 56562088, "step": 97410 }, { "epoch": 14.509234435507894, "grad_norm": 1.0322448015213013, "learning_rate": 1.0628133536278842e-05, "loss": 0.5493, "num_input_tokens_seen": 56564872, "step": 97415 }, { "epoch": 14.509979148048853, "grad_norm": 1.6785475015640259, "learning_rate": 1.0625474846250134e-05, "loss": 0.7785, "num_input_tokens_seen": 56567848, "step": 97420 }, { "epoch": 14.510723860589813, "grad_norm": 2.8797144889831543, "learning_rate": 1.062281639906441e-05, "loss": 0.9994, "num_input_tokens_seen": 56570696, "step": 97425 }, { "epoch": 14.511468573130772, "grad_norm": 0.8524937033653259, "learning_rate": 1.0620158194766597e-05, "loss": 0.5539, "num_input_tokens_seen": 56573448, "step": 97430 }, { "epoch": 14.51221328567173, "grad_norm": 1.0630496740341187, "learning_rate": 1.0617500233401587e-05, "loss": 0.4925, "num_input_tokens_seen": 56576424, "step": 97435 }, { "epoch": 14.51295799821269, "grad_norm": 1.731955647468567, "learning_rate": 1.0614842515014303e-05, "loss": 0.7006, "num_input_tokens_seen": 56579432, "step": 97440 }, { "epoch": 14.513702710753648, "grad_norm": 1.5148371458053589, "learning_rate": 1.0612185039649625e-05, "loss": 0.6443, "num_input_tokens_seen": 56582728, "step": 97445 }, { "epoch": 14.514447423294609, "grad_norm": 1.8519552946090698, "learning_rate": 1.0609527807352469e-05, "loss": 0.5388, "num_input_tokens_seen": 56585672, "step": 97450 }, { "epoch": 14.515192135835568, "grad_norm": 1.273348093032837, "learning_rate": 1.060687081816771e-05, "loss": 0.6544, "num_input_tokens_seen": 56588776, "step": 97455 }, { "epoch": 14.515936848376526, "grad_norm": 2.7092065811157227, "learning_rate": 1.0604214072140233e-05, "loss": 0.6362, "num_input_tokens_seen": 56591656, "step": 97460 }, { "epoch": 14.516681560917485, "grad_norm": 1.7376682758331299, "learning_rate": 1.0601557569314941e-05, "loss": 0.5222, "num_input_tokens_seen": 56594600, "step": 97465 }, { "epoch": 14.517426273458446, "grad_norm": 1.3605647087097168, "learning_rate": 1.0598901309736686e-05, "loss": 0.4788, "num_input_tokens_seen": 56597448, "step": 97470 }, { "epoch": 14.518170985999404, "grad_norm": 1.724701166152954, "learning_rate": 1.0596245293450368e-05, "loss": 0.5846, "num_input_tokens_seen": 56600424, "step": 97475 }, { "epoch": 14.518915698540363, "grad_norm": 2.1354806423187256, "learning_rate": 1.0593589520500846e-05, "loss": 0.5066, "num_input_tokens_seen": 56603464, "step": 97480 }, { "epoch": 14.519660411081322, "grad_norm": 0.8025646805763245, "learning_rate": 1.0590933990932989e-05, "loss": 0.5396, "num_input_tokens_seen": 56606344, "step": 97485 }, { "epoch": 14.520405123622282, "grad_norm": 1.0716246366500854, "learning_rate": 1.0588278704791646e-05, "loss": 0.6206, "num_input_tokens_seen": 56609384, "step": 97490 }, { "epoch": 14.521149836163241, "grad_norm": 1.8750697374343872, "learning_rate": 1.0585623662121696e-05, "loss": 0.7284, "num_input_tokens_seen": 56612008, "step": 97495 }, { "epoch": 14.5218945487042, "grad_norm": 1.773057222366333, "learning_rate": 1.0582968862967984e-05, "loss": 0.6252, "num_input_tokens_seen": 56614824, "step": 97500 }, { "epoch": 14.522639261245159, "grad_norm": 1.8556241989135742, "learning_rate": 1.0580314307375352e-05, "loss": 0.8906, "num_input_tokens_seen": 56617576, "step": 97505 }, { "epoch": 14.52338397378612, "grad_norm": 1.6912508010864258, "learning_rate": 1.0577659995388664e-05, "loss": 0.6254, "num_input_tokens_seen": 56620584, "step": 97510 }, { "epoch": 14.524128686327078, "grad_norm": 1.7215073108673096, "learning_rate": 1.0575005927052743e-05, "loss": 0.6272, "num_input_tokens_seen": 56623272, "step": 97515 }, { "epoch": 14.524873398868037, "grad_norm": 2.1221859455108643, "learning_rate": 1.0572352102412442e-05, "loss": 0.5993, "num_input_tokens_seen": 56626088, "step": 97520 }, { "epoch": 14.525618111408996, "grad_norm": 1.185351848602295, "learning_rate": 1.0569698521512583e-05, "loss": 0.5246, "num_input_tokens_seen": 56628904, "step": 97525 }, { "epoch": 14.526362823949956, "grad_norm": 1.340514063835144, "learning_rate": 1.0567045184398009e-05, "loss": 0.6456, "num_input_tokens_seen": 56631624, "step": 97530 }, { "epoch": 14.527107536490915, "grad_norm": 2.1218039989471436, "learning_rate": 1.0564392091113537e-05, "loss": 0.6718, "num_input_tokens_seen": 56634536, "step": 97535 }, { "epoch": 14.527852249031874, "grad_norm": 1.9695175886154175, "learning_rate": 1.056173924170398e-05, "loss": 0.6735, "num_input_tokens_seen": 56637384, "step": 97540 }, { "epoch": 14.528596961572832, "grad_norm": 0.9907840490341187, "learning_rate": 1.0559086636214174e-05, "loss": 0.5439, "num_input_tokens_seen": 56640072, "step": 97545 }, { "epoch": 14.529341674113793, "grad_norm": 1.9617056846618652, "learning_rate": 1.0556434274688923e-05, "loss": 0.6346, "num_input_tokens_seen": 56643112, "step": 97550 }, { "epoch": 14.530086386654752, "grad_norm": 1.977974534034729, "learning_rate": 1.0553782157173034e-05, "loss": 0.7278, "num_input_tokens_seen": 56646024, "step": 97555 }, { "epoch": 14.53083109919571, "grad_norm": 2.0756590366363525, "learning_rate": 1.0551130283711302e-05, "loss": 0.8216, "num_input_tokens_seen": 56649032, "step": 97560 }, { "epoch": 14.53157581173667, "grad_norm": 1.256609320640564, "learning_rate": 1.0548478654348551e-05, "loss": 0.5562, "num_input_tokens_seen": 56651752, "step": 97565 }, { "epoch": 14.53232052427763, "grad_norm": 1.3703887462615967, "learning_rate": 1.0545827269129554e-05, "loss": 0.4285, "num_input_tokens_seen": 56654536, "step": 97570 }, { "epoch": 14.533065236818588, "grad_norm": 1.1531192064285278, "learning_rate": 1.0543176128099126e-05, "loss": 0.6579, "num_input_tokens_seen": 56657256, "step": 97575 }, { "epoch": 14.533809949359547, "grad_norm": 1.2140260934829712, "learning_rate": 1.0540525231302043e-05, "loss": 0.4037, "num_input_tokens_seen": 56659880, "step": 97580 }, { "epoch": 14.534554661900506, "grad_norm": 1.7510261535644531, "learning_rate": 1.053787457878308e-05, "loss": 0.5381, "num_input_tokens_seen": 56662760, "step": 97585 }, { "epoch": 14.535299374441465, "grad_norm": 2.2515218257904053, "learning_rate": 1.0535224170587038e-05, "loss": 0.517, "num_input_tokens_seen": 56665672, "step": 97590 }, { "epoch": 14.536044086982425, "grad_norm": 1.2481460571289062, "learning_rate": 1.0532574006758673e-05, "loss": 0.4454, "num_input_tokens_seen": 56668712, "step": 97595 }, { "epoch": 14.536788799523384, "grad_norm": 1.530695915222168, "learning_rate": 1.0529924087342774e-05, "loss": 0.7067, "num_input_tokens_seen": 56671496, "step": 97600 }, { "epoch": 14.537533512064343, "grad_norm": 1.3132352828979492, "learning_rate": 1.05272744123841e-05, "loss": 0.6162, "num_input_tokens_seen": 56674312, "step": 97605 }, { "epoch": 14.538278224605303, "grad_norm": 1.1987003087997437, "learning_rate": 1.0524624981927416e-05, "loss": 0.532, "num_input_tokens_seen": 56677192, "step": 97610 }, { "epoch": 14.539022937146262, "grad_norm": 1.1998811960220337, "learning_rate": 1.0521975796017483e-05, "loss": 0.5578, "num_input_tokens_seen": 56680168, "step": 97615 }, { "epoch": 14.53976764968722, "grad_norm": 1.241721272468567, "learning_rate": 1.0519326854699043e-05, "loss": 0.5246, "num_input_tokens_seen": 56682728, "step": 97620 }, { "epoch": 14.54051236222818, "grad_norm": 1.2543736696243286, "learning_rate": 1.0516678158016868e-05, "loss": 0.4856, "num_input_tokens_seen": 56685608, "step": 97625 }, { "epoch": 14.541257074769138, "grad_norm": 1.2541989088058472, "learning_rate": 1.0514029706015687e-05, "loss": 0.6088, "num_input_tokens_seen": 56688776, "step": 97630 }, { "epoch": 14.542001787310099, "grad_norm": 1.3053230047225952, "learning_rate": 1.051138149874026e-05, "loss": 0.5518, "num_input_tokens_seen": 56691528, "step": 97635 }, { "epoch": 14.542746499851058, "grad_norm": 1.4963732957839966, "learning_rate": 1.0508733536235307e-05, "loss": 0.7596, "num_input_tokens_seen": 56694568, "step": 97640 }, { "epoch": 14.543491212392016, "grad_norm": 3.2889621257781982, "learning_rate": 1.0506085818545582e-05, "loss": 0.6387, "num_input_tokens_seen": 56697768, "step": 97645 }, { "epoch": 14.544235924932975, "grad_norm": 1.968565821647644, "learning_rate": 1.0503438345715798e-05, "loss": 0.5152, "num_input_tokens_seen": 56700936, "step": 97650 }, { "epoch": 14.544980637473936, "grad_norm": 2.5348196029663086, "learning_rate": 1.0500791117790699e-05, "loss": 0.5773, "num_input_tokens_seen": 56703560, "step": 97655 }, { "epoch": 14.545725350014894, "grad_norm": 1.8714468479156494, "learning_rate": 1.0498144134814996e-05, "loss": 0.6606, "num_input_tokens_seen": 56706568, "step": 97660 }, { "epoch": 14.546470062555853, "grad_norm": 1.4267922639846802, "learning_rate": 1.04954973968334e-05, "loss": 0.5219, "num_input_tokens_seen": 56709512, "step": 97665 }, { "epoch": 14.547214775096812, "grad_norm": 1.6160892248153687, "learning_rate": 1.0492850903890644e-05, "loss": 0.6044, "num_input_tokens_seen": 56712616, "step": 97670 }, { "epoch": 14.547959487637772, "grad_norm": 2.9002161026000977, "learning_rate": 1.0490204656031427e-05, "loss": 0.4845, "num_input_tokens_seen": 56715272, "step": 97675 }, { "epoch": 14.548704200178731, "grad_norm": 0.8928161859512329, "learning_rate": 1.0487558653300455e-05, "loss": 0.4345, "num_input_tokens_seen": 56718248, "step": 97680 }, { "epoch": 14.54944891271969, "grad_norm": 1.1527369022369385, "learning_rate": 1.0484912895742422e-05, "loss": 0.7515, "num_input_tokens_seen": 56720808, "step": 97685 }, { "epoch": 14.550193625260649, "grad_norm": 1.0522807836532593, "learning_rate": 1.0482267383402041e-05, "loss": 0.5522, "num_input_tokens_seen": 56723752, "step": 97690 }, { "epoch": 14.55093833780161, "grad_norm": 1.7024435997009277, "learning_rate": 1.0479622116323997e-05, "loss": 0.5848, "num_input_tokens_seen": 56726792, "step": 97695 }, { "epoch": 14.551683050342568, "grad_norm": 1.4388591051101685, "learning_rate": 1.047697709455297e-05, "loss": 0.5619, "num_input_tokens_seen": 56729576, "step": 97700 }, { "epoch": 14.552427762883527, "grad_norm": 1.4701838493347168, "learning_rate": 1.0474332318133664e-05, "loss": 0.6926, "num_input_tokens_seen": 56732424, "step": 97705 }, { "epoch": 14.553172475424486, "grad_norm": 1.0707223415374756, "learning_rate": 1.0471687787110743e-05, "loss": 0.5945, "num_input_tokens_seen": 56735208, "step": 97710 }, { "epoch": 14.553917187965446, "grad_norm": 1.462269902229309, "learning_rate": 1.04690435015289e-05, "loss": 0.5138, "num_input_tokens_seen": 56737992, "step": 97715 }, { "epoch": 14.554661900506405, "grad_norm": 1.9740986824035645, "learning_rate": 1.0466399461432785e-05, "loss": 0.6955, "num_input_tokens_seen": 56740968, "step": 97720 }, { "epoch": 14.555406613047364, "grad_norm": 1.4905425310134888, "learning_rate": 1.0463755666867093e-05, "loss": 0.6091, "num_input_tokens_seen": 56743880, "step": 97725 }, { "epoch": 14.556151325588322, "grad_norm": 1.102974534034729, "learning_rate": 1.0461112117876464e-05, "loss": 0.654, "num_input_tokens_seen": 56746568, "step": 97730 }, { "epoch": 14.556896038129283, "grad_norm": 2.1390678882598877, "learning_rate": 1.0458468814505578e-05, "loss": 0.6373, "num_input_tokens_seen": 56749448, "step": 97735 }, { "epoch": 14.557640750670242, "grad_norm": 1.1093376874923706, "learning_rate": 1.0455825756799084e-05, "loss": 0.5714, "num_input_tokens_seen": 56752392, "step": 97740 }, { "epoch": 14.5583854632112, "grad_norm": 1.3563404083251953, "learning_rate": 1.0453182944801631e-05, "loss": 0.5698, "num_input_tokens_seen": 56755624, "step": 97745 }, { "epoch": 14.55913017575216, "grad_norm": 1.9582505226135254, "learning_rate": 1.045054037855787e-05, "loss": 0.6358, "num_input_tokens_seen": 56758792, "step": 97750 }, { "epoch": 14.55987488829312, "grad_norm": 1.9494673013687134, "learning_rate": 1.0447898058112427e-05, "loss": 0.4405, "num_input_tokens_seen": 56762184, "step": 97755 }, { "epoch": 14.560619600834078, "grad_norm": 0.9527035355567932, "learning_rate": 1.0445255983509969e-05, "loss": 0.5624, "num_input_tokens_seen": 56764872, "step": 97760 }, { "epoch": 14.561364313375037, "grad_norm": 2.2431256771087646, "learning_rate": 1.0442614154795107e-05, "loss": 0.6299, "num_input_tokens_seen": 56767848, "step": 97765 }, { "epoch": 14.562109025915996, "grad_norm": 1.1666406393051147, "learning_rate": 1.0439972572012496e-05, "loss": 0.4859, "num_input_tokens_seen": 56770888, "step": 97770 }, { "epoch": 14.562853738456955, "grad_norm": 2.5964698791503906, "learning_rate": 1.0437331235206737e-05, "loss": 0.7707, "num_input_tokens_seen": 56773608, "step": 97775 }, { "epoch": 14.563598450997915, "grad_norm": 2.5221049785614014, "learning_rate": 1.043469014442248e-05, "loss": 0.6067, "num_input_tokens_seen": 56776680, "step": 97780 }, { "epoch": 14.564343163538874, "grad_norm": 1.8358662128448486, "learning_rate": 1.0432049299704324e-05, "loss": 0.514, "num_input_tokens_seen": 56779432, "step": 97785 }, { "epoch": 14.565087876079833, "grad_norm": 2.777149200439453, "learning_rate": 1.0429408701096884e-05, "loss": 0.528, "num_input_tokens_seen": 56782120, "step": 97790 }, { "epoch": 14.565832588620792, "grad_norm": 2.0411181449890137, "learning_rate": 1.0426768348644782e-05, "loss": 0.6295, "num_input_tokens_seen": 56785064, "step": 97795 }, { "epoch": 14.566577301161752, "grad_norm": 1.5655403137207031, "learning_rate": 1.042412824239261e-05, "loss": 0.5354, "num_input_tokens_seen": 56787848, "step": 97800 }, { "epoch": 14.56732201370271, "grad_norm": 2.676297903060913, "learning_rate": 1.0421488382384986e-05, "loss": 0.6741, "num_input_tokens_seen": 56790536, "step": 97805 }, { "epoch": 14.56806672624367, "grad_norm": 1.9535794258117676, "learning_rate": 1.0418848768666498e-05, "loss": 0.5806, "num_input_tokens_seen": 56793416, "step": 97810 }, { "epoch": 14.568811438784628, "grad_norm": 5.264516830444336, "learning_rate": 1.041620940128174e-05, "loss": 0.5415, "num_input_tokens_seen": 56796136, "step": 97815 }, { "epoch": 14.569556151325589, "grad_norm": 1.4387861490249634, "learning_rate": 1.04135702802753e-05, "loss": 0.6954, "num_input_tokens_seen": 56799240, "step": 97820 }, { "epoch": 14.570300863866548, "grad_norm": 1.7471632957458496, "learning_rate": 1.041093140569176e-05, "loss": 0.5967, "num_input_tokens_seen": 56802184, "step": 97825 }, { "epoch": 14.571045576407506, "grad_norm": 2.3452486991882324, "learning_rate": 1.0408292777575712e-05, "loss": 0.6876, "num_input_tokens_seen": 56804840, "step": 97830 }, { "epoch": 14.571790288948465, "grad_norm": 2.761253833770752, "learning_rate": 1.040565439597172e-05, "loss": 0.6434, "num_input_tokens_seen": 56807624, "step": 97835 }, { "epoch": 14.572535001489426, "grad_norm": 1.56341552734375, "learning_rate": 1.0403016260924373e-05, "loss": 0.5486, "num_input_tokens_seen": 56810600, "step": 97840 }, { "epoch": 14.573279714030384, "grad_norm": 1.359037160873413, "learning_rate": 1.040037837247822e-05, "loss": 0.5969, "num_input_tokens_seen": 56813416, "step": 97845 }, { "epoch": 14.574024426571343, "grad_norm": 1.5648900270462036, "learning_rate": 1.0397740730677845e-05, "loss": 0.6691, "num_input_tokens_seen": 56816200, "step": 97850 }, { "epoch": 14.574769139112302, "grad_norm": 1.2060840129852295, "learning_rate": 1.0395103335567794e-05, "loss": 0.5699, "num_input_tokens_seen": 56819368, "step": 97855 }, { "epoch": 14.575513851653263, "grad_norm": 1.461224913597107, "learning_rate": 1.0392466187192634e-05, "loss": 0.4592, "num_input_tokens_seen": 56822216, "step": 97860 }, { "epoch": 14.576258564194221, "grad_norm": 1.9285598993301392, "learning_rate": 1.0389829285596914e-05, "loss": 0.6354, "num_input_tokens_seen": 56825064, "step": 97865 }, { "epoch": 14.57700327673518, "grad_norm": 2.6482386589050293, "learning_rate": 1.0387192630825168e-05, "loss": 0.6226, "num_input_tokens_seen": 56828264, "step": 97870 }, { "epoch": 14.577747989276139, "grad_norm": 1.3220922946929932, "learning_rate": 1.038455622292196e-05, "loss": 0.5053, "num_input_tokens_seen": 56831176, "step": 97875 }, { "epoch": 14.5784927018171, "grad_norm": 1.2411593198776245, "learning_rate": 1.0381920061931818e-05, "loss": 0.5125, "num_input_tokens_seen": 56834216, "step": 97880 }, { "epoch": 14.579237414358058, "grad_norm": 0.9123067855834961, "learning_rate": 1.0379284147899281e-05, "loss": 0.6057, "num_input_tokens_seen": 56837160, "step": 97885 }, { "epoch": 14.579982126899017, "grad_norm": 2.572370767593384, "learning_rate": 1.0376648480868872e-05, "loss": 0.7399, "num_input_tokens_seen": 56840200, "step": 97890 }, { "epoch": 14.580726839439976, "grad_norm": 1.3332455158233643, "learning_rate": 1.0374013060885133e-05, "loss": 0.5537, "num_input_tokens_seen": 56843016, "step": 97895 }, { "epoch": 14.581471551980936, "grad_norm": 0.8514898419380188, "learning_rate": 1.0371377887992575e-05, "loss": 0.5501, "num_input_tokens_seen": 56845992, "step": 97900 }, { "epoch": 14.582216264521895, "grad_norm": 1.0706474781036377, "learning_rate": 1.036874296223571e-05, "loss": 0.5516, "num_input_tokens_seen": 56849032, "step": 97905 }, { "epoch": 14.582960977062854, "grad_norm": 1.441811442375183, "learning_rate": 1.0366108283659077e-05, "loss": 0.488, "num_input_tokens_seen": 56851912, "step": 97910 }, { "epoch": 14.583705689603812, "grad_norm": 1.293105125427246, "learning_rate": 1.0363473852307157e-05, "loss": 0.5952, "num_input_tokens_seen": 56854728, "step": 97915 }, { "epoch": 14.584450402144771, "grad_norm": 2.1631956100463867, "learning_rate": 1.0360839668224481e-05, "loss": 0.7839, "num_input_tokens_seen": 56857352, "step": 97920 }, { "epoch": 14.585195114685732, "grad_norm": 3.1788058280944824, "learning_rate": 1.0358205731455531e-05, "loss": 0.7401, "num_input_tokens_seen": 56860200, "step": 97925 }, { "epoch": 14.58593982722669, "grad_norm": 2.6321516036987305, "learning_rate": 1.0355572042044823e-05, "loss": 0.6341, "num_input_tokens_seen": 56863144, "step": 97930 }, { "epoch": 14.58668453976765, "grad_norm": 1.2050656080245972, "learning_rate": 1.0352938600036843e-05, "loss": 0.5094, "num_input_tokens_seen": 56866152, "step": 97935 }, { "epoch": 14.58742925230861, "grad_norm": 1.9913181066513062, "learning_rate": 1.0350305405476076e-05, "loss": 0.4729, "num_input_tokens_seen": 56869064, "step": 97940 }, { "epoch": 14.588173964849569, "grad_norm": 1.0630227327346802, "learning_rate": 1.0347672458407012e-05, "loss": 0.4361, "num_input_tokens_seen": 56871656, "step": 97945 }, { "epoch": 14.588918677390527, "grad_norm": 1.1761515140533447, "learning_rate": 1.034503975887412e-05, "loss": 0.7752, "num_input_tokens_seen": 56874536, "step": 97950 }, { "epoch": 14.589663389931486, "grad_norm": 1.5853915214538574, "learning_rate": 1.0342407306921894e-05, "loss": 0.526, "num_input_tokens_seen": 56877416, "step": 97955 }, { "epoch": 14.590408102472445, "grad_norm": 2.237210512161255, "learning_rate": 1.0339775102594793e-05, "loss": 0.4418, "num_input_tokens_seen": 56880136, "step": 97960 }, { "epoch": 14.591152815013405, "grad_norm": 1.12977135181427, "learning_rate": 1.0337143145937301e-05, "loss": 0.7011, "num_input_tokens_seen": 56883176, "step": 97965 }, { "epoch": 14.591897527554364, "grad_norm": 1.880122423171997, "learning_rate": 1.0334511436993863e-05, "loss": 0.8226, "num_input_tokens_seen": 56886280, "step": 97970 }, { "epoch": 14.592642240095323, "grad_norm": 1.071134090423584, "learning_rate": 1.0331879975808956e-05, "loss": 0.6173, "num_input_tokens_seen": 56888744, "step": 97975 }, { "epoch": 14.593386952636282, "grad_norm": 1.0457541942596436, "learning_rate": 1.032924876242703e-05, "loss": 0.5857, "num_input_tokens_seen": 56891848, "step": 97980 }, { "epoch": 14.594131665177242, "grad_norm": 1.9374794960021973, "learning_rate": 1.0326617796892527e-05, "loss": 0.4663, "num_input_tokens_seen": 56894632, "step": 97985 }, { "epoch": 14.594876377718201, "grad_norm": 2.4592833518981934, "learning_rate": 1.0323987079249911e-05, "loss": 0.6666, "num_input_tokens_seen": 56897608, "step": 97990 }, { "epoch": 14.59562109025916, "grad_norm": 1.7531906366348267, "learning_rate": 1.0321356609543608e-05, "loss": 0.8573, "num_input_tokens_seen": 56900552, "step": 97995 }, { "epoch": 14.596365802800118, "grad_norm": 1.6118800640106201, "learning_rate": 1.0318726387818078e-05, "loss": 0.6293, "num_input_tokens_seen": 56903400, "step": 98000 }, { "epoch": 14.597110515341079, "grad_norm": 1.7232692241668701, "learning_rate": 1.0316096414117744e-05, "loss": 0.4802, "num_input_tokens_seen": 56906536, "step": 98005 }, { "epoch": 14.597855227882038, "grad_norm": 1.5079965591430664, "learning_rate": 1.0313466688487034e-05, "loss": 0.5922, "num_input_tokens_seen": 56909352, "step": 98010 }, { "epoch": 14.598599940422996, "grad_norm": 2.036041498184204, "learning_rate": 1.031083721097037e-05, "loss": 0.6869, "num_input_tokens_seen": 56912584, "step": 98015 }, { "epoch": 14.599344652963955, "grad_norm": 1.744941234588623, "learning_rate": 1.0308207981612191e-05, "loss": 0.6836, "num_input_tokens_seen": 56915432, "step": 98020 }, { "epoch": 14.600089365504916, "grad_norm": 1.9810521602630615, "learning_rate": 1.0305579000456907e-05, "loss": 0.4837, "num_input_tokens_seen": 56918280, "step": 98025 }, { "epoch": 14.600834078045875, "grad_norm": 1.8645353317260742, "learning_rate": 1.0302950267548922e-05, "loss": 0.4805, "num_input_tokens_seen": 56921128, "step": 98030 }, { "epoch": 14.601578790586833, "grad_norm": 2.2040116786956787, "learning_rate": 1.0300321782932663e-05, "loss": 0.699, "num_input_tokens_seen": 56923816, "step": 98035 }, { "epoch": 14.602323503127792, "grad_norm": 1.1987684965133667, "learning_rate": 1.0297693546652518e-05, "loss": 0.7766, "num_input_tokens_seen": 56927176, "step": 98040 }, { "epoch": 14.603068215668753, "grad_norm": 1.253994107246399, "learning_rate": 1.0295065558752905e-05, "loss": 0.5064, "num_input_tokens_seen": 56929992, "step": 98045 }, { "epoch": 14.603812928209711, "grad_norm": 1.174328327178955, "learning_rate": 1.0292437819278208e-05, "loss": 0.5974, "num_input_tokens_seen": 56932840, "step": 98050 }, { "epoch": 14.60455764075067, "grad_norm": 2.0357937812805176, "learning_rate": 1.0289810328272836e-05, "loss": 0.5348, "num_input_tokens_seen": 56935848, "step": 98055 }, { "epoch": 14.605302353291629, "grad_norm": 1.3990955352783203, "learning_rate": 1.0287183085781165e-05, "loss": 0.5748, "num_input_tokens_seen": 56938664, "step": 98060 }, { "epoch": 14.60604706583259, "grad_norm": 1.3834362030029297, "learning_rate": 1.0284556091847575e-05, "loss": 0.4394, "num_input_tokens_seen": 56941384, "step": 98065 }, { "epoch": 14.606791778373548, "grad_norm": 1.5505280494689941, "learning_rate": 1.028192934651646e-05, "loss": 0.5178, "num_input_tokens_seen": 56944072, "step": 98070 }, { "epoch": 14.607536490914507, "grad_norm": 3.264819860458374, "learning_rate": 1.0279302849832192e-05, "loss": 0.7502, "num_input_tokens_seen": 56947080, "step": 98075 }, { "epoch": 14.608281203455466, "grad_norm": 1.7860758304595947, "learning_rate": 1.027667660183914e-05, "loss": 0.5564, "num_input_tokens_seen": 56949992, "step": 98080 }, { "epoch": 14.609025915996426, "grad_norm": 2.3076179027557373, "learning_rate": 1.0274050602581667e-05, "loss": 0.6017, "num_input_tokens_seen": 56952488, "step": 98085 }, { "epoch": 14.609770628537385, "grad_norm": 1.18604576587677, "learning_rate": 1.0271424852104147e-05, "loss": 0.5353, "num_input_tokens_seen": 56955272, "step": 98090 }, { "epoch": 14.610515341078344, "grad_norm": 2.8194618225097656, "learning_rate": 1.0268799350450928e-05, "loss": 0.6749, "num_input_tokens_seen": 56957928, "step": 98095 }, { "epoch": 14.611260053619302, "grad_norm": 1.4138402938842773, "learning_rate": 1.026617409766638e-05, "loss": 0.5688, "num_input_tokens_seen": 56960904, "step": 98100 }, { "epoch": 14.612004766160261, "grad_norm": 1.4081777334213257, "learning_rate": 1.0263549093794847e-05, "loss": 0.5308, "num_input_tokens_seen": 56963880, "step": 98105 }, { "epoch": 14.612749478701222, "grad_norm": 1.4696831703186035, "learning_rate": 1.0260924338880665e-05, "loss": 0.7742, "num_input_tokens_seen": 56966888, "step": 98110 }, { "epoch": 14.61349419124218, "grad_norm": 1.4283109903335571, "learning_rate": 1.0258299832968196e-05, "loss": 0.4196, "num_input_tokens_seen": 56969544, "step": 98115 }, { "epoch": 14.61423890378314, "grad_norm": 1.733528733253479, "learning_rate": 1.0255675576101759e-05, "loss": 0.5274, "num_input_tokens_seen": 56972456, "step": 98120 }, { "epoch": 14.6149836163241, "grad_norm": 0.9175347089767456, "learning_rate": 1.0253051568325705e-05, "loss": 0.5091, "num_input_tokens_seen": 56975400, "step": 98125 }, { "epoch": 14.615728328865059, "grad_norm": 1.4796029329299927, "learning_rate": 1.0250427809684349e-05, "loss": 0.5129, "num_input_tokens_seen": 56978440, "step": 98130 }, { "epoch": 14.616473041406017, "grad_norm": 1.4477174282073975, "learning_rate": 1.0247804300222034e-05, "loss": 0.5832, "num_input_tokens_seen": 56981448, "step": 98135 }, { "epoch": 14.617217753946976, "grad_norm": 1.801994800567627, "learning_rate": 1.0245181039983068e-05, "loss": 0.6336, "num_input_tokens_seen": 56984360, "step": 98140 }, { "epoch": 14.617962466487935, "grad_norm": 2.028212785720825, "learning_rate": 1.0242558029011776e-05, "loss": 0.5776, "num_input_tokens_seen": 56987208, "step": 98145 }, { "epoch": 14.618707179028895, "grad_norm": 1.285851240158081, "learning_rate": 1.0239935267352466e-05, "loss": 0.612, "num_input_tokens_seen": 56990056, "step": 98150 }, { "epoch": 14.619451891569854, "grad_norm": 1.2415341138839722, "learning_rate": 1.023731275504944e-05, "loss": 0.507, "num_input_tokens_seen": 56992968, "step": 98155 }, { "epoch": 14.620196604110813, "grad_norm": 2.4056715965270996, "learning_rate": 1.023469049214702e-05, "loss": 0.6133, "num_input_tokens_seen": 56996168, "step": 98160 }, { "epoch": 14.620941316651772, "grad_norm": 1.5060352087020874, "learning_rate": 1.0232068478689488e-05, "loss": 0.5924, "num_input_tokens_seen": 56998856, "step": 98165 }, { "epoch": 14.621686029192732, "grad_norm": 1.8863131999969482, "learning_rate": 1.0229446714721158e-05, "loss": 0.6003, "num_input_tokens_seen": 57001608, "step": 98170 }, { "epoch": 14.622430741733691, "grad_norm": 1.3782267570495605, "learning_rate": 1.0226825200286306e-05, "loss": 0.5697, "num_input_tokens_seen": 57004552, "step": 98175 }, { "epoch": 14.62317545427465, "grad_norm": 1.046499490737915, "learning_rate": 1.0224203935429235e-05, "loss": 0.4869, "num_input_tokens_seen": 57007496, "step": 98180 }, { "epoch": 14.623920166815608, "grad_norm": 0.9834750294685364, "learning_rate": 1.0221582920194223e-05, "loss": 0.7689, "num_input_tokens_seen": 57010376, "step": 98185 }, { "epoch": 14.624664879356569, "grad_norm": 2.302689552307129, "learning_rate": 1.0218962154625535e-05, "loss": 0.6761, "num_input_tokens_seen": 57013256, "step": 98190 }, { "epoch": 14.625409591897528, "grad_norm": 1.6086719036102295, "learning_rate": 1.021634163876747e-05, "loss": 0.4644, "num_input_tokens_seen": 57016104, "step": 98195 }, { "epoch": 14.626154304438487, "grad_norm": 2.466731071472168, "learning_rate": 1.0213721372664279e-05, "loss": 0.6635, "num_input_tokens_seen": 57018792, "step": 98200 }, { "epoch": 14.626899016979445, "grad_norm": 1.092305064201355, "learning_rate": 1.0211101356360245e-05, "loss": 0.5004, "num_input_tokens_seen": 57021864, "step": 98205 }, { "epoch": 14.627643729520406, "grad_norm": 2.244050979614258, "learning_rate": 1.0208481589899623e-05, "loss": 0.5049, "num_input_tokens_seen": 57024488, "step": 98210 }, { "epoch": 14.628388442061365, "grad_norm": 1.6950486898422241, "learning_rate": 1.0205862073326673e-05, "loss": 0.4867, "num_input_tokens_seen": 57027272, "step": 98215 }, { "epoch": 14.629133154602323, "grad_norm": 2.115987777709961, "learning_rate": 1.0203242806685645e-05, "loss": 0.5802, "num_input_tokens_seen": 57030248, "step": 98220 }, { "epoch": 14.629877867143282, "grad_norm": 1.2164710760116577, "learning_rate": 1.0200623790020782e-05, "loss": 0.756, "num_input_tokens_seen": 57033160, "step": 98225 }, { "epoch": 14.630622579684243, "grad_norm": 2.4682412147521973, "learning_rate": 1.0198005023376347e-05, "loss": 0.6937, "num_input_tokens_seen": 57035880, "step": 98230 }, { "epoch": 14.631367292225201, "grad_norm": 1.1288756132125854, "learning_rate": 1.0195386506796567e-05, "loss": 0.5791, "num_input_tokens_seen": 57038632, "step": 98235 }, { "epoch": 14.63211200476616, "grad_norm": 1.7578823566436768, "learning_rate": 1.0192768240325693e-05, "loss": 0.5689, "num_input_tokens_seen": 57041384, "step": 98240 }, { "epoch": 14.632856717307119, "grad_norm": 1.115772008895874, "learning_rate": 1.019015022400794e-05, "loss": 0.7896, "num_input_tokens_seen": 57044360, "step": 98245 }, { "epoch": 14.63360142984808, "grad_norm": 2.966470718383789, "learning_rate": 1.0187532457887557e-05, "loss": 0.5649, "num_input_tokens_seen": 57047304, "step": 98250 }, { "epoch": 14.634346142389038, "grad_norm": 1.0370467901229858, "learning_rate": 1.018491494200875e-05, "loss": 0.4938, "num_input_tokens_seen": 57049896, "step": 98255 }, { "epoch": 14.635090854929997, "grad_norm": 1.247092843055725, "learning_rate": 1.0182297676415755e-05, "loss": 0.393, "num_input_tokens_seen": 57052648, "step": 98260 }, { "epoch": 14.635835567470956, "grad_norm": 1.9673362970352173, "learning_rate": 1.0179680661152782e-05, "loss": 0.567, "num_input_tokens_seen": 57055432, "step": 98265 }, { "epoch": 14.636580280011916, "grad_norm": 1.6993200778961182, "learning_rate": 1.0177063896264042e-05, "loss": 0.5266, "num_input_tokens_seen": 57058216, "step": 98270 }, { "epoch": 14.637324992552875, "grad_norm": 1.2093095779418945, "learning_rate": 1.0174447381793739e-05, "loss": 0.8241, "num_input_tokens_seen": 57061384, "step": 98275 }, { "epoch": 14.638069705093834, "grad_norm": 1.8652775287628174, "learning_rate": 1.0171831117786074e-05, "loss": 0.6073, "num_input_tokens_seen": 57064296, "step": 98280 }, { "epoch": 14.638814417634793, "grad_norm": 1.4908778667449951, "learning_rate": 1.016921510428526e-05, "loss": 0.5804, "num_input_tokens_seen": 57066920, "step": 98285 }, { "epoch": 14.639559130175751, "grad_norm": 1.7587515115737915, "learning_rate": 1.0166599341335473e-05, "loss": 0.7347, "num_input_tokens_seen": 57069896, "step": 98290 }, { "epoch": 14.640303842716712, "grad_norm": 0.7515833377838135, "learning_rate": 1.0163983828980922e-05, "loss": 0.6175, "num_input_tokens_seen": 57072808, "step": 98295 }, { "epoch": 14.64104855525767, "grad_norm": 1.1322318315505981, "learning_rate": 1.016136856726579e-05, "loss": 0.6137, "num_input_tokens_seen": 57075944, "step": 98300 }, { "epoch": 14.64179326779863, "grad_norm": 1.9379066228866577, "learning_rate": 1.015875355623424e-05, "loss": 0.515, "num_input_tokens_seen": 57078952, "step": 98305 }, { "epoch": 14.642537980339588, "grad_norm": 1.377457618713379, "learning_rate": 1.0156138795930479e-05, "loss": 0.445, "num_input_tokens_seen": 57082216, "step": 98310 }, { "epoch": 14.643282692880549, "grad_norm": 1.398354411125183, "learning_rate": 1.0153524286398656e-05, "loss": 0.7356, "num_input_tokens_seen": 57084936, "step": 98315 }, { "epoch": 14.644027405421507, "grad_norm": 1.766676664352417, "learning_rate": 1.0150910027682958e-05, "loss": 0.545, "num_input_tokens_seen": 57087848, "step": 98320 }, { "epoch": 14.644772117962466, "grad_norm": 2.1017513275146484, "learning_rate": 1.0148296019827535e-05, "loss": 0.7519, "num_input_tokens_seen": 57090600, "step": 98325 }, { "epoch": 14.645516830503425, "grad_norm": 1.8157590627670288, "learning_rate": 1.0145682262876566e-05, "loss": 0.6393, "num_input_tokens_seen": 57093384, "step": 98330 }, { "epoch": 14.646261543044385, "grad_norm": 1.6942920684814453, "learning_rate": 1.0143068756874197e-05, "loss": 0.6279, "num_input_tokens_seen": 57096488, "step": 98335 }, { "epoch": 14.647006255585344, "grad_norm": 2.1476619243621826, "learning_rate": 1.0140455501864583e-05, "loss": 0.803, "num_input_tokens_seen": 57099432, "step": 98340 }, { "epoch": 14.647750968126303, "grad_norm": 1.9417701959609985, "learning_rate": 1.013784249789187e-05, "loss": 0.557, "num_input_tokens_seen": 57102376, "step": 98345 }, { "epoch": 14.648495680667262, "grad_norm": 1.9041582345962524, "learning_rate": 1.013522974500019e-05, "loss": 0.6457, "num_input_tokens_seen": 57105544, "step": 98350 }, { "epoch": 14.649240393208222, "grad_norm": 1.2083992958068848, "learning_rate": 1.013261724323371e-05, "loss": 0.8545, "num_input_tokens_seen": 57108296, "step": 98355 }, { "epoch": 14.649985105749181, "grad_norm": 3.3345024585723877, "learning_rate": 1.0130004992636541e-05, "loss": 0.7466, "num_input_tokens_seen": 57111400, "step": 98360 }, { "epoch": 14.65072981829014, "grad_norm": 2.923074960708618, "learning_rate": 1.0127392993252832e-05, "loss": 0.7674, "num_input_tokens_seen": 57113992, "step": 98365 }, { "epoch": 14.651474530831099, "grad_norm": 1.6057510375976562, "learning_rate": 1.0124781245126695e-05, "loss": 0.5875, "num_input_tokens_seen": 57116456, "step": 98370 }, { "epoch": 14.652219243372059, "grad_norm": 2.166252613067627, "learning_rate": 1.0122169748302265e-05, "loss": 0.8031, "num_input_tokens_seen": 57119560, "step": 98375 }, { "epoch": 14.652963955913018, "grad_norm": 0.7946777939796448, "learning_rate": 1.011955850282365e-05, "loss": 0.5003, "num_input_tokens_seen": 57122760, "step": 98380 }, { "epoch": 14.653708668453977, "grad_norm": 1.5489445924758911, "learning_rate": 1.0116947508734981e-05, "loss": 0.5908, "num_input_tokens_seen": 57125416, "step": 98385 }, { "epoch": 14.654453380994935, "grad_norm": 2.2116804122924805, "learning_rate": 1.0114336766080356e-05, "loss": 0.5202, "num_input_tokens_seen": 57128392, "step": 98390 }, { "epoch": 14.655198093535896, "grad_norm": 1.2244441509246826, "learning_rate": 1.0111726274903873e-05, "loss": 0.785, "num_input_tokens_seen": 57131496, "step": 98395 }, { "epoch": 14.655942806076855, "grad_norm": 1.0523895025253296, "learning_rate": 1.0109116035249652e-05, "loss": 0.6136, "num_input_tokens_seen": 57134312, "step": 98400 }, { "epoch": 14.656687518617813, "grad_norm": 0.9464806914329529, "learning_rate": 1.0106506047161782e-05, "loss": 0.5622, "num_input_tokens_seen": 57137288, "step": 98405 }, { "epoch": 14.657432231158772, "grad_norm": 2.6210381984710693, "learning_rate": 1.0103896310684356e-05, "loss": 0.802, "num_input_tokens_seen": 57140136, "step": 98410 }, { "epoch": 14.658176943699733, "grad_norm": 1.3783570528030396, "learning_rate": 1.010128682586145e-05, "loss": 0.6882, "num_input_tokens_seen": 57142984, "step": 98415 }, { "epoch": 14.658921656240691, "grad_norm": 1.2106196880340576, "learning_rate": 1.009867759273717e-05, "loss": 0.4649, "num_input_tokens_seen": 57145896, "step": 98420 }, { "epoch": 14.65966636878165, "grad_norm": 1.588091492652893, "learning_rate": 1.0096068611355588e-05, "loss": 0.56, "num_input_tokens_seen": 57149000, "step": 98425 }, { "epoch": 14.660411081322609, "grad_norm": 1.0807361602783203, "learning_rate": 1.0093459881760772e-05, "loss": 0.5018, "num_input_tokens_seen": 57151784, "step": 98430 }, { "epoch": 14.66115579386357, "grad_norm": 1.3085862398147583, "learning_rate": 1.0090851403996809e-05, "loss": 0.5038, "num_input_tokens_seen": 57154856, "step": 98435 }, { "epoch": 14.661900506404528, "grad_norm": 0.9271964430809021, "learning_rate": 1.0088243178107748e-05, "loss": 0.556, "num_input_tokens_seen": 57157928, "step": 98440 }, { "epoch": 14.662645218945487, "grad_norm": 1.1439729928970337, "learning_rate": 1.0085635204137672e-05, "loss": 0.5479, "num_input_tokens_seen": 57160776, "step": 98445 }, { "epoch": 14.663389931486446, "grad_norm": 1.855538249015808, "learning_rate": 1.0083027482130625e-05, "loss": 0.7162, "num_input_tokens_seen": 57163720, "step": 98450 }, { "epoch": 14.664134644027406, "grad_norm": 2.0776114463806152, "learning_rate": 1.0080420012130673e-05, "loss": 0.5556, "num_input_tokens_seen": 57166952, "step": 98455 }, { "epoch": 14.664879356568365, "grad_norm": 1.2012704610824585, "learning_rate": 1.0077812794181854e-05, "loss": 0.5473, "num_input_tokens_seen": 57169704, "step": 98460 }, { "epoch": 14.665624069109324, "grad_norm": 1.3989317417144775, "learning_rate": 1.0075205828328232e-05, "loss": 0.4839, "num_input_tokens_seen": 57172360, "step": 98465 }, { "epoch": 14.666368781650283, "grad_norm": 3.0974135398864746, "learning_rate": 1.0072599114613837e-05, "loss": 0.6971, "num_input_tokens_seen": 57175368, "step": 98470 }, { "epoch": 14.667113494191241, "grad_norm": 2.242403984069824, "learning_rate": 1.0069992653082707e-05, "loss": 0.5888, "num_input_tokens_seen": 57178504, "step": 98475 }, { "epoch": 14.667858206732202, "grad_norm": 2.1423144340515137, "learning_rate": 1.0067386443778879e-05, "loss": 0.6617, "num_input_tokens_seen": 57181448, "step": 98480 }, { "epoch": 14.66860291927316, "grad_norm": 1.4981141090393066, "learning_rate": 1.006478048674637e-05, "loss": 0.6158, "num_input_tokens_seen": 57184264, "step": 98485 }, { "epoch": 14.66934763181412, "grad_norm": 1.40636146068573, "learning_rate": 1.0062174782029227e-05, "loss": 0.5591, "num_input_tokens_seen": 57187208, "step": 98490 }, { "epoch": 14.670092344355078, "grad_norm": 1.3864716291427612, "learning_rate": 1.0059569329671448e-05, "loss": 0.6199, "num_input_tokens_seen": 57189896, "step": 98495 }, { "epoch": 14.670837056896039, "grad_norm": 0.84428870677948, "learning_rate": 1.0056964129717067e-05, "loss": 0.3638, "num_input_tokens_seen": 57192936, "step": 98500 }, { "epoch": 14.671581769436997, "grad_norm": 1.0530683994293213, "learning_rate": 1.0054359182210093e-05, "loss": 0.5605, "num_input_tokens_seen": 57195656, "step": 98505 }, { "epoch": 14.672326481977956, "grad_norm": 2.255033254623413, "learning_rate": 1.005175448719452e-05, "loss": 0.4596, "num_input_tokens_seen": 57198696, "step": 98510 }, { "epoch": 14.673071194518915, "grad_norm": 1.7808759212493896, "learning_rate": 1.0049150044714373e-05, "loss": 0.6988, "num_input_tokens_seen": 57201576, "step": 98515 }, { "epoch": 14.673815907059875, "grad_norm": 1.6029810905456543, "learning_rate": 1.004654585481363e-05, "loss": 0.5282, "num_input_tokens_seen": 57204680, "step": 98520 }, { "epoch": 14.674560619600834, "grad_norm": 2.2285194396972656, "learning_rate": 1.0043941917536303e-05, "loss": 0.6047, "num_input_tokens_seen": 57207656, "step": 98525 }, { "epoch": 14.675305332141793, "grad_norm": 2.3062376976013184, "learning_rate": 1.0041338232926373e-05, "loss": 0.6214, "num_input_tokens_seen": 57210600, "step": 98530 }, { "epoch": 14.676050044682752, "grad_norm": 3.433012008666992, "learning_rate": 1.0038734801027836e-05, "loss": 0.7386, "num_input_tokens_seen": 57213448, "step": 98535 }, { "epoch": 14.676794757223712, "grad_norm": 1.1002659797668457, "learning_rate": 1.003613162188467e-05, "loss": 0.5187, "num_input_tokens_seen": 57216360, "step": 98540 }, { "epoch": 14.677539469764671, "grad_norm": 1.6416722536087036, "learning_rate": 1.003352869554085e-05, "loss": 0.6472, "num_input_tokens_seen": 57219368, "step": 98545 }, { "epoch": 14.67828418230563, "grad_norm": 1.5950322151184082, "learning_rate": 1.0030926022040355e-05, "loss": 0.5172, "num_input_tokens_seen": 57222184, "step": 98550 }, { "epoch": 14.679028894846589, "grad_norm": 1.3950077295303345, "learning_rate": 1.002832360142714e-05, "loss": 0.5801, "num_input_tokens_seen": 57224936, "step": 98555 }, { "epoch": 14.679773607387549, "grad_norm": 3.1378300189971924, "learning_rate": 1.0025721433745188e-05, "loss": 0.7607, "num_input_tokens_seen": 57227784, "step": 98560 }, { "epoch": 14.680518319928508, "grad_norm": 1.1654984951019287, "learning_rate": 1.0023119519038445e-05, "loss": 0.4849, "num_input_tokens_seen": 57230856, "step": 98565 }, { "epoch": 14.681263032469467, "grad_norm": 1.044495940208435, "learning_rate": 1.0020517857350886e-05, "loss": 0.4332, "num_input_tokens_seen": 57234024, "step": 98570 }, { "epoch": 14.682007745010425, "grad_norm": 0.9783728122711182, "learning_rate": 1.0017916448726444e-05, "loss": 0.5714, "num_input_tokens_seen": 57238088, "step": 98575 }, { "epoch": 14.682752457551386, "grad_norm": 1.5476508140563965, "learning_rate": 1.0015315293209087e-05, "loss": 0.4961, "num_input_tokens_seen": 57240680, "step": 98580 }, { "epoch": 14.683497170092345, "grad_norm": 2.119590997695923, "learning_rate": 1.0012714390842748e-05, "loss": 0.8029, "num_input_tokens_seen": 57243560, "step": 98585 }, { "epoch": 14.684241882633303, "grad_norm": 2.300645589828491, "learning_rate": 1.0010113741671356e-05, "loss": 0.629, "num_input_tokens_seen": 57246312, "step": 98590 }, { "epoch": 14.684986595174262, "grad_norm": 1.999854564666748, "learning_rate": 1.0007513345738867e-05, "loss": 0.5444, "num_input_tokens_seen": 57249384, "step": 98595 }, { "epoch": 14.685731307715223, "grad_norm": 1.1677418947219849, "learning_rate": 1.0004913203089202e-05, "loss": 0.4625, "num_input_tokens_seen": 57252520, "step": 98600 }, { "epoch": 14.686476020256181, "grad_norm": 1.9073089361190796, "learning_rate": 1.000231331376629e-05, "loss": 0.6359, "num_input_tokens_seen": 57255464, "step": 98605 }, { "epoch": 14.68722073279714, "grad_norm": 2.2019405364990234, "learning_rate": 9.99971367781404e-06, "loss": 0.6473, "num_input_tokens_seen": 57258184, "step": 98610 }, { "epoch": 14.687965445338099, "grad_norm": 1.0145857334136963, "learning_rate": 9.997114295276395e-06, "loss": 0.5822, "num_input_tokens_seen": 57260808, "step": 98615 }, { "epoch": 14.688710157879058, "grad_norm": 1.6422553062438965, "learning_rate": 9.994515166197241e-06, "loss": 0.6603, "num_input_tokens_seen": 57263656, "step": 98620 }, { "epoch": 14.689454870420018, "grad_norm": 1.5587835311889648, "learning_rate": 9.991916290620515e-06, "loss": 0.6118, "num_input_tokens_seen": 57266696, "step": 98625 }, { "epoch": 14.690199582960977, "grad_norm": 2.518510103225708, "learning_rate": 9.98931766859011e-06, "loss": 0.5867, "num_input_tokens_seen": 57269384, "step": 98630 }, { "epoch": 14.690944295501936, "grad_norm": 2.6199796199798584, "learning_rate": 9.986719300149915e-06, "loss": 0.6574, "num_input_tokens_seen": 57272136, "step": 98635 }, { "epoch": 14.691689008042896, "grad_norm": 1.6264147758483887, "learning_rate": 9.98412118534385e-06, "loss": 0.5924, "num_input_tokens_seen": 57275048, "step": 98640 }, { "epoch": 14.692433720583855, "grad_norm": 0.8359386324882507, "learning_rate": 9.981523324215786e-06, "loss": 0.474, "num_input_tokens_seen": 57277960, "step": 98645 }, { "epoch": 14.693178433124814, "grad_norm": 2.0539119243621826, "learning_rate": 9.978925716809631e-06, "loss": 0.5625, "num_input_tokens_seen": 57281064, "step": 98650 }, { "epoch": 14.693923145665773, "grad_norm": 2.081036329269409, "learning_rate": 9.976328363169252e-06, "loss": 0.7231, "num_input_tokens_seen": 57283944, "step": 98655 }, { "epoch": 14.694667858206731, "grad_norm": 0.7605381011962891, "learning_rate": 9.973731263338542e-06, "loss": 0.6733, "num_input_tokens_seen": 57286728, "step": 98660 }, { "epoch": 14.695412570747692, "grad_norm": 2.5123260021209717, "learning_rate": 9.971134417361371e-06, "loss": 0.5736, "num_input_tokens_seen": 57289992, "step": 98665 }, { "epoch": 14.69615728328865, "grad_norm": 1.8441083431243896, "learning_rate": 9.96853782528161e-06, "loss": 0.657, "num_input_tokens_seen": 57293064, "step": 98670 }, { "epoch": 14.69690199582961, "grad_norm": 1.177013635635376, "learning_rate": 9.965941487143123e-06, "loss": 0.3482, "num_input_tokens_seen": 57295848, "step": 98675 }, { "epoch": 14.697646708370568, "grad_norm": 1.567583680152893, "learning_rate": 9.963345402989768e-06, "loss": 0.698, "num_input_tokens_seen": 57298696, "step": 98680 }, { "epoch": 14.698391420911529, "grad_norm": 0.8756324648857117, "learning_rate": 9.96074957286542e-06, "loss": 0.4953, "num_input_tokens_seen": 57301512, "step": 98685 }, { "epoch": 14.699136133452487, "grad_norm": 1.2206958532333374, "learning_rate": 9.958153996813912e-06, "loss": 0.6023, "num_input_tokens_seen": 57304232, "step": 98690 }, { "epoch": 14.699880845993446, "grad_norm": 1.637678623199463, "learning_rate": 9.955558674879115e-06, "loss": 0.6426, "num_input_tokens_seen": 57307112, "step": 98695 }, { "epoch": 14.700625558534405, "grad_norm": 1.3410515785217285, "learning_rate": 9.952963607104851e-06, "loss": 0.5423, "num_input_tokens_seen": 57309832, "step": 98700 }, { "epoch": 14.701370271075366, "grad_norm": 1.6550034284591675, "learning_rate": 9.950368793534986e-06, "loss": 0.4398, "num_input_tokens_seen": 57312584, "step": 98705 }, { "epoch": 14.702114983616324, "grad_norm": 1.7805874347686768, "learning_rate": 9.947774234213342e-06, "loss": 0.6107, "num_input_tokens_seen": 57315592, "step": 98710 }, { "epoch": 14.702859696157283, "grad_norm": 1.6322144269943237, "learning_rate": 9.945179929183749e-06, "loss": 0.5603, "num_input_tokens_seen": 57318376, "step": 98715 }, { "epoch": 14.703604408698242, "grad_norm": 1.0922315120697021, "learning_rate": 9.942585878490046e-06, "loss": 0.6431, "num_input_tokens_seen": 57321576, "step": 98720 }, { "epoch": 14.704349121239202, "grad_norm": 0.9298755526542664, "learning_rate": 9.939992082176041e-06, "loss": 0.5829, "num_input_tokens_seen": 57324552, "step": 98725 }, { "epoch": 14.705093833780161, "grad_norm": 1.4536856412887573, "learning_rate": 9.937398540285575e-06, "loss": 0.5788, "num_input_tokens_seen": 57327624, "step": 98730 }, { "epoch": 14.70583854632112, "grad_norm": 1.2723459005355835, "learning_rate": 9.93480525286245e-06, "loss": 0.5537, "num_input_tokens_seen": 57330312, "step": 98735 }, { "epoch": 14.706583258862079, "grad_norm": 1.0102744102478027, "learning_rate": 9.93221221995048e-06, "loss": 0.7202, "num_input_tokens_seen": 57333192, "step": 98740 }, { "epoch": 14.70732797140304, "grad_norm": 2.316580295562744, "learning_rate": 9.929619441593469e-06, "loss": 0.6622, "num_input_tokens_seen": 57336328, "step": 98745 }, { "epoch": 14.708072683943998, "grad_norm": 1.2674891948699951, "learning_rate": 9.927026917835211e-06, "loss": 0.6336, "num_input_tokens_seen": 57339272, "step": 98750 }, { "epoch": 14.708817396484957, "grad_norm": 1.3493930101394653, "learning_rate": 9.924434648719525e-06, "loss": 0.6842, "num_input_tokens_seen": 57342120, "step": 98755 }, { "epoch": 14.709562109025915, "grad_norm": 2.0725913047790527, "learning_rate": 9.921842634290182e-06, "loss": 0.5423, "num_input_tokens_seen": 57345224, "step": 98760 }, { "epoch": 14.710306821566876, "grad_norm": 1.6874445676803589, "learning_rate": 9.919250874590993e-06, "loss": 0.5769, "num_input_tokens_seen": 57348040, "step": 98765 }, { "epoch": 14.711051534107835, "grad_norm": 1.3525362014770508, "learning_rate": 9.916659369665726e-06, "loss": 0.4768, "num_input_tokens_seen": 57350888, "step": 98770 }, { "epoch": 14.711796246648793, "grad_norm": 1.8401927947998047, "learning_rate": 9.914068119558177e-06, "loss": 0.7161, "num_input_tokens_seen": 57353896, "step": 98775 }, { "epoch": 14.712540959189752, "grad_norm": 1.889574646949768, "learning_rate": 9.911477124312104e-06, "loss": 0.5445, "num_input_tokens_seen": 57356680, "step": 98780 }, { "epoch": 14.713285671730713, "grad_norm": 1.6709469556808472, "learning_rate": 9.9088863839713e-06, "loss": 0.562, "num_input_tokens_seen": 57359592, "step": 98785 }, { "epoch": 14.714030384271672, "grad_norm": 1.526368260383606, "learning_rate": 9.90629589857952e-06, "loss": 0.6713, "num_input_tokens_seen": 57362280, "step": 98790 }, { "epoch": 14.71477509681263, "grad_norm": 1.9103463888168335, "learning_rate": 9.903705668180524e-06, "loss": 0.6869, "num_input_tokens_seen": 57365032, "step": 98795 }, { "epoch": 14.715519809353589, "grad_norm": 1.948294997215271, "learning_rate": 9.901115692818085e-06, "loss": 0.8039, "num_input_tokens_seen": 57367752, "step": 98800 }, { "epoch": 14.716264521894548, "grad_norm": 2.1284565925598145, "learning_rate": 9.898525972535952e-06, "loss": 0.6484, "num_input_tokens_seen": 57370568, "step": 98805 }, { "epoch": 14.717009234435508, "grad_norm": 1.0539288520812988, "learning_rate": 9.895936507377873e-06, "loss": 0.4453, "num_input_tokens_seen": 57373320, "step": 98810 }, { "epoch": 14.717753946976467, "grad_norm": 1.5494072437286377, "learning_rate": 9.89334729738759e-06, "loss": 0.6036, "num_input_tokens_seen": 57376264, "step": 98815 }, { "epoch": 14.718498659517426, "grad_norm": 1.2105567455291748, "learning_rate": 9.890758342608856e-06, "loss": 0.5642, "num_input_tokens_seen": 57379272, "step": 98820 }, { "epoch": 14.719243372058386, "grad_norm": 1.2098158597946167, "learning_rate": 9.888169643085404e-06, "loss": 0.4805, "num_input_tokens_seen": 57381832, "step": 98825 }, { "epoch": 14.719988084599345, "grad_norm": 1.4772239923477173, "learning_rate": 9.885581198860958e-06, "loss": 0.4669, "num_input_tokens_seen": 57384936, "step": 98830 }, { "epoch": 14.720732797140304, "grad_norm": 1.473645806312561, "learning_rate": 9.882993009979265e-06, "loss": 0.5034, "num_input_tokens_seen": 57387496, "step": 98835 }, { "epoch": 14.721477509681263, "grad_norm": 2.1925415992736816, "learning_rate": 9.880405076484034e-06, "loss": 0.5767, "num_input_tokens_seen": 57390728, "step": 98840 }, { "epoch": 14.722222222222221, "grad_norm": 1.4516968727111816, "learning_rate": 9.877817398418998e-06, "loss": 0.619, "num_input_tokens_seen": 57393512, "step": 98845 }, { "epoch": 14.722966934763182, "grad_norm": 1.468916654586792, "learning_rate": 9.87522997582786e-06, "loss": 0.6221, "num_input_tokens_seen": 57396584, "step": 98850 }, { "epoch": 14.72371164730414, "grad_norm": 1.0265859365463257, "learning_rate": 9.872642808754348e-06, "loss": 0.6215, "num_input_tokens_seen": 57399464, "step": 98855 }, { "epoch": 14.7244563598451, "grad_norm": 1.632673740386963, "learning_rate": 9.870055897242152e-06, "loss": 0.5892, "num_input_tokens_seen": 57402888, "step": 98860 }, { "epoch": 14.725201072386058, "grad_norm": 2.484773635864258, "learning_rate": 9.867469241334994e-06, "loss": 0.4729, "num_input_tokens_seen": 57405576, "step": 98865 }, { "epoch": 14.725945784927019, "grad_norm": 1.5655770301818848, "learning_rate": 9.864882841076564e-06, "loss": 0.7649, "num_input_tokens_seen": 57408424, "step": 98870 }, { "epoch": 14.726690497467978, "grad_norm": 1.5971107482910156, "learning_rate": 9.862296696510557e-06, "loss": 0.6505, "num_input_tokens_seen": 57411528, "step": 98875 }, { "epoch": 14.727435210008936, "grad_norm": 1.564935564994812, "learning_rate": 9.859710807680658e-06, "loss": 0.4441, "num_input_tokens_seen": 57414344, "step": 98880 }, { "epoch": 14.728179922549895, "grad_norm": 2.14444899559021, "learning_rate": 9.85712517463055e-06, "loss": 0.5448, "num_input_tokens_seen": 57417320, "step": 98885 }, { "epoch": 14.728924635090856, "grad_norm": 2.5070927143096924, "learning_rate": 9.85453979740393e-06, "loss": 0.7823, "num_input_tokens_seen": 57419816, "step": 98890 }, { "epoch": 14.729669347631814, "grad_norm": 2.514045000076294, "learning_rate": 9.851954676044458e-06, "loss": 0.5611, "num_input_tokens_seen": 57422888, "step": 98895 }, { "epoch": 14.730414060172773, "grad_norm": 1.852563500404358, "learning_rate": 9.849369810595827e-06, "loss": 0.5252, "num_input_tokens_seen": 57425896, "step": 98900 }, { "epoch": 14.731158772713732, "grad_norm": 0.9607270359992981, "learning_rate": 9.846785201101691e-06, "loss": 0.5749, "num_input_tokens_seen": 57428776, "step": 98905 }, { "epoch": 14.731903485254692, "grad_norm": 1.2123026847839355, "learning_rate": 9.84420084760571e-06, "loss": 0.5455, "num_input_tokens_seen": 57431624, "step": 98910 }, { "epoch": 14.732648197795651, "grad_norm": 1.0106616020202637, "learning_rate": 9.841616750151565e-06, "loss": 0.6237, "num_input_tokens_seen": 57434440, "step": 98915 }, { "epoch": 14.73339291033661, "grad_norm": 1.149133563041687, "learning_rate": 9.839032908782885e-06, "loss": 0.5052, "num_input_tokens_seen": 57437352, "step": 98920 }, { "epoch": 14.734137622877569, "grad_norm": 2.528196334838867, "learning_rate": 9.836449323543345e-06, "loss": 0.6624, "num_input_tokens_seen": 57440296, "step": 98925 }, { "epoch": 14.73488233541853, "grad_norm": 0.7985025644302368, "learning_rate": 9.833865994476584e-06, "loss": 0.5274, "num_input_tokens_seen": 57442984, "step": 98930 }, { "epoch": 14.735627047959488, "grad_norm": 1.7166728973388672, "learning_rate": 9.831282921626242e-06, "loss": 0.5857, "num_input_tokens_seen": 57446088, "step": 98935 }, { "epoch": 14.736371760500447, "grad_norm": 2.2183923721313477, "learning_rate": 9.82870010503595e-06, "loss": 0.6026, "num_input_tokens_seen": 57449064, "step": 98940 }, { "epoch": 14.737116473041405, "grad_norm": 1.0630955696105957, "learning_rate": 9.826117544749357e-06, "loss": 0.6188, "num_input_tokens_seen": 57451912, "step": 98945 }, { "epoch": 14.737861185582366, "grad_norm": 3.540813446044922, "learning_rate": 9.823535240810089e-06, "loss": 0.4892, "num_input_tokens_seen": 57455048, "step": 98950 }, { "epoch": 14.738605898123325, "grad_norm": 2.5490710735321045, "learning_rate": 9.820953193261756e-06, "loss": 0.566, "num_input_tokens_seen": 57458248, "step": 98955 }, { "epoch": 14.739350610664284, "grad_norm": 2.8326609134674072, "learning_rate": 9.818371402148002e-06, "loss": 0.7232, "num_input_tokens_seen": 57461256, "step": 98960 }, { "epoch": 14.740095323205242, "grad_norm": 1.254141926765442, "learning_rate": 9.815789867512427e-06, "loss": 0.5564, "num_input_tokens_seen": 57463784, "step": 98965 }, { "epoch": 14.740840035746203, "grad_norm": 1.5708266496658325, "learning_rate": 9.813208589398654e-06, "loss": 0.488, "num_input_tokens_seen": 57466504, "step": 98970 }, { "epoch": 14.741584748287162, "grad_norm": 2.9018635749816895, "learning_rate": 9.81062756785028e-06, "loss": 0.6722, "num_input_tokens_seen": 57469288, "step": 98975 }, { "epoch": 14.74232946082812, "grad_norm": 1.7154979705810547, "learning_rate": 9.808046802910926e-06, "loss": 0.7534, "num_input_tokens_seen": 57472264, "step": 98980 }, { "epoch": 14.743074173369079, "grad_norm": 1.4177424907684326, "learning_rate": 9.80546629462417e-06, "loss": 0.7329, "num_input_tokens_seen": 57475144, "step": 98985 }, { "epoch": 14.743818885910038, "grad_norm": 1.1970489025115967, "learning_rate": 9.802886043033626e-06, "loss": 0.4803, "num_input_tokens_seen": 57478056, "step": 98990 }, { "epoch": 14.744563598450998, "grad_norm": 0.8440534472465515, "learning_rate": 9.800306048182878e-06, "loss": 0.5121, "num_input_tokens_seen": 57480904, "step": 98995 }, { "epoch": 14.745308310991957, "grad_norm": 1.6239413022994995, "learning_rate": 9.79772631011551e-06, "loss": 0.5852, "num_input_tokens_seen": 57483880, "step": 99000 }, { "epoch": 14.746053023532916, "grad_norm": 1.2701287269592285, "learning_rate": 9.795146828875107e-06, "loss": 0.382, "num_input_tokens_seen": 57487048, "step": 99005 }, { "epoch": 14.746797736073875, "grad_norm": 3.424872398376465, "learning_rate": 9.792567604505234e-06, "loss": 0.708, "num_input_tokens_seen": 57489800, "step": 99010 }, { "epoch": 14.747542448614835, "grad_norm": 1.1022727489471436, "learning_rate": 9.789988637049485e-06, "loss": 0.4595, "num_input_tokens_seen": 57492744, "step": 99015 }, { "epoch": 14.748287161155794, "grad_norm": 0.41120830178260803, "learning_rate": 9.787409926551411e-06, "loss": 0.5218, "num_input_tokens_seen": 57495464, "step": 99020 }, { "epoch": 14.749031873696753, "grad_norm": 0.785078763961792, "learning_rate": 9.784831473054592e-06, "loss": 0.5005, "num_input_tokens_seen": 57498600, "step": 99025 }, { "epoch": 14.749776586237711, "grad_norm": 1.377571702003479, "learning_rate": 9.782253276602582e-06, "loss": 0.5786, "num_input_tokens_seen": 57501288, "step": 99030 }, { "epoch": 14.750521298778672, "grad_norm": 1.8164517879486084, "learning_rate": 9.779675337238928e-06, "loss": 0.7653, "num_input_tokens_seen": 57504136, "step": 99035 }, { "epoch": 14.75126601131963, "grad_norm": 2.293506145477295, "learning_rate": 9.777097655007197e-06, "loss": 0.4636, "num_input_tokens_seen": 57507016, "step": 99040 }, { "epoch": 14.75201072386059, "grad_norm": 0.9724615216255188, "learning_rate": 9.774520229950923e-06, "loss": 0.6758, "num_input_tokens_seen": 57509992, "step": 99045 }, { "epoch": 14.752755436401548, "grad_norm": 1.392879843711853, "learning_rate": 9.771943062113664e-06, "loss": 0.3785, "num_input_tokens_seen": 57513064, "step": 99050 }, { "epoch": 14.753500148942509, "grad_norm": 3.4979987144470215, "learning_rate": 9.76936615153894e-06, "loss": 0.8526, "num_input_tokens_seen": 57515816, "step": 99055 }, { "epoch": 14.754244861483468, "grad_norm": 1.9998286962509155, "learning_rate": 9.766789498270304e-06, "loss": 0.5486, "num_input_tokens_seen": 57518696, "step": 99060 }, { "epoch": 14.754989574024426, "grad_norm": 3.1106626987457275, "learning_rate": 9.764213102351275e-06, "loss": 0.511, "num_input_tokens_seen": 57521544, "step": 99065 }, { "epoch": 14.755734286565385, "grad_norm": 0.8497111201286316, "learning_rate": 9.761636963825382e-06, "loss": 0.5708, "num_input_tokens_seen": 57524264, "step": 99070 }, { "epoch": 14.756478999106346, "grad_norm": 1.9259814023971558, "learning_rate": 9.759061082736145e-06, "loss": 0.6059, "num_input_tokens_seen": 57527272, "step": 99075 }, { "epoch": 14.757223711647304, "grad_norm": 1.7265949249267578, "learning_rate": 9.756485459127073e-06, "loss": 0.5121, "num_input_tokens_seen": 57529960, "step": 99080 }, { "epoch": 14.757968424188263, "grad_norm": 1.1345170736312866, "learning_rate": 9.753910093041696e-06, "loss": 0.5842, "num_input_tokens_seen": 57532616, "step": 99085 }, { "epoch": 14.758713136729222, "grad_norm": 1.7817739248275757, "learning_rate": 9.751334984523502e-06, "loss": 0.59, "num_input_tokens_seen": 57535368, "step": 99090 }, { "epoch": 14.759457849270182, "grad_norm": 1.8120423555374146, "learning_rate": 9.748760133616015e-06, "loss": 0.6786, "num_input_tokens_seen": 57538216, "step": 99095 }, { "epoch": 14.760202561811141, "grad_norm": 1.3133207559585571, "learning_rate": 9.746185540362714e-06, "loss": 0.6068, "num_input_tokens_seen": 57541192, "step": 99100 }, { "epoch": 14.7609472743521, "grad_norm": 2.8276591300964355, "learning_rate": 9.743611204807118e-06, "loss": 0.7531, "num_input_tokens_seen": 57544040, "step": 99105 }, { "epoch": 14.761691986893059, "grad_norm": 1.2276679277420044, "learning_rate": 9.741037126992702e-06, "loss": 0.7107, "num_input_tokens_seen": 57546888, "step": 99110 }, { "epoch": 14.76243669943402, "grad_norm": 1.8353135585784912, "learning_rate": 9.738463306962947e-06, "loss": 0.5403, "num_input_tokens_seen": 57549640, "step": 99115 }, { "epoch": 14.763181411974978, "grad_norm": 1.7569832801818848, "learning_rate": 9.73588974476135e-06, "loss": 0.6265, "num_input_tokens_seen": 57552552, "step": 99120 }, { "epoch": 14.763926124515937, "grad_norm": 1.866698980331421, "learning_rate": 9.733316440431375e-06, "loss": 0.5935, "num_input_tokens_seen": 57555272, "step": 99125 }, { "epoch": 14.764670837056896, "grad_norm": 1.7412773370742798, "learning_rate": 9.730743394016512e-06, "loss": 0.5928, "num_input_tokens_seen": 57558152, "step": 99130 }, { "epoch": 14.765415549597854, "grad_norm": 1.071298360824585, "learning_rate": 9.72817060556022e-06, "loss": 0.4718, "num_input_tokens_seen": 57561224, "step": 99135 }, { "epoch": 14.766160262138815, "grad_norm": 1.8954731225967407, "learning_rate": 9.725598075105963e-06, "loss": 0.5084, "num_input_tokens_seen": 57564040, "step": 99140 }, { "epoch": 14.766904974679774, "grad_norm": 0.7939632534980774, "learning_rate": 9.723025802697195e-06, "loss": 0.4999, "num_input_tokens_seen": 57566792, "step": 99145 }, { "epoch": 14.767649687220732, "grad_norm": 1.5318461656570435, "learning_rate": 9.720453788377387e-06, "loss": 0.5202, "num_input_tokens_seen": 57569704, "step": 99150 }, { "epoch": 14.768394399761693, "grad_norm": 1.803723931312561, "learning_rate": 9.71788203218998e-06, "loss": 0.6642, "num_input_tokens_seen": 57572488, "step": 99155 }, { "epoch": 14.769139112302652, "grad_norm": 0.9562133550643921, "learning_rate": 9.71531053417842e-06, "loss": 0.5552, "num_input_tokens_seen": 57575560, "step": 99160 }, { "epoch": 14.76988382484361, "grad_norm": 1.9240225553512573, "learning_rate": 9.712739294386161e-06, "loss": 0.5152, "num_input_tokens_seen": 57578472, "step": 99165 }, { "epoch": 14.77062853738457, "grad_norm": 1.1042875051498413, "learning_rate": 9.710168312856626e-06, "loss": 0.5429, "num_input_tokens_seen": 57581224, "step": 99170 }, { "epoch": 14.771373249925528, "grad_norm": 1.5839823484420776, "learning_rate": 9.707597589633267e-06, "loss": 0.4886, "num_input_tokens_seen": 57584008, "step": 99175 }, { "epoch": 14.772117962466488, "grad_norm": 1.3139574527740479, "learning_rate": 9.705027124759495e-06, "loss": 0.509, "num_input_tokens_seen": 57586760, "step": 99180 }, { "epoch": 14.772862675007447, "grad_norm": 2.429924488067627, "learning_rate": 9.702456918278752e-06, "loss": 0.6487, "num_input_tokens_seen": 57589736, "step": 99185 }, { "epoch": 14.773607387548406, "grad_norm": 0.9373040199279785, "learning_rate": 9.69988697023445e-06, "loss": 0.4167, "num_input_tokens_seen": 57592744, "step": 99190 }, { "epoch": 14.774352100089365, "grad_norm": 2.0955264568328857, "learning_rate": 9.69731728067e-06, "loss": 0.6857, "num_input_tokens_seen": 57595688, "step": 99195 }, { "epoch": 14.775096812630325, "grad_norm": 1.698554277420044, "learning_rate": 9.694747849628833e-06, "loss": 0.6656, "num_input_tokens_seen": 57598632, "step": 99200 }, { "epoch": 14.775841525171284, "grad_norm": 1.7751336097717285, "learning_rate": 9.692178677154342e-06, "loss": 0.7736, "num_input_tokens_seen": 57601672, "step": 99205 }, { "epoch": 14.776586237712243, "grad_norm": 1.1917725801467896, "learning_rate": 9.689609763289936e-06, "loss": 0.6857, "num_input_tokens_seen": 57604520, "step": 99210 }, { "epoch": 14.777330950253202, "grad_norm": 2.498969793319702, "learning_rate": 9.687041108079003e-06, "loss": 0.3745, "num_input_tokens_seen": 57607432, "step": 99215 }, { "epoch": 14.778075662794162, "grad_norm": 2.3720524311065674, "learning_rate": 9.684472711564957e-06, "loss": 0.7014, "num_input_tokens_seen": 57610472, "step": 99220 }, { "epoch": 14.77882037533512, "grad_norm": 1.8049604892730713, "learning_rate": 9.681904573791168e-06, "loss": 0.5217, "num_input_tokens_seen": 57613064, "step": 99225 }, { "epoch": 14.77956508787608, "grad_norm": 1.6318377256393433, "learning_rate": 9.679336694801041e-06, "loss": 0.6548, "num_input_tokens_seen": 57615784, "step": 99230 }, { "epoch": 14.780309800417038, "grad_norm": 1.7653151750564575, "learning_rate": 9.67676907463795e-06, "loss": 0.4562, "num_input_tokens_seen": 57618504, "step": 99235 }, { "epoch": 14.781054512957999, "grad_norm": 1.3281240463256836, "learning_rate": 9.674201713345265e-06, "loss": 0.7502, "num_input_tokens_seen": 57621448, "step": 99240 }, { "epoch": 14.781799225498958, "grad_norm": 1.354257583618164, "learning_rate": 9.671634610966373e-06, "loss": 0.5776, "num_input_tokens_seen": 57624488, "step": 99245 }, { "epoch": 14.782543938039916, "grad_norm": 1.0286972522735596, "learning_rate": 9.669067767544626e-06, "loss": 0.5744, "num_input_tokens_seen": 57627560, "step": 99250 }, { "epoch": 14.783288650580875, "grad_norm": 1.0283089876174927, "learning_rate": 9.666501183123406e-06, "loss": 0.2394, "num_input_tokens_seen": 57630984, "step": 99255 }, { "epoch": 14.784033363121836, "grad_norm": 1.2332183122634888, "learning_rate": 9.663934857746065e-06, "loss": 0.3912, "num_input_tokens_seen": 57633736, "step": 99260 }, { "epoch": 14.784778075662794, "grad_norm": 2.0264899730682373, "learning_rate": 9.661368791455957e-06, "loss": 0.7021, "num_input_tokens_seen": 57636424, "step": 99265 }, { "epoch": 14.785522788203753, "grad_norm": 2.31697940826416, "learning_rate": 9.658802984296426e-06, "loss": 0.6559, "num_input_tokens_seen": 57639272, "step": 99270 }, { "epoch": 14.786267500744712, "grad_norm": 1.636860728263855, "learning_rate": 9.656237436310834e-06, "loss": 0.694, "num_input_tokens_seen": 57642280, "step": 99275 }, { "epoch": 14.787012213285673, "grad_norm": 3.9895644187927246, "learning_rate": 9.653672147542515e-06, "loss": 0.6125, "num_input_tokens_seen": 57645352, "step": 99280 }, { "epoch": 14.787756925826631, "grad_norm": 2.114980459213257, "learning_rate": 9.651107118034799e-06, "loss": 0.7621, "num_input_tokens_seen": 57648296, "step": 99285 }, { "epoch": 14.78850163836759, "grad_norm": 1.1882662773132324, "learning_rate": 9.648542347831041e-06, "loss": 0.4568, "num_input_tokens_seen": 57651080, "step": 99290 }, { "epoch": 14.789246350908549, "grad_norm": 1.3377907276153564, "learning_rate": 9.645977836974545e-06, "loss": 0.7038, "num_input_tokens_seen": 57654312, "step": 99295 }, { "epoch": 14.78999106344951, "grad_norm": 1.573423147201538, "learning_rate": 9.643413585508659e-06, "loss": 0.5902, "num_input_tokens_seen": 57657064, "step": 99300 }, { "epoch": 14.790735775990468, "grad_norm": 1.1720267534255981, "learning_rate": 9.640849593476684e-06, "loss": 0.4827, "num_input_tokens_seen": 57660136, "step": 99305 }, { "epoch": 14.791480488531427, "grad_norm": 1.0535203218460083, "learning_rate": 9.63828586092195e-06, "loss": 0.5733, "num_input_tokens_seen": 57662920, "step": 99310 }, { "epoch": 14.792225201072386, "grad_norm": 1.493899941444397, "learning_rate": 9.635722387887766e-06, "loss": 0.496, "num_input_tokens_seen": 57665992, "step": 99315 }, { "epoch": 14.792969913613344, "grad_norm": 0.9201103448867798, "learning_rate": 9.63315917441743e-06, "loss": 0.4974, "num_input_tokens_seen": 57669064, "step": 99320 }, { "epoch": 14.793714626154305, "grad_norm": 1.2836887836456299, "learning_rate": 9.630596220554259e-06, "loss": 0.586, "num_input_tokens_seen": 57671880, "step": 99325 }, { "epoch": 14.794459338695264, "grad_norm": 1.886792540550232, "learning_rate": 9.628033526341542e-06, "loss": 0.6093, "num_input_tokens_seen": 57674600, "step": 99330 }, { "epoch": 14.795204051236222, "grad_norm": 1.3300013542175293, "learning_rate": 9.625471091822576e-06, "loss": 0.5507, "num_input_tokens_seen": 57677512, "step": 99335 }, { "epoch": 14.795948763777183, "grad_norm": 2.2771267890930176, "learning_rate": 9.622908917040643e-06, "loss": 0.6025, "num_input_tokens_seen": 57680264, "step": 99340 }, { "epoch": 14.796693476318142, "grad_norm": 2.0771279335021973, "learning_rate": 9.620347002039042e-06, "loss": 0.7127, "num_input_tokens_seen": 57683240, "step": 99345 }, { "epoch": 14.7974381888591, "grad_norm": 2.974039316177368, "learning_rate": 9.61778534686105e-06, "loss": 0.5038, "num_input_tokens_seen": 57685832, "step": 99350 }, { "epoch": 14.79818290140006, "grad_norm": 0.9810683131217957, "learning_rate": 9.615223951549929e-06, "loss": 0.4501, "num_input_tokens_seen": 57688712, "step": 99355 }, { "epoch": 14.798927613941018, "grad_norm": 2.046322822570801, "learning_rate": 9.612662816148974e-06, "loss": 0.437, "num_input_tokens_seen": 57691656, "step": 99360 }, { "epoch": 14.799672326481979, "grad_norm": 2.323760986328125, "learning_rate": 9.61010194070143e-06, "loss": 0.4259, "num_input_tokens_seen": 57694440, "step": 99365 }, { "epoch": 14.800417039022937, "grad_norm": 1.6259934902191162, "learning_rate": 9.607541325250582e-06, "loss": 0.5656, "num_input_tokens_seen": 57697448, "step": 99370 }, { "epoch": 14.801161751563896, "grad_norm": 1.6056697368621826, "learning_rate": 9.604980969839672e-06, "loss": 0.5811, "num_input_tokens_seen": 57700168, "step": 99375 }, { "epoch": 14.801906464104855, "grad_norm": 1.9052518606185913, "learning_rate": 9.60242087451197e-06, "loss": 0.654, "num_input_tokens_seen": 57702984, "step": 99380 }, { "epoch": 14.802651176645815, "grad_norm": 1.1359403133392334, "learning_rate": 9.599861039310709e-06, "loss": 0.4839, "num_input_tokens_seen": 57705960, "step": 99385 }, { "epoch": 14.803395889186774, "grad_norm": 1.225022315979004, "learning_rate": 9.597301464279151e-06, "loss": 0.4464, "num_input_tokens_seen": 57708936, "step": 99390 }, { "epoch": 14.804140601727733, "grad_norm": 1.704992651939392, "learning_rate": 9.59474214946053e-06, "loss": 0.7647, "num_input_tokens_seen": 57711816, "step": 99395 }, { "epoch": 14.804885314268692, "grad_norm": 1.2793575525283813, "learning_rate": 9.592183094898086e-06, "loss": 0.6499, "num_input_tokens_seen": 57714600, "step": 99400 }, { "epoch": 14.805630026809652, "grad_norm": 1.2545299530029297, "learning_rate": 9.589624300635047e-06, "loss": 0.6924, "num_input_tokens_seen": 57717704, "step": 99405 }, { "epoch": 14.80637473935061, "grad_norm": 1.5463353395462036, "learning_rate": 9.587065766714635e-06, "loss": 0.5959, "num_input_tokens_seen": 57720232, "step": 99410 }, { "epoch": 14.80711945189157, "grad_norm": 0.8735523223876953, "learning_rate": 9.584507493180089e-06, "loss": 0.5959, "num_input_tokens_seen": 57723432, "step": 99415 }, { "epoch": 14.807864164432528, "grad_norm": 1.663020133972168, "learning_rate": 9.581949480074615e-06, "loss": 0.584, "num_input_tokens_seen": 57726120, "step": 99420 }, { "epoch": 14.808608876973489, "grad_norm": 1.5278507471084595, "learning_rate": 9.579391727441442e-06, "loss": 0.4602, "num_input_tokens_seen": 57728872, "step": 99425 }, { "epoch": 14.809353589514448, "grad_norm": 1.6068376302719116, "learning_rate": 9.576834235323773e-06, "loss": 0.7455, "num_input_tokens_seen": 57732072, "step": 99430 }, { "epoch": 14.810098302055406, "grad_norm": 1.9649540185928345, "learning_rate": 9.574277003764807e-06, "loss": 0.5412, "num_input_tokens_seen": 57734888, "step": 99435 }, { "epoch": 14.810843014596365, "grad_norm": 2.0653247833251953, "learning_rate": 9.571720032807758e-06, "loss": 0.6019, "num_input_tokens_seen": 57737896, "step": 99440 }, { "epoch": 14.811587727137326, "grad_norm": 1.388709306716919, "learning_rate": 9.569163322495811e-06, "loss": 0.5246, "num_input_tokens_seen": 57741224, "step": 99445 }, { "epoch": 14.812332439678285, "grad_norm": 1.1190474033355713, "learning_rate": 9.566606872872178e-06, "loss": 0.5222, "num_input_tokens_seen": 57744008, "step": 99450 }, { "epoch": 14.813077152219243, "grad_norm": 1.2241888046264648, "learning_rate": 9.564050683980025e-06, "loss": 0.5512, "num_input_tokens_seen": 57746888, "step": 99455 }, { "epoch": 14.813821864760202, "grad_norm": 1.473313331604004, "learning_rate": 9.561494755862554e-06, "loss": 0.7074, "num_input_tokens_seen": 57749768, "step": 99460 }, { "epoch": 14.814566577301163, "grad_norm": 1.4108388423919678, "learning_rate": 9.55893908856294e-06, "loss": 0.6403, "num_input_tokens_seen": 57752872, "step": 99465 }, { "epoch": 14.815311289842121, "grad_norm": 0.807072639465332, "learning_rate": 9.55638368212436e-06, "loss": 0.5255, "num_input_tokens_seen": 57755560, "step": 99470 }, { "epoch": 14.81605600238308, "grad_norm": 2.5036184787750244, "learning_rate": 9.553828536589976e-06, "loss": 0.6651, "num_input_tokens_seen": 57758472, "step": 99475 }, { "epoch": 14.816800714924039, "grad_norm": 1.3253538608551025, "learning_rate": 9.551273652002955e-06, "loss": 0.5698, "num_input_tokens_seen": 57761192, "step": 99480 }, { "epoch": 14.817545427465, "grad_norm": 1.3801274299621582, "learning_rate": 9.548719028406472e-06, "loss": 0.5583, "num_input_tokens_seen": 57764136, "step": 99485 }, { "epoch": 14.818290140005958, "grad_norm": 1.2967069149017334, "learning_rate": 9.546164665843669e-06, "loss": 0.4899, "num_input_tokens_seen": 57767112, "step": 99490 }, { "epoch": 14.819034852546917, "grad_norm": 1.6200624704360962, "learning_rate": 9.543610564357714e-06, "loss": 0.65, "num_input_tokens_seen": 57770248, "step": 99495 }, { "epoch": 14.819779565087876, "grad_norm": 1.3537527322769165, "learning_rate": 9.541056723991739e-06, "loss": 0.4388, "num_input_tokens_seen": 57773128, "step": 99500 }, { "epoch": 14.820524277628834, "grad_norm": 1.130548119544983, "learning_rate": 9.538503144788914e-06, "loss": 0.5567, "num_input_tokens_seen": 57776328, "step": 99505 }, { "epoch": 14.821268990169795, "grad_norm": 1.8630363941192627, "learning_rate": 9.535949826792358e-06, "loss": 0.8819, "num_input_tokens_seen": 57779016, "step": 99510 }, { "epoch": 14.822013702710754, "grad_norm": 2.6831490993499756, "learning_rate": 9.533396770045208e-06, "loss": 0.572, "num_input_tokens_seen": 57781832, "step": 99515 }, { "epoch": 14.822758415251712, "grad_norm": 2.6307549476623535, "learning_rate": 9.530843974590606e-06, "loss": 0.7015, "num_input_tokens_seen": 57784616, "step": 99520 }, { "epoch": 14.823503127792671, "grad_norm": 0.9719961881637573, "learning_rate": 9.528291440471665e-06, "loss": 0.57, "num_input_tokens_seen": 57787624, "step": 99525 }, { "epoch": 14.824247840333632, "grad_norm": 1.0201590061187744, "learning_rate": 9.525739167731527e-06, "loss": 0.517, "num_input_tokens_seen": 57790440, "step": 99530 }, { "epoch": 14.82499255287459, "grad_norm": 1.636629581451416, "learning_rate": 9.523187156413294e-06, "loss": 0.6325, "num_input_tokens_seen": 57793192, "step": 99535 }, { "epoch": 14.82573726541555, "grad_norm": 2.057103395462036, "learning_rate": 9.520635406560086e-06, "loss": 0.5793, "num_input_tokens_seen": 57795976, "step": 99540 }, { "epoch": 14.826481977956508, "grad_norm": 1.4861574172973633, "learning_rate": 9.518083918215e-06, "loss": 0.7327, "num_input_tokens_seen": 57798632, "step": 99545 }, { "epoch": 14.827226690497469, "grad_norm": 1.2574055194854736, "learning_rate": 9.515532691421162e-06, "loss": 0.787, "num_input_tokens_seen": 57801352, "step": 99550 }, { "epoch": 14.827971403038427, "grad_norm": 1.2955636978149414, "learning_rate": 9.512981726221661e-06, "loss": 0.4638, "num_input_tokens_seen": 57804296, "step": 99555 }, { "epoch": 14.828716115579386, "grad_norm": 1.831391453742981, "learning_rate": 9.510431022659586e-06, "loss": 0.5376, "num_input_tokens_seen": 57807432, "step": 99560 }, { "epoch": 14.829460828120345, "grad_norm": 2.395967721939087, "learning_rate": 9.507880580778042e-06, "loss": 0.6301, "num_input_tokens_seen": 57810024, "step": 99565 }, { "epoch": 14.830205540661305, "grad_norm": 1.5273842811584473, "learning_rate": 9.505330400620101e-06, "loss": 0.5375, "num_input_tokens_seen": 57812808, "step": 99570 }, { "epoch": 14.830950253202264, "grad_norm": 1.616044282913208, "learning_rate": 9.502780482228866e-06, "loss": 0.642, "num_input_tokens_seen": 57815688, "step": 99575 }, { "epoch": 14.831694965743223, "grad_norm": 3.1287055015563965, "learning_rate": 9.500230825647394e-06, "loss": 0.582, "num_input_tokens_seen": 57818664, "step": 99580 }, { "epoch": 14.832439678284182, "grad_norm": 2.704392433166504, "learning_rate": 9.497681430918778e-06, "loss": 0.6639, "num_input_tokens_seen": 57821608, "step": 99585 }, { "epoch": 14.833184390825142, "grad_norm": 0.86911940574646, "learning_rate": 9.495132298086079e-06, "loss": 0.4897, "num_input_tokens_seen": 57824264, "step": 99590 }, { "epoch": 14.833929103366101, "grad_norm": 0.8734671473503113, "learning_rate": 9.492583427192361e-06, "loss": 0.4968, "num_input_tokens_seen": 57826952, "step": 99595 }, { "epoch": 14.83467381590706, "grad_norm": 1.111710548400879, "learning_rate": 9.490034818280677e-06, "loss": 0.6359, "num_input_tokens_seen": 57829864, "step": 99600 }, { "epoch": 14.835418528448018, "grad_norm": 0.9502987861633301, "learning_rate": 9.487486471394096e-06, "loss": 0.6031, "num_input_tokens_seen": 57832456, "step": 99605 }, { "epoch": 14.836163240988979, "grad_norm": 1.9781278371810913, "learning_rate": 9.48493838657567e-06, "loss": 0.7355, "num_input_tokens_seen": 57835624, "step": 99610 }, { "epoch": 14.836907953529938, "grad_norm": 0.719369649887085, "learning_rate": 9.482390563868429e-06, "loss": 0.4863, "num_input_tokens_seen": 57838504, "step": 99615 }, { "epoch": 14.837652666070897, "grad_norm": 3.2849347591400146, "learning_rate": 9.479843003315439e-06, "loss": 0.6889, "num_input_tokens_seen": 57841480, "step": 99620 }, { "epoch": 14.838397378611855, "grad_norm": 2.3745455741882324, "learning_rate": 9.477295704959718e-06, "loss": 0.6867, "num_input_tokens_seen": 57844328, "step": 99625 }, { "epoch": 14.839142091152816, "grad_norm": 2.042964458465576, "learning_rate": 9.474748668844316e-06, "loss": 0.5938, "num_input_tokens_seen": 57847240, "step": 99630 }, { "epoch": 14.839886803693775, "grad_norm": 1.7328784465789795, "learning_rate": 9.47220189501226e-06, "loss": 0.5667, "num_input_tokens_seen": 57850120, "step": 99635 }, { "epoch": 14.840631516234733, "grad_norm": 1.7167425155639648, "learning_rate": 9.46965538350656e-06, "loss": 0.6235, "num_input_tokens_seen": 57853160, "step": 99640 }, { "epoch": 14.841376228775692, "grad_norm": 0.6691939234733582, "learning_rate": 9.467109134370255e-06, "loss": 0.4635, "num_input_tokens_seen": 57856040, "step": 99645 }, { "epoch": 14.842120941316653, "grad_norm": 1.422847867012024, "learning_rate": 9.46456314764635e-06, "loss": 0.4846, "num_input_tokens_seen": 57859016, "step": 99650 }, { "epoch": 14.842865653857611, "grad_norm": 1.3480275869369507, "learning_rate": 9.462017423377867e-06, "loss": 0.4499, "num_input_tokens_seen": 57862024, "step": 99655 }, { "epoch": 14.84361036639857, "grad_norm": 1.3706976175308228, "learning_rate": 9.459471961607808e-06, "loss": 0.6104, "num_input_tokens_seen": 57864968, "step": 99660 }, { "epoch": 14.844355078939529, "grad_norm": 1.4394983053207397, "learning_rate": 9.456926762379175e-06, "loss": 0.4397, "num_input_tokens_seen": 57867656, "step": 99665 }, { "epoch": 14.84509979148049, "grad_norm": 1.429430603981018, "learning_rate": 9.45438182573496e-06, "loss": 0.5518, "num_input_tokens_seen": 57870568, "step": 99670 }, { "epoch": 14.845844504021448, "grad_norm": 1.3267748355865479, "learning_rate": 9.451837151718171e-06, "loss": 0.5981, "num_input_tokens_seen": 57873288, "step": 99675 }, { "epoch": 14.846589216562407, "grad_norm": 3.8122856616973877, "learning_rate": 9.449292740371793e-06, "loss": 0.4712, "num_input_tokens_seen": 57876328, "step": 99680 }, { "epoch": 14.847333929103366, "grad_norm": 1.45810067653656, "learning_rate": 9.4467485917388e-06, "loss": 0.5585, "num_input_tokens_seen": 57879208, "step": 99685 }, { "epoch": 14.848078641644324, "grad_norm": 0.7039119601249695, "learning_rate": 9.444204705862189e-06, "loss": 0.4894, "num_input_tokens_seen": 57882184, "step": 99690 }, { "epoch": 14.848823354185285, "grad_norm": 1.1382402181625366, "learning_rate": 9.441661082784919e-06, "loss": 0.4531, "num_input_tokens_seen": 57884808, "step": 99695 }, { "epoch": 14.849568066726244, "grad_norm": 2.1700782775878906, "learning_rate": 9.439117722549984e-06, "loss": 0.7627, "num_input_tokens_seen": 57888072, "step": 99700 }, { "epoch": 14.850312779267203, "grad_norm": 1.7776741981506348, "learning_rate": 9.436574625200332e-06, "loss": 0.6309, "num_input_tokens_seen": 57890856, "step": 99705 }, { "epoch": 14.851057491808161, "grad_norm": 1.8077625036239624, "learning_rate": 9.434031790778941e-06, "loss": 0.6825, "num_input_tokens_seen": 57894088, "step": 99710 }, { "epoch": 14.851802204349122, "grad_norm": 1.8418657779693604, "learning_rate": 9.431489219328759e-06, "loss": 0.7138, "num_input_tokens_seen": 57896776, "step": 99715 }, { "epoch": 14.85254691689008, "grad_norm": 1.4512608051300049, "learning_rate": 9.42894691089274e-06, "loss": 0.5523, "num_input_tokens_seen": 57899976, "step": 99720 }, { "epoch": 14.85329162943104, "grad_norm": 2.3421688079833984, "learning_rate": 9.426404865513843e-06, "loss": 0.7082, "num_input_tokens_seen": 57902728, "step": 99725 }, { "epoch": 14.854036341971998, "grad_norm": 1.0685769319534302, "learning_rate": 9.42386308323501e-06, "loss": 0.7064, "num_input_tokens_seen": 57905800, "step": 99730 }, { "epoch": 14.854781054512959, "grad_norm": 1.3144625425338745, "learning_rate": 9.421321564099175e-06, "loss": 0.6158, "num_input_tokens_seen": 57908808, "step": 99735 }, { "epoch": 14.855525767053917, "grad_norm": 1.6153020858764648, "learning_rate": 9.418780308149276e-06, "loss": 0.6371, "num_input_tokens_seen": 57911784, "step": 99740 }, { "epoch": 14.856270479594876, "grad_norm": 1.2920570373535156, "learning_rate": 9.416239315428252e-06, "loss": 0.5934, "num_input_tokens_seen": 57914632, "step": 99745 }, { "epoch": 14.857015192135835, "grad_norm": 1.7073441743850708, "learning_rate": 9.413698585979016e-06, "loss": 0.5026, "num_input_tokens_seen": 57917448, "step": 99750 }, { "epoch": 14.857759904676795, "grad_norm": 0.9493374824523926, "learning_rate": 9.411158119844512e-06, "loss": 0.57, "num_input_tokens_seen": 57920104, "step": 99755 }, { "epoch": 14.858504617217754, "grad_norm": 1.7296826839447021, "learning_rate": 9.40861791706765e-06, "loss": 0.6952, "num_input_tokens_seen": 57922728, "step": 99760 }, { "epoch": 14.859249329758713, "grad_norm": 2.6172304153442383, "learning_rate": 9.40607797769133e-06, "loss": 0.6385, "num_input_tokens_seen": 57925896, "step": 99765 }, { "epoch": 14.859994042299672, "grad_norm": 2.009580612182617, "learning_rate": 9.403538301758486e-06, "loss": 0.6952, "num_input_tokens_seen": 57929096, "step": 99770 }, { "epoch": 14.860738754840632, "grad_norm": 3.955268144607544, "learning_rate": 9.400998889311999e-06, "loss": 0.5389, "num_input_tokens_seen": 57932136, "step": 99775 }, { "epoch": 14.861483467381591, "grad_norm": 1.34544038772583, "learning_rate": 9.398459740394792e-06, "loss": 0.4983, "num_input_tokens_seen": 57935400, "step": 99780 }, { "epoch": 14.86222817992255, "grad_norm": 1.374945044517517, "learning_rate": 9.395920855049739e-06, "loss": 0.6013, "num_input_tokens_seen": 57938152, "step": 99785 }, { "epoch": 14.862972892463509, "grad_norm": 0.6449264883995056, "learning_rate": 9.393382233319757e-06, "loss": 0.5377, "num_input_tokens_seen": 57940840, "step": 99790 }, { "epoch": 14.863717605004469, "grad_norm": 1.235944151878357, "learning_rate": 9.390843875247717e-06, "loss": 0.5425, "num_input_tokens_seen": 57943880, "step": 99795 }, { "epoch": 14.864462317545428, "grad_norm": 1.9290761947631836, "learning_rate": 9.388305780876508e-06, "loss": 0.6139, "num_input_tokens_seen": 57946792, "step": 99800 }, { "epoch": 14.865207030086387, "grad_norm": 1.2458744049072266, "learning_rate": 9.385767950249003e-06, "loss": 0.6056, "num_input_tokens_seen": 57949672, "step": 99805 }, { "epoch": 14.865951742627345, "grad_norm": 1.3376972675323486, "learning_rate": 9.383230383408073e-06, "loss": 0.5549, "num_input_tokens_seen": 57952648, "step": 99810 }, { "epoch": 14.866696455168306, "grad_norm": 1.4912219047546387, "learning_rate": 9.380693080396599e-06, "loss": 0.5609, "num_input_tokens_seen": 57955176, "step": 99815 }, { "epoch": 14.867441167709265, "grad_norm": 1.680413842201233, "learning_rate": 9.378156041257436e-06, "loss": 0.5894, "num_input_tokens_seen": 57958280, "step": 99820 }, { "epoch": 14.868185880250223, "grad_norm": 1.9708023071289062, "learning_rate": 9.375619266033456e-06, "loss": 0.4013, "num_input_tokens_seen": 57961224, "step": 99825 }, { "epoch": 14.868930592791182, "grad_norm": 1.5625649690628052, "learning_rate": 9.373082754767497e-06, "loss": 0.6813, "num_input_tokens_seen": 57964232, "step": 99830 }, { "epoch": 14.86967530533214, "grad_norm": 1.684072494506836, "learning_rate": 9.370546507502433e-06, "loss": 0.534, "num_input_tokens_seen": 57967176, "step": 99835 }, { "epoch": 14.870420017873101, "grad_norm": 0.9844964146614075, "learning_rate": 9.368010524281104e-06, "loss": 0.6422, "num_input_tokens_seen": 57970024, "step": 99840 }, { "epoch": 14.87116473041406, "grad_norm": 1.5135924816131592, "learning_rate": 9.365474805146337e-06, "loss": 0.6876, "num_input_tokens_seen": 57973000, "step": 99845 }, { "epoch": 14.871909442955019, "grad_norm": 2.0443637371063232, "learning_rate": 9.362939350140992e-06, "loss": 0.6468, "num_input_tokens_seen": 57976296, "step": 99850 }, { "epoch": 14.87265415549598, "grad_norm": 1.340214490890503, "learning_rate": 9.360404159307887e-06, "loss": 0.6102, "num_input_tokens_seen": 57979080, "step": 99855 }, { "epoch": 14.873398868036938, "grad_norm": 2.1940364837646484, "learning_rate": 9.357869232689867e-06, "loss": 0.4875, "num_input_tokens_seen": 57981800, "step": 99860 }, { "epoch": 14.874143580577897, "grad_norm": 1.5944136381149292, "learning_rate": 9.355334570329746e-06, "loss": 0.4903, "num_input_tokens_seen": 57984424, "step": 99865 }, { "epoch": 14.874888293118856, "grad_norm": 2.2700092792510986, "learning_rate": 9.352800172270352e-06, "loss": 0.6111, "num_input_tokens_seen": 57987304, "step": 99870 }, { "epoch": 14.875633005659815, "grad_norm": 1.4068959951400757, "learning_rate": 9.35026603855449e-06, "loss": 0.7174, "num_input_tokens_seen": 57990088, "step": 99875 }, { "epoch": 14.876377718200775, "grad_norm": 1.8712881803512573, "learning_rate": 9.347732169224972e-06, "loss": 0.7783, "num_input_tokens_seen": 57992904, "step": 99880 }, { "epoch": 14.877122430741734, "grad_norm": 1.6252800226211548, "learning_rate": 9.345198564324616e-06, "loss": 0.6674, "num_input_tokens_seen": 57995688, "step": 99885 }, { "epoch": 14.877867143282693, "grad_norm": 1.7491599321365356, "learning_rate": 9.342665223896216e-06, "loss": 0.5663, "num_input_tokens_seen": 57998632, "step": 99890 }, { "epoch": 14.878611855823651, "grad_norm": 1.4273955821990967, "learning_rate": 9.34013214798258e-06, "loss": 0.4287, "num_input_tokens_seen": 58001320, "step": 99895 }, { "epoch": 14.879356568364612, "grad_norm": 1.201805830001831, "learning_rate": 9.337599336626488e-06, "loss": 0.5077, "num_input_tokens_seen": 58004520, "step": 99900 }, { "epoch": 14.88010128090557, "grad_norm": 2.963470935821533, "learning_rate": 9.335066789870741e-06, "loss": 0.7009, "num_input_tokens_seen": 58007464, "step": 99905 }, { "epoch": 14.88084599344653, "grad_norm": 1.0804495811462402, "learning_rate": 9.332534507758114e-06, "loss": 0.6903, "num_input_tokens_seen": 58010312, "step": 99910 }, { "epoch": 14.881590705987488, "grad_norm": 0.5359153151512146, "learning_rate": 9.330002490331402e-06, "loss": 0.5615, "num_input_tokens_seen": 58013160, "step": 99915 }, { "epoch": 14.882335418528449, "grad_norm": 3.1506590843200684, "learning_rate": 9.32747073763337e-06, "loss": 0.7368, "num_input_tokens_seen": 58016104, "step": 99920 }, { "epoch": 14.883080131069407, "grad_norm": 2.5736982822418213, "learning_rate": 9.324939249706793e-06, "loss": 0.7263, "num_input_tokens_seen": 58019016, "step": 99925 }, { "epoch": 14.883824843610366, "grad_norm": 0.9861221313476562, "learning_rate": 9.322408026594427e-06, "loss": 0.557, "num_input_tokens_seen": 58021864, "step": 99930 }, { "epoch": 14.884569556151325, "grad_norm": 1.688306450843811, "learning_rate": 9.319877068339051e-06, "loss": 0.5361, "num_input_tokens_seen": 58024584, "step": 99935 }, { "epoch": 14.885314268692285, "grad_norm": 2.884648323059082, "learning_rate": 9.317346374983416e-06, "loss": 0.4933, "num_input_tokens_seen": 58027624, "step": 99940 }, { "epoch": 14.886058981233244, "grad_norm": 1.1237677335739136, "learning_rate": 9.314815946570263e-06, "loss": 0.5601, "num_input_tokens_seen": 58030792, "step": 99945 }, { "epoch": 14.886803693774203, "grad_norm": 1.424522042274475, "learning_rate": 9.312285783142366e-06, "loss": 0.8054, "num_input_tokens_seen": 58034056, "step": 99950 }, { "epoch": 14.887548406315162, "grad_norm": 2.069790840148926, "learning_rate": 9.309755884742455e-06, "loss": 0.7343, "num_input_tokens_seen": 58036712, "step": 99955 }, { "epoch": 14.888293118856122, "grad_norm": 1.047548770904541, "learning_rate": 9.307226251413262e-06, "loss": 0.5342, "num_input_tokens_seen": 58039496, "step": 99960 }, { "epoch": 14.889037831397081, "grad_norm": 1.1698452234268188, "learning_rate": 9.304696883197542e-06, "loss": 0.5774, "num_input_tokens_seen": 58042216, "step": 99965 }, { "epoch": 14.88978254393804, "grad_norm": 1.2274649143218994, "learning_rate": 9.302167780138005e-06, "loss": 0.6359, "num_input_tokens_seen": 58045128, "step": 99970 }, { "epoch": 14.890527256478999, "grad_norm": 1.5169442892074585, "learning_rate": 9.2996389422774e-06, "loss": 0.6529, "num_input_tokens_seen": 58047816, "step": 99975 }, { "epoch": 14.891271969019959, "grad_norm": 1.1186103820800781, "learning_rate": 9.297110369658426e-06, "loss": 0.5107, "num_input_tokens_seen": 58050376, "step": 99980 }, { "epoch": 14.892016681560918, "grad_norm": 1.842421531677246, "learning_rate": 9.294582062323825e-06, "loss": 0.5391, "num_input_tokens_seen": 58053416, "step": 99985 }, { "epoch": 14.892761394101877, "grad_norm": 1.2237414121627808, "learning_rate": 9.292054020316297e-06, "loss": 0.8067, "num_input_tokens_seen": 58056328, "step": 99990 }, { "epoch": 14.893506106642835, "grad_norm": 1.3442602157592773, "learning_rate": 9.28952624367855e-06, "loss": 0.6215, "num_input_tokens_seen": 58059368, "step": 99995 }, { "epoch": 14.894250819183796, "grad_norm": 1.9459058046340942, "learning_rate": 9.286998732453292e-06, "loss": 0.5797, "num_input_tokens_seen": 58062504, "step": 100000 }, { "epoch": 14.894995531724755, "grad_norm": 3.3623881340026855, "learning_rate": 9.28447148668321e-06, "loss": 0.6564, "num_input_tokens_seen": 58065608, "step": 100005 }, { "epoch": 14.895740244265713, "grad_norm": 1.9623581171035767, "learning_rate": 9.28194450641102e-06, "loss": 0.5431, "num_input_tokens_seen": 58068616, "step": 100010 }, { "epoch": 14.896484956806672, "grad_norm": 2.2743427753448486, "learning_rate": 9.27941779167939e-06, "loss": 0.6455, "num_input_tokens_seen": 58071400, "step": 100015 }, { "epoch": 14.897229669347631, "grad_norm": 1.523714303970337, "learning_rate": 9.27689134253103e-06, "loss": 0.5638, "num_input_tokens_seen": 58074344, "step": 100020 }, { "epoch": 14.897974381888591, "grad_norm": 1.8113151788711548, "learning_rate": 9.274365159008602e-06, "loss": 0.6323, "num_input_tokens_seen": 58077256, "step": 100025 }, { "epoch": 14.89871909442955, "grad_norm": 1.793031096458435, "learning_rate": 9.2718392411548e-06, "loss": 0.6201, "num_input_tokens_seen": 58080072, "step": 100030 }, { "epoch": 14.899463806970509, "grad_norm": 2.975405216217041, "learning_rate": 9.26931358901229e-06, "loss": 0.485, "num_input_tokens_seen": 58082696, "step": 100035 }, { "epoch": 14.900208519511468, "grad_norm": 1.1860074996948242, "learning_rate": 9.26678820262373e-06, "loss": 0.506, "num_input_tokens_seen": 58085832, "step": 100040 }, { "epoch": 14.900953232052428, "grad_norm": 1.992524266242981, "learning_rate": 9.2642630820318e-06, "loss": 0.642, "num_input_tokens_seen": 58088776, "step": 100045 }, { "epoch": 14.901697944593387, "grad_norm": 2.872429370880127, "learning_rate": 9.261738227279144e-06, "loss": 0.6564, "num_input_tokens_seen": 58091688, "step": 100050 }, { "epoch": 14.902442657134346, "grad_norm": 1.2813537120819092, "learning_rate": 9.259213638408434e-06, "loss": 0.6985, "num_input_tokens_seen": 58094408, "step": 100055 }, { "epoch": 14.903187369675305, "grad_norm": 0.8195048570632935, "learning_rate": 9.25668931546231e-06, "loss": 0.4431, "num_input_tokens_seen": 58097288, "step": 100060 }, { "epoch": 14.903932082216265, "grad_norm": 1.151330590248108, "learning_rate": 9.254165258483421e-06, "loss": 0.5254, "num_input_tokens_seen": 58100360, "step": 100065 }, { "epoch": 14.904676794757224, "grad_norm": 1.285556435585022, "learning_rate": 9.251641467514399e-06, "loss": 0.5513, "num_input_tokens_seen": 58103336, "step": 100070 }, { "epoch": 14.905421507298183, "grad_norm": 1.7790744304656982, "learning_rate": 9.249117942597895e-06, "loss": 0.4902, "num_input_tokens_seen": 58106120, "step": 100075 }, { "epoch": 14.906166219839141, "grad_norm": 1.8465442657470703, "learning_rate": 9.246594683776536e-06, "loss": 0.5897, "num_input_tokens_seen": 58109224, "step": 100080 }, { "epoch": 14.906910932380102, "grad_norm": 1.9603712558746338, "learning_rate": 9.244071691092937e-06, "loss": 0.6049, "num_input_tokens_seen": 58111880, "step": 100085 }, { "epoch": 14.90765564492106, "grad_norm": 1.3385611772537231, "learning_rate": 9.241548964589747e-06, "loss": 0.5485, "num_input_tokens_seen": 58114728, "step": 100090 }, { "epoch": 14.90840035746202, "grad_norm": 1.8593860864639282, "learning_rate": 9.239026504309558e-06, "loss": 0.5899, "num_input_tokens_seen": 58117640, "step": 100095 }, { "epoch": 14.909145070002978, "grad_norm": 1.4487659931182861, "learning_rate": 9.236504310295007e-06, "loss": 0.5892, "num_input_tokens_seen": 58120936, "step": 100100 }, { "epoch": 14.909889782543939, "grad_norm": 1.0764405727386475, "learning_rate": 9.233982382588688e-06, "loss": 0.5419, "num_input_tokens_seen": 58124264, "step": 100105 }, { "epoch": 14.910634495084897, "grad_norm": 1.8512650728225708, "learning_rate": 9.23146072123322e-06, "loss": 0.4347, "num_input_tokens_seen": 58127080, "step": 100110 }, { "epoch": 14.911379207625856, "grad_norm": 2.3624725341796875, "learning_rate": 9.228939326271197e-06, "loss": 0.4741, "num_input_tokens_seen": 58129928, "step": 100115 }, { "epoch": 14.912123920166815, "grad_norm": 1.3495745658874512, "learning_rate": 9.226418197745206e-06, "loss": 0.587, "num_input_tokens_seen": 58132904, "step": 100120 }, { "epoch": 14.912868632707776, "grad_norm": 1.7544007301330566, "learning_rate": 9.223897335697856e-06, "loss": 0.5465, "num_input_tokens_seen": 58136008, "step": 100125 }, { "epoch": 14.913613345248734, "grad_norm": 1.5933032035827637, "learning_rate": 9.221376740171727e-06, "loss": 0.5982, "num_input_tokens_seen": 58138856, "step": 100130 }, { "epoch": 14.914358057789693, "grad_norm": 1.4008285999298096, "learning_rate": 9.2188564112094e-06, "loss": 0.4831, "num_input_tokens_seen": 58141800, "step": 100135 }, { "epoch": 14.915102770330652, "grad_norm": 1.2818388938903809, "learning_rate": 9.216336348853449e-06, "loss": 0.5828, "num_input_tokens_seen": 58144680, "step": 100140 }, { "epoch": 14.915847482871612, "grad_norm": 0.9416126608848572, "learning_rate": 9.213816553146462e-06, "loss": 0.6128, "num_input_tokens_seen": 58147624, "step": 100145 }, { "epoch": 14.916592195412571, "grad_norm": 1.4111380577087402, "learning_rate": 9.211297024130989e-06, "loss": 0.6861, "num_input_tokens_seen": 58150216, "step": 100150 }, { "epoch": 14.91733690795353, "grad_norm": 2.801145553588867, "learning_rate": 9.208777761849616e-06, "loss": 0.7314, "num_input_tokens_seen": 58153096, "step": 100155 }, { "epoch": 14.918081620494489, "grad_norm": 2.041247844696045, "learning_rate": 9.20625876634489e-06, "loss": 0.6512, "num_input_tokens_seen": 58155784, "step": 100160 }, { "epoch": 14.91882633303545, "grad_norm": 0.7286139726638794, "learning_rate": 9.203740037659367e-06, "loss": 0.3362, "num_input_tokens_seen": 58159016, "step": 100165 }, { "epoch": 14.919571045576408, "grad_norm": 1.1788800954818726, "learning_rate": 9.201221575835608e-06, "loss": 0.5713, "num_input_tokens_seen": 58161736, "step": 100170 }, { "epoch": 14.920315758117367, "grad_norm": 2.4886183738708496, "learning_rate": 9.198703380916143e-06, "loss": 0.6185, "num_input_tokens_seen": 58164968, "step": 100175 }, { "epoch": 14.921060470658325, "grad_norm": 1.9077391624450684, "learning_rate": 9.196185452943534e-06, "loss": 0.699, "num_input_tokens_seen": 58168008, "step": 100180 }, { "epoch": 14.921805183199286, "grad_norm": 1.273380994796753, "learning_rate": 9.193667791960303e-06, "loss": 0.5443, "num_input_tokens_seen": 58171112, "step": 100185 }, { "epoch": 14.922549895740245, "grad_norm": 1.7245153188705444, "learning_rate": 9.191150398008996e-06, "loss": 0.3957, "num_input_tokens_seen": 58173960, "step": 100190 }, { "epoch": 14.923294608281203, "grad_norm": 1.936266303062439, "learning_rate": 9.188633271132135e-06, "loss": 0.4844, "num_input_tokens_seen": 58177032, "step": 100195 }, { "epoch": 14.924039320822162, "grad_norm": 1.312319040298462, "learning_rate": 9.186116411372248e-06, "loss": 0.7995, "num_input_tokens_seen": 58180040, "step": 100200 }, { "epoch": 14.924784033363121, "grad_norm": 2.0484044551849365, "learning_rate": 9.183599818771849e-06, "loss": 0.6469, "num_input_tokens_seen": 58182920, "step": 100205 }, { "epoch": 14.925528745904082, "grad_norm": 2.4417362213134766, "learning_rate": 9.181083493373449e-06, "loss": 0.5647, "num_input_tokens_seen": 58185672, "step": 100210 }, { "epoch": 14.92627345844504, "grad_norm": 0.7971299290657043, "learning_rate": 9.178567435219574e-06, "loss": 0.417, "num_input_tokens_seen": 58188392, "step": 100215 }, { "epoch": 14.927018170985999, "grad_norm": 1.8459159135818481, "learning_rate": 9.176051644352713e-06, "loss": 0.5211, "num_input_tokens_seen": 58191432, "step": 100220 }, { "epoch": 14.927762883526958, "grad_norm": 1.113908290863037, "learning_rate": 9.173536120815385e-06, "loss": 0.5799, "num_input_tokens_seen": 58194472, "step": 100225 }, { "epoch": 14.928507596067918, "grad_norm": 2.3661999702453613, "learning_rate": 9.171020864650071e-06, "loss": 0.6387, "num_input_tokens_seen": 58197384, "step": 100230 }, { "epoch": 14.929252308608877, "grad_norm": 1.5667400360107422, "learning_rate": 9.16850587589928e-06, "loss": 0.6348, "num_input_tokens_seen": 58200488, "step": 100235 }, { "epoch": 14.929997021149836, "grad_norm": 2.1753785610198975, "learning_rate": 9.16599115460549e-06, "loss": 0.5236, "num_input_tokens_seen": 58203176, "step": 100240 }, { "epoch": 14.930741733690795, "grad_norm": 1.2152056694030762, "learning_rate": 9.16347670081118e-06, "loss": 0.4855, "num_input_tokens_seen": 58206152, "step": 100245 }, { "epoch": 14.931486446231755, "grad_norm": 2.814265251159668, "learning_rate": 9.160962514558843e-06, "loss": 0.6815, "num_input_tokens_seen": 58209384, "step": 100250 }, { "epoch": 14.932231158772714, "grad_norm": 1.4108731746673584, "learning_rate": 9.158448595890948e-06, "loss": 0.5726, "num_input_tokens_seen": 58212168, "step": 100255 }, { "epoch": 14.932975871313673, "grad_norm": 1.4624032974243164, "learning_rate": 9.155934944849953e-06, "loss": 0.8389, "num_input_tokens_seen": 58215208, "step": 100260 }, { "epoch": 14.933720583854631, "grad_norm": 2.0351078510284424, "learning_rate": 9.153421561478346e-06, "loss": 0.6116, "num_input_tokens_seen": 58218184, "step": 100265 }, { "epoch": 14.934465296395592, "grad_norm": 3.5828235149383545, "learning_rate": 9.150908445818571e-06, "loss": 0.6914, "num_input_tokens_seen": 58221256, "step": 100270 }, { "epoch": 14.93521000893655, "grad_norm": 1.3947324752807617, "learning_rate": 9.148395597913085e-06, "loss": 0.7259, "num_input_tokens_seen": 58224008, "step": 100275 }, { "epoch": 14.93595472147751, "grad_norm": 1.1679919958114624, "learning_rate": 9.14588301780435e-06, "loss": 0.5288, "num_input_tokens_seen": 58226888, "step": 100280 }, { "epoch": 14.936699434018468, "grad_norm": 2.706787109375, "learning_rate": 9.14337070553481e-06, "loss": 0.7137, "num_input_tokens_seen": 58229800, "step": 100285 }, { "epoch": 14.937444146559429, "grad_norm": 1.3095200061798096, "learning_rate": 9.140858661146897e-06, "loss": 0.5967, "num_input_tokens_seen": 58232744, "step": 100290 }, { "epoch": 14.938188859100388, "grad_norm": 2.0790226459503174, "learning_rate": 9.138346884683066e-06, "loss": 0.6129, "num_input_tokens_seen": 58235496, "step": 100295 }, { "epoch": 14.938933571641346, "grad_norm": 1.2762112617492676, "learning_rate": 9.135835376185737e-06, "loss": 0.5803, "num_input_tokens_seen": 58238152, "step": 100300 }, { "epoch": 14.939678284182305, "grad_norm": 1.3906880617141724, "learning_rate": 9.133324135697351e-06, "loss": 0.6741, "num_input_tokens_seen": 58240872, "step": 100305 }, { "epoch": 14.940422996723266, "grad_norm": 1.1105619668960571, "learning_rate": 9.130813163260321e-06, "loss": 0.5783, "num_input_tokens_seen": 58243624, "step": 100310 }, { "epoch": 14.941167709264224, "grad_norm": 1.313582181930542, "learning_rate": 9.128302458917081e-06, "loss": 0.7543, "num_input_tokens_seen": 58246376, "step": 100315 }, { "epoch": 14.941912421805183, "grad_norm": 1.3617348670959473, "learning_rate": 9.125792022710042e-06, "loss": 0.6872, "num_input_tokens_seen": 58249128, "step": 100320 }, { "epoch": 14.942657134346142, "grad_norm": 1.9583884477615356, "learning_rate": 9.123281854681612e-06, "loss": 0.7429, "num_input_tokens_seen": 58251848, "step": 100325 }, { "epoch": 14.943401846887102, "grad_norm": 2.544395685195923, "learning_rate": 9.120771954874199e-06, "loss": 0.4999, "num_input_tokens_seen": 58254888, "step": 100330 }, { "epoch": 14.944146559428061, "grad_norm": 2.456650972366333, "learning_rate": 9.118262323330196e-06, "loss": 0.6314, "num_input_tokens_seen": 58257864, "step": 100335 }, { "epoch": 14.94489127196902, "grad_norm": 1.8577059507369995, "learning_rate": 9.115752960092017e-06, "loss": 0.8028, "num_input_tokens_seen": 58260808, "step": 100340 }, { "epoch": 14.945635984509979, "grad_norm": 1.3932726383209229, "learning_rate": 9.11324386520204e-06, "loss": 0.6743, "num_input_tokens_seen": 58263432, "step": 100345 }, { "epoch": 14.946380697050937, "grad_norm": 1.6476895809173584, "learning_rate": 9.11073503870267e-06, "loss": 0.7168, "num_input_tokens_seen": 58266280, "step": 100350 }, { "epoch": 14.947125409591898, "grad_norm": 1.5534508228302002, "learning_rate": 9.108226480636276e-06, "loss": 0.6372, "num_input_tokens_seen": 58269256, "step": 100355 }, { "epoch": 14.947870122132857, "grad_norm": 2.2278754711151123, "learning_rate": 9.105718191045248e-06, "loss": 0.6727, "num_input_tokens_seen": 58272072, "step": 100360 }, { "epoch": 14.948614834673815, "grad_norm": 1.2490390539169312, "learning_rate": 9.10321016997196e-06, "loss": 0.5717, "num_input_tokens_seen": 58275144, "step": 100365 }, { "epoch": 14.949359547214776, "grad_norm": 3.341308832168579, "learning_rate": 9.10070241745877e-06, "loss": 0.6145, "num_input_tokens_seen": 58277768, "step": 100370 }, { "epoch": 14.950104259755735, "grad_norm": 2.5068070888519287, "learning_rate": 9.098194933548063e-06, "loss": 0.5379, "num_input_tokens_seen": 58280328, "step": 100375 }, { "epoch": 14.950848972296694, "grad_norm": 1.789624571800232, "learning_rate": 9.09568771828218e-06, "loss": 0.8002, "num_input_tokens_seen": 58283304, "step": 100380 }, { "epoch": 14.951593684837652, "grad_norm": 2.8082642555236816, "learning_rate": 9.0931807717035e-06, "loss": 0.6421, "num_input_tokens_seen": 58286216, "step": 100385 }, { "epoch": 14.952338397378611, "grad_norm": 1.6110663414001465, "learning_rate": 9.090674093854362e-06, "loss": 0.4685, "num_input_tokens_seen": 58289064, "step": 100390 }, { "epoch": 14.953083109919572, "grad_norm": 0.8082566857337952, "learning_rate": 9.088167684777115e-06, "loss": 0.5082, "num_input_tokens_seen": 58292072, "step": 100395 }, { "epoch": 14.95382782246053, "grad_norm": 2.9186434745788574, "learning_rate": 9.085661544514104e-06, "loss": 0.6161, "num_input_tokens_seen": 58294632, "step": 100400 }, { "epoch": 14.954572535001489, "grad_norm": 0.9206920862197876, "learning_rate": 9.083155673107657e-06, "loss": 0.5702, "num_input_tokens_seen": 58297576, "step": 100405 }, { "epoch": 14.955317247542448, "grad_norm": 1.8146899938583374, "learning_rate": 9.080650070600128e-06, "loss": 0.484, "num_input_tokens_seen": 58300392, "step": 100410 }, { "epoch": 14.956061960083408, "grad_norm": 3.047288179397583, "learning_rate": 9.078144737033827e-06, "loss": 0.6471, "num_input_tokens_seen": 58303272, "step": 100415 }, { "epoch": 14.956806672624367, "grad_norm": 1.9438130855560303, "learning_rate": 9.075639672451097e-06, "loss": 0.8008, "num_input_tokens_seen": 58306472, "step": 100420 }, { "epoch": 14.957551385165326, "grad_norm": 2.532165288925171, "learning_rate": 9.073134876894241e-06, "loss": 0.6633, "num_input_tokens_seen": 58309224, "step": 100425 }, { "epoch": 14.958296097706285, "grad_norm": 0.6306619048118591, "learning_rate": 9.070630350405593e-06, "loss": 0.5714, "num_input_tokens_seen": 58311912, "step": 100430 }, { "epoch": 14.959040810247245, "grad_norm": 1.4286448955535889, "learning_rate": 9.068126093027447e-06, "loss": 0.5496, "num_input_tokens_seen": 58315048, "step": 100435 }, { "epoch": 14.959785522788204, "grad_norm": 0.7074868679046631, "learning_rate": 9.065622104802126e-06, "loss": 0.6528, "num_input_tokens_seen": 58318024, "step": 100440 }, { "epoch": 14.960530235329163, "grad_norm": 0.5782075524330139, "learning_rate": 9.063118385771924e-06, "loss": 0.5495, "num_input_tokens_seen": 58320712, "step": 100445 }, { "epoch": 14.961274947870121, "grad_norm": 3.4266793727874756, "learning_rate": 9.060614935979131e-06, "loss": 0.6962, "num_input_tokens_seen": 58323272, "step": 100450 }, { "epoch": 14.962019660411082, "grad_norm": 2.6287379264831543, "learning_rate": 9.058111755466059e-06, "loss": 0.6746, "num_input_tokens_seen": 58326056, "step": 100455 }, { "epoch": 14.96276437295204, "grad_norm": 1.1338084936141968, "learning_rate": 9.055608844274985e-06, "loss": 0.5073, "num_input_tokens_seen": 58328936, "step": 100460 }, { "epoch": 14.963509085493, "grad_norm": 0.9217003583908081, "learning_rate": 9.053106202448194e-06, "loss": 0.7025, "num_input_tokens_seen": 58331944, "step": 100465 }, { "epoch": 14.964253798033958, "grad_norm": 2.235144853591919, "learning_rate": 9.050603830027959e-06, "loss": 0.625, "num_input_tokens_seen": 58334952, "step": 100470 }, { "epoch": 14.964998510574919, "grad_norm": 1.237271785736084, "learning_rate": 9.048101727056568e-06, "loss": 0.5349, "num_input_tokens_seen": 58337864, "step": 100475 }, { "epoch": 14.965743223115878, "grad_norm": 0.9370706081390381, "learning_rate": 9.045599893576287e-06, "loss": 0.5364, "num_input_tokens_seen": 58340744, "step": 100480 }, { "epoch": 14.966487935656836, "grad_norm": 1.142966866493225, "learning_rate": 9.043098329629374e-06, "loss": 0.5585, "num_input_tokens_seen": 58343624, "step": 100485 }, { "epoch": 14.967232648197795, "grad_norm": 1.4424028396606445, "learning_rate": 9.040597035258103e-06, "loss": 0.7068, "num_input_tokens_seen": 58346376, "step": 100490 }, { "epoch": 14.967977360738756, "grad_norm": 1.0743849277496338, "learning_rate": 9.038096010504714e-06, "loss": 0.514, "num_input_tokens_seen": 58349032, "step": 100495 }, { "epoch": 14.968722073279714, "grad_norm": 2.4747321605682373, "learning_rate": 9.035595255411482e-06, "loss": 0.4766, "num_input_tokens_seen": 58351816, "step": 100500 }, { "epoch": 14.969466785820673, "grad_norm": 1.432844877243042, "learning_rate": 9.033094770020634e-06, "loss": 0.4198, "num_input_tokens_seen": 58354792, "step": 100505 }, { "epoch": 14.970211498361632, "grad_norm": 1.05549955368042, "learning_rate": 9.03059455437443e-06, "loss": 0.6248, "num_input_tokens_seen": 58357672, "step": 100510 }, { "epoch": 14.970956210902592, "grad_norm": 1.0996662378311157, "learning_rate": 9.028094608515093e-06, "loss": 0.568, "num_input_tokens_seen": 58360552, "step": 100515 }, { "epoch": 14.971700923443551, "grad_norm": 1.5688674449920654, "learning_rate": 9.02559493248487e-06, "loss": 0.4872, "num_input_tokens_seen": 58363528, "step": 100520 }, { "epoch": 14.97244563598451, "grad_norm": 1.809314250946045, "learning_rate": 9.023095526325987e-06, "loss": 0.635, "num_input_tokens_seen": 58366088, "step": 100525 }, { "epoch": 14.973190348525469, "grad_norm": 1.0302717685699463, "learning_rate": 9.020596390080665e-06, "loss": 0.5863, "num_input_tokens_seen": 58368840, "step": 100530 }, { "epoch": 14.973935061066427, "grad_norm": 1.2581136226654053, "learning_rate": 9.018097523791127e-06, "loss": 0.699, "num_input_tokens_seen": 58371592, "step": 100535 }, { "epoch": 14.974679773607388, "grad_norm": 1.290504813194275, "learning_rate": 9.01559892749958e-06, "loss": 0.659, "num_input_tokens_seen": 58374376, "step": 100540 }, { "epoch": 14.975424486148347, "grad_norm": 1.598351240158081, "learning_rate": 9.013100601248254e-06, "loss": 0.5663, "num_input_tokens_seen": 58377288, "step": 100545 }, { "epoch": 14.976169198689306, "grad_norm": 1.2150321006774902, "learning_rate": 9.010602545079332e-06, "loss": 0.6223, "num_input_tokens_seen": 58380136, "step": 100550 }, { "epoch": 14.976913911230266, "grad_norm": 1.8353289365768433, "learning_rate": 9.00810475903504e-06, "loss": 0.4652, "num_input_tokens_seen": 58383336, "step": 100555 }, { "epoch": 14.977658623771225, "grad_norm": 1.5330244302749634, "learning_rate": 9.005607243157565e-06, "loss": 0.6746, "num_input_tokens_seen": 58386408, "step": 100560 }, { "epoch": 14.978403336312184, "grad_norm": 1.2325598001480103, "learning_rate": 9.003109997489092e-06, "loss": 0.5918, "num_input_tokens_seen": 58389288, "step": 100565 }, { "epoch": 14.979148048853142, "grad_norm": 2.2086610794067383, "learning_rate": 9.000613022071824e-06, "loss": 0.684, "num_input_tokens_seen": 58392072, "step": 100570 }, { "epoch": 14.979892761394101, "grad_norm": 1.915173053741455, "learning_rate": 8.99811631694793e-06, "loss": 0.6508, "num_input_tokens_seen": 58395240, "step": 100575 }, { "epoch": 14.980637473935062, "grad_norm": 1.064842700958252, "learning_rate": 8.995619882159606e-06, "loss": 0.4822, "num_input_tokens_seen": 58398056, "step": 100580 }, { "epoch": 14.98138218647602, "grad_norm": 0.8240399360656738, "learning_rate": 8.993123717749016e-06, "loss": 0.5418, "num_input_tokens_seen": 58400904, "step": 100585 }, { "epoch": 14.98212689901698, "grad_norm": 1.299033761024475, "learning_rate": 8.990627823758327e-06, "loss": 0.5772, "num_input_tokens_seen": 58403880, "step": 100590 }, { "epoch": 14.982871611557938, "grad_norm": 1.342429518699646, "learning_rate": 8.988132200229716e-06, "loss": 0.639, "num_input_tokens_seen": 58406728, "step": 100595 }, { "epoch": 14.983616324098898, "grad_norm": 1.0331023931503296, "learning_rate": 8.985636847205336e-06, "loss": 0.4949, "num_input_tokens_seen": 58409352, "step": 100600 }, { "epoch": 14.984361036639857, "grad_norm": 1.1855108737945557, "learning_rate": 8.983141764727348e-06, "loss": 0.4671, "num_input_tokens_seen": 58412072, "step": 100605 }, { "epoch": 14.985105749180816, "grad_norm": 1.1053074598312378, "learning_rate": 8.980646952837894e-06, "loss": 0.62, "num_input_tokens_seen": 58414536, "step": 100610 }, { "epoch": 14.985850461721775, "grad_norm": 1.8540276288986206, "learning_rate": 8.978152411579133e-06, "loss": 0.7585, "num_input_tokens_seen": 58417576, "step": 100615 }, { "epoch": 14.986595174262735, "grad_norm": 1.5232335329055786, "learning_rate": 8.975658140993196e-06, "loss": 0.8307, "num_input_tokens_seen": 58420744, "step": 100620 }, { "epoch": 14.987339886803694, "grad_norm": 1.4267829656600952, "learning_rate": 8.973164141122237e-06, "loss": 0.6284, "num_input_tokens_seen": 58423560, "step": 100625 }, { "epoch": 14.988084599344653, "grad_norm": 1.8636999130249023, "learning_rate": 8.970670412008372e-06, "loss": 0.626, "num_input_tokens_seen": 58426408, "step": 100630 }, { "epoch": 14.988829311885612, "grad_norm": 1.0656622648239136, "learning_rate": 8.96817695369375e-06, "loss": 0.562, "num_input_tokens_seen": 58429544, "step": 100635 }, { "epoch": 14.989574024426572, "grad_norm": 1.4219658374786377, "learning_rate": 8.965683766220481e-06, "loss": 0.6049, "num_input_tokens_seen": 58432520, "step": 100640 }, { "epoch": 14.99031873696753, "grad_norm": 1.8984628915786743, "learning_rate": 8.963190849630682e-06, "loss": 0.6425, "num_input_tokens_seen": 58435528, "step": 100645 }, { "epoch": 14.99106344950849, "grad_norm": 1.1267709732055664, "learning_rate": 8.96069820396648e-06, "loss": 0.9508, "num_input_tokens_seen": 58438536, "step": 100650 }, { "epoch": 14.991808162049448, "grad_norm": 1.3773013353347778, "learning_rate": 8.958205829269984e-06, "loss": 0.5939, "num_input_tokens_seen": 58441512, "step": 100655 }, { "epoch": 14.992552874590409, "grad_norm": 1.660921335220337, "learning_rate": 8.955713725583295e-06, "loss": 0.5224, "num_input_tokens_seen": 58444360, "step": 100660 }, { "epoch": 14.993297587131368, "grad_norm": 2.682670831680298, "learning_rate": 8.953221892948508e-06, "loss": 0.5929, "num_input_tokens_seen": 58447464, "step": 100665 }, { "epoch": 14.994042299672326, "grad_norm": 1.708526611328125, "learning_rate": 8.950730331407733e-06, "loss": 0.5026, "num_input_tokens_seen": 58450440, "step": 100670 }, { "epoch": 14.994787012213285, "grad_norm": 0.6425150632858276, "learning_rate": 8.94823904100305e-06, "loss": 0.4567, "num_input_tokens_seen": 58453224, "step": 100675 }, { "epoch": 14.995531724754246, "grad_norm": 1.362267255783081, "learning_rate": 8.945748021776564e-06, "loss": 0.5884, "num_input_tokens_seen": 58455976, "step": 100680 }, { "epoch": 14.996276437295204, "grad_norm": 1.5132099390029907, "learning_rate": 8.943257273770351e-06, "loss": 0.6077, "num_input_tokens_seen": 58459016, "step": 100685 }, { "epoch": 14.997021149836163, "grad_norm": 1.3193798065185547, "learning_rate": 8.940766797026476e-06, "loss": 0.6158, "num_input_tokens_seen": 58461800, "step": 100690 }, { "epoch": 14.997765862377122, "grad_norm": 1.1748912334442139, "learning_rate": 8.938276591587031e-06, "loss": 0.4471, "num_input_tokens_seen": 58464648, "step": 100695 }, { "epoch": 14.998510574918082, "grad_norm": 1.4280381202697754, "learning_rate": 8.935786657494072e-06, "loss": 0.7588, "num_input_tokens_seen": 58467688, "step": 100700 }, { "epoch": 14.999255287459041, "grad_norm": 1.6873873472213745, "learning_rate": 8.933296994789678e-06, "loss": 0.5843, "num_input_tokens_seen": 58470376, "step": 100705 }, { "epoch": 15.0, "grad_norm": 1.7302523851394653, "learning_rate": 8.930807603515895e-06, "loss": 0.4801, "num_input_tokens_seen": 58472760, "step": 100710 }, { "epoch": 15.0, "eval_loss": 0.6579505801200867, "eval_runtime": 46.9937, "eval_samples_per_second": 63.498, "eval_steps_per_second": 15.874, "num_input_tokens_seen": 58472760, "step": 100710 }, { "epoch": 15.000744712540959, "grad_norm": 1.8045799732208252, "learning_rate": 8.928318483714793e-06, "loss": 0.577, "num_input_tokens_seen": 58475768, "step": 100715 }, { "epoch": 15.001489425081918, "grad_norm": 1.4863988161087036, "learning_rate": 8.925829635428414e-06, "loss": 0.6706, "num_input_tokens_seen": 58478936, "step": 100720 }, { "epoch": 15.002234137622878, "grad_norm": 1.9308544397354126, "learning_rate": 8.92334105869881e-06, "loss": 0.8023, "num_input_tokens_seen": 58481688, "step": 100725 }, { "epoch": 15.002978850163837, "grad_norm": 2.2505784034729004, "learning_rate": 8.920852753568015e-06, "loss": 0.6908, "num_input_tokens_seen": 58484536, "step": 100730 }, { "epoch": 15.003723562704796, "grad_norm": 1.1851915121078491, "learning_rate": 8.918364720078063e-06, "loss": 0.7319, "num_input_tokens_seen": 58487544, "step": 100735 }, { "epoch": 15.004468275245754, "grad_norm": 1.5974375009536743, "learning_rate": 8.915876958271006e-06, "loss": 0.8455, "num_input_tokens_seen": 58490200, "step": 100740 }, { "epoch": 15.005212987786715, "grad_norm": 2.30151629447937, "learning_rate": 8.913389468188849e-06, "loss": 0.5054, "num_input_tokens_seen": 58493336, "step": 100745 }, { "epoch": 15.005957700327674, "grad_norm": 2.0743696689605713, "learning_rate": 8.910902249873637e-06, "loss": 0.5108, "num_input_tokens_seen": 58496184, "step": 100750 }, { "epoch": 15.006702412868632, "grad_norm": 2.351893901824951, "learning_rate": 8.908415303367371e-06, "loss": 0.5905, "num_input_tokens_seen": 58499128, "step": 100755 }, { "epoch": 15.007447125409591, "grad_norm": 2.358959197998047, "learning_rate": 8.905928628712083e-06, "loss": 0.7363, "num_input_tokens_seen": 58502200, "step": 100760 }, { "epoch": 15.008191837950552, "grad_norm": 3.0082101821899414, "learning_rate": 8.90344222594977e-06, "loss": 0.6195, "num_input_tokens_seen": 58505048, "step": 100765 }, { "epoch": 15.00893655049151, "grad_norm": 1.2389055490493774, "learning_rate": 8.900956095122435e-06, "loss": 0.5143, "num_input_tokens_seen": 58507576, "step": 100770 }, { "epoch": 15.00968126303247, "grad_norm": 1.3228187561035156, "learning_rate": 8.898470236272091e-06, "loss": 0.5801, "num_input_tokens_seen": 58510840, "step": 100775 }, { "epoch": 15.010425975573428, "grad_norm": 1.231790542602539, "learning_rate": 8.895984649440722e-06, "loss": 0.4476, "num_input_tokens_seen": 58513688, "step": 100780 }, { "epoch": 15.011170688114388, "grad_norm": 2.430298328399658, "learning_rate": 8.89349933467033e-06, "loss": 0.7215, "num_input_tokens_seen": 58516632, "step": 100785 }, { "epoch": 15.011915400655347, "grad_norm": 3.1698665618896484, "learning_rate": 8.8910142920029e-06, "loss": 0.9438, "num_input_tokens_seen": 58519352, "step": 100790 }, { "epoch": 15.012660113196306, "grad_norm": 1.2268236875534058, "learning_rate": 8.88852952148041e-06, "loss": 0.4907, "num_input_tokens_seen": 58522040, "step": 100795 }, { "epoch": 15.013404825737265, "grad_norm": 1.106220006942749, "learning_rate": 8.886045023144829e-06, "loss": 0.4987, "num_input_tokens_seen": 58524888, "step": 100800 }, { "epoch": 15.014149538278225, "grad_norm": 1.4835163354873657, "learning_rate": 8.883560797038152e-06, "loss": 0.5213, "num_input_tokens_seen": 58527896, "step": 100805 }, { "epoch": 15.014894250819184, "grad_norm": 0.8186678290367126, "learning_rate": 8.881076843202332e-06, "loss": 0.591, "num_input_tokens_seen": 58530744, "step": 100810 }, { "epoch": 15.015638963360143, "grad_norm": 1.4829384088516235, "learning_rate": 8.878593161679327e-06, "loss": 0.4726, "num_input_tokens_seen": 58533656, "step": 100815 }, { "epoch": 15.016383675901102, "grad_norm": 1.7433408498764038, "learning_rate": 8.876109752511117e-06, "loss": 0.6667, "num_input_tokens_seen": 58536792, "step": 100820 }, { "epoch": 15.017128388442062, "grad_norm": 1.06125807762146, "learning_rate": 8.873626615739632e-06, "loss": 0.5172, "num_input_tokens_seen": 58539640, "step": 100825 }, { "epoch": 15.01787310098302, "grad_norm": 1.2937201261520386, "learning_rate": 8.871143751406849e-06, "loss": 0.5896, "num_input_tokens_seen": 58542328, "step": 100830 }, { "epoch": 15.01861781352398, "grad_norm": 1.3144944906234741, "learning_rate": 8.868661159554689e-06, "loss": 0.528, "num_input_tokens_seen": 58545208, "step": 100835 }, { "epoch": 15.019362526064938, "grad_norm": 1.9876002073287964, "learning_rate": 8.866178840225111e-06, "loss": 0.5736, "num_input_tokens_seen": 58548152, "step": 100840 }, { "epoch": 15.020107238605899, "grad_norm": 1.7410447597503662, "learning_rate": 8.863696793460047e-06, "loss": 0.71, "num_input_tokens_seen": 58551160, "step": 100845 }, { "epoch": 15.020851951146858, "grad_norm": 3.725059747695923, "learning_rate": 8.861215019301414e-06, "loss": 0.6873, "num_input_tokens_seen": 58554040, "step": 100850 }, { "epoch": 15.021596663687816, "grad_norm": 2.3508763313293457, "learning_rate": 8.85873351779116e-06, "loss": 0.588, "num_input_tokens_seen": 58556952, "step": 100855 }, { "epoch": 15.022341376228775, "grad_norm": 1.0547540187835693, "learning_rate": 8.856252288971198e-06, "loss": 0.664, "num_input_tokens_seen": 58559960, "step": 100860 }, { "epoch": 15.023086088769736, "grad_norm": 1.3589766025543213, "learning_rate": 8.853771332883446e-06, "loss": 0.4768, "num_input_tokens_seen": 58563160, "step": 100865 }, { "epoch": 15.023830801310694, "grad_norm": 1.6582450866699219, "learning_rate": 8.851290649569808e-06, "loss": 0.4925, "num_input_tokens_seen": 58566136, "step": 100870 }, { "epoch": 15.024575513851653, "grad_norm": 1.6423641443252563, "learning_rate": 8.848810239072208e-06, "loss": 0.4838, "num_input_tokens_seen": 58569080, "step": 100875 }, { "epoch": 15.025320226392612, "grad_norm": 1.176238775253296, "learning_rate": 8.84633010143254e-06, "loss": 0.6581, "num_input_tokens_seen": 58571736, "step": 100880 }, { "epoch": 15.02606493893357, "grad_norm": 1.006248950958252, "learning_rate": 8.84385023669271e-06, "loss": 0.5319, "num_input_tokens_seen": 58574520, "step": 100885 }, { "epoch": 15.026809651474531, "grad_norm": 1.9143800735473633, "learning_rate": 8.841370644894614e-06, "loss": 0.6078, "num_input_tokens_seen": 58577176, "step": 100890 }, { "epoch": 15.02755436401549, "grad_norm": 2.7492125034332275, "learning_rate": 8.838891326080129e-06, "loss": 0.65, "num_input_tokens_seen": 58580056, "step": 100895 }, { "epoch": 15.028299076556449, "grad_norm": 2.791677474975586, "learning_rate": 8.83641228029116e-06, "loss": 0.6859, "num_input_tokens_seen": 58582776, "step": 100900 }, { "epoch": 15.029043789097408, "grad_norm": 1.2230693101882935, "learning_rate": 8.833933507569564e-06, "loss": 0.7328, "num_input_tokens_seen": 58585816, "step": 100905 }, { "epoch": 15.029788501638368, "grad_norm": 1.542905569076538, "learning_rate": 8.831455007957243e-06, "loss": 0.7717, "num_input_tokens_seen": 58588632, "step": 100910 }, { "epoch": 15.030533214179327, "grad_norm": 1.60318124294281, "learning_rate": 8.828976781496057e-06, "loss": 0.6277, "num_input_tokens_seen": 58591416, "step": 100915 }, { "epoch": 15.031277926720286, "grad_norm": 1.5039974451065063, "learning_rate": 8.826498828227861e-06, "loss": 0.6222, "num_input_tokens_seen": 58594328, "step": 100920 }, { "epoch": 15.032022639261244, "grad_norm": 1.5738413333892822, "learning_rate": 8.824021148194541e-06, "loss": 0.6083, "num_input_tokens_seen": 58597400, "step": 100925 }, { "epoch": 15.032767351802205, "grad_norm": 2.2250940799713135, "learning_rate": 8.82154374143794e-06, "loss": 0.7066, "num_input_tokens_seen": 58600120, "step": 100930 }, { "epoch": 15.033512064343164, "grad_norm": 3.037391424179077, "learning_rate": 8.819066607999918e-06, "loss": 0.5053, "num_input_tokens_seen": 58603032, "step": 100935 }, { "epoch": 15.034256776884122, "grad_norm": 2.1232287883758545, "learning_rate": 8.816589747922311e-06, "loss": 0.5933, "num_input_tokens_seen": 58605880, "step": 100940 }, { "epoch": 15.035001489425081, "grad_norm": 1.8620294332504272, "learning_rate": 8.814113161246979e-06, "loss": 0.7228, "num_input_tokens_seen": 58608664, "step": 100945 }, { "epoch": 15.035746201966042, "grad_norm": 0.44164028763771057, "learning_rate": 8.811636848015747e-06, "loss": 0.5226, "num_input_tokens_seen": 58611288, "step": 100950 }, { "epoch": 15.036490914507, "grad_norm": 0.9116631746292114, "learning_rate": 8.809160808270464e-06, "loss": 0.6851, "num_input_tokens_seen": 58613944, "step": 100955 }, { "epoch": 15.03723562704796, "grad_norm": 1.1031737327575684, "learning_rate": 8.806685042052949e-06, "loss": 0.5415, "num_input_tokens_seen": 58616696, "step": 100960 }, { "epoch": 15.037980339588918, "grad_norm": 1.1042256355285645, "learning_rate": 8.804209549405037e-06, "loss": 0.5621, "num_input_tokens_seen": 58619704, "step": 100965 }, { "epoch": 15.038725052129879, "grad_norm": 2.0278756618499756, "learning_rate": 8.801734330368544e-06, "loss": 0.6171, "num_input_tokens_seen": 58622936, "step": 100970 }, { "epoch": 15.039469764670837, "grad_norm": 1.1284407377243042, "learning_rate": 8.79925938498528e-06, "loss": 0.652, "num_input_tokens_seen": 58625912, "step": 100975 }, { "epoch": 15.040214477211796, "grad_norm": 3.306455135345459, "learning_rate": 8.796784713297072e-06, "loss": 0.5566, "num_input_tokens_seen": 58628888, "step": 100980 }, { "epoch": 15.040959189752755, "grad_norm": 2.9154253005981445, "learning_rate": 8.794310315345713e-06, "loss": 0.576, "num_input_tokens_seen": 58631992, "step": 100985 }, { "epoch": 15.041703902293715, "grad_norm": 1.7544575929641724, "learning_rate": 8.791836191173017e-06, "loss": 0.4481, "num_input_tokens_seen": 58634968, "step": 100990 }, { "epoch": 15.042448614834674, "grad_norm": 1.8330248594284058, "learning_rate": 8.78936234082076e-06, "loss": 0.6874, "num_input_tokens_seen": 58637816, "step": 100995 }, { "epoch": 15.043193327375633, "grad_norm": 0.9452171921730042, "learning_rate": 8.786888764330767e-06, "loss": 0.3895, "num_input_tokens_seen": 58640536, "step": 101000 }, { "epoch": 15.043938039916592, "grad_norm": 1.185137152671814, "learning_rate": 8.784415461744805e-06, "loss": 0.6608, "num_input_tokens_seen": 58643448, "step": 101005 }, { "epoch": 15.044682752457552, "grad_norm": 1.036033272743225, "learning_rate": 8.781942433104654e-06, "loss": 0.4996, "num_input_tokens_seen": 58646808, "step": 101010 }, { "epoch": 15.045427464998511, "grad_norm": 2.407097578048706, "learning_rate": 8.779469678452113e-06, "loss": 0.7369, "num_input_tokens_seen": 58649656, "step": 101015 }, { "epoch": 15.04617217753947, "grad_norm": 1.1047022342681885, "learning_rate": 8.776997197828937e-06, "loss": 0.5201, "num_input_tokens_seen": 58652632, "step": 101020 }, { "epoch": 15.046916890080428, "grad_norm": 2.122716188430786, "learning_rate": 8.774524991276911e-06, "loss": 0.5321, "num_input_tokens_seen": 58655480, "step": 101025 }, { "epoch": 15.047661602621389, "grad_norm": 2.28048038482666, "learning_rate": 8.77205305883779e-06, "loss": 0.6948, "num_input_tokens_seen": 58658520, "step": 101030 }, { "epoch": 15.048406315162348, "grad_norm": 1.0495340824127197, "learning_rate": 8.769581400553346e-06, "loss": 0.5963, "num_input_tokens_seen": 58661432, "step": 101035 }, { "epoch": 15.049151027703306, "grad_norm": 1.305549144744873, "learning_rate": 8.767110016465318e-06, "loss": 0.6869, "num_input_tokens_seen": 58664408, "step": 101040 }, { "epoch": 15.049895740244265, "grad_norm": 1.3983725309371948, "learning_rate": 8.76463890661548e-06, "loss": 0.5744, "num_input_tokens_seen": 58667544, "step": 101045 }, { "epoch": 15.050640452785226, "grad_norm": 1.3592525720596313, "learning_rate": 8.762168071045566e-06, "loss": 0.6421, "num_input_tokens_seen": 58670776, "step": 101050 }, { "epoch": 15.051385165326185, "grad_norm": 0.8648992776870728, "learning_rate": 8.759697509797315e-06, "loss": 0.4085, "num_input_tokens_seen": 58673496, "step": 101055 }, { "epoch": 15.052129877867143, "grad_norm": 1.1693156957626343, "learning_rate": 8.757227222912473e-06, "loss": 0.4611, "num_input_tokens_seen": 58676600, "step": 101060 }, { "epoch": 15.052874590408102, "grad_norm": 1.409225583076477, "learning_rate": 8.754757210432758e-06, "loss": 0.5941, "num_input_tokens_seen": 58679480, "step": 101065 }, { "epoch": 15.05361930294906, "grad_norm": 1.25383722782135, "learning_rate": 8.752287472399918e-06, "loss": 0.5866, "num_input_tokens_seen": 58682648, "step": 101070 }, { "epoch": 15.054364015490021, "grad_norm": 2.4890379905700684, "learning_rate": 8.74981800885566e-06, "loss": 0.6101, "num_input_tokens_seen": 58685336, "step": 101075 }, { "epoch": 15.05510872803098, "grad_norm": 2.1510961055755615, "learning_rate": 8.747348819841719e-06, "loss": 0.6271, "num_input_tokens_seen": 58688280, "step": 101080 }, { "epoch": 15.055853440571939, "grad_norm": 0.8819954991340637, "learning_rate": 8.7448799053998e-06, "loss": 0.6031, "num_input_tokens_seen": 58691032, "step": 101085 }, { "epoch": 15.056598153112898, "grad_norm": 1.4451950788497925, "learning_rate": 8.742411265571607e-06, "loss": 0.7314, "num_input_tokens_seen": 58693720, "step": 101090 }, { "epoch": 15.057342865653858, "grad_norm": 1.368958592414856, "learning_rate": 8.73994290039886e-06, "loss": 0.4847, "num_input_tokens_seen": 58696440, "step": 101095 }, { "epoch": 15.058087578194817, "grad_norm": 1.5065152645111084, "learning_rate": 8.737474809923244e-06, "loss": 0.4642, "num_input_tokens_seen": 58699480, "step": 101100 }, { "epoch": 15.058832290735776, "grad_norm": 2.422325849533081, "learning_rate": 8.73500699418647e-06, "loss": 0.7225, "num_input_tokens_seen": 58702488, "step": 101105 }, { "epoch": 15.059577003276734, "grad_norm": 0.9939960837364197, "learning_rate": 8.732539453230215e-06, "loss": 0.5775, "num_input_tokens_seen": 58705368, "step": 101110 }, { "epoch": 15.060321715817695, "grad_norm": 1.202854871749878, "learning_rate": 8.730072187096178e-06, "loss": 0.5518, "num_input_tokens_seen": 58708088, "step": 101115 }, { "epoch": 15.061066428358654, "grad_norm": 1.3261443376541138, "learning_rate": 8.727605195826038e-06, "loss": 0.632, "num_input_tokens_seen": 58711128, "step": 101120 }, { "epoch": 15.061811140899612, "grad_norm": 1.3138315677642822, "learning_rate": 8.72513847946147e-06, "loss": 0.8359, "num_input_tokens_seen": 58714008, "step": 101125 }, { "epoch": 15.062555853440571, "grad_norm": 1.0677440166473389, "learning_rate": 8.722672038044145e-06, "loss": 0.5257, "num_input_tokens_seen": 58716984, "step": 101130 }, { "epoch": 15.063300565981532, "grad_norm": 3.858715772628784, "learning_rate": 8.720205871615722e-06, "loss": 0.7781, "num_input_tokens_seen": 58719832, "step": 101135 }, { "epoch": 15.06404527852249, "grad_norm": 1.0951042175292969, "learning_rate": 8.717739980217887e-06, "loss": 0.6352, "num_input_tokens_seen": 58722712, "step": 101140 }, { "epoch": 15.06478999106345, "grad_norm": 2.9931743144989014, "learning_rate": 8.715274363892276e-06, "loss": 0.6748, "num_input_tokens_seen": 58725816, "step": 101145 }, { "epoch": 15.065534703604408, "grad_norm": 1.0424091815948486, "learning_rate": 8.712809022680563e-06, "loss": 0.5432, "num_input_tokens_seen": 58728568, "step": 101150 }, { "epoch": 15.066279416145369, "grad_norm": 1.041054368019104, "learning_rate": 8.710343956624379e-06, "loss": 0.5283, "num_input_tokens_seen": 58731384, "step": 101155 }, { "epoch": 15.067024128686327, "grad_norm": 1.0439443588256836, "learning_rate": 8.707879165765384e-06, "loss": 0.4259, "num_input_tokens_seen": 58734392, "step": 101160 }, { "epoch": 15.067768841227286, "grad_norm": 1.8202980756759644, "learning_rate": 8.705414650145215e-06, "loss": 0.4022, "num_input_tokens_seen": 58737144, "step": 101165 }, { "epoch": 15.068513553768245, "grad_norm": 1.2696303129196167, "learning_rate": 8.702950409805493e-06, "loss": 0.6235, "num_input_tokens_seen": 58739992, "step": 101170 }, { "epoch": 15.069258266309205, "grad_norm": 3.0193374156951904, "learning_rate": 8.700486444787872e-06, "loss": 0.7141, "num_input_tokens_seen": 58742840, "step": 101175 }, { "epoch": 15.070002978850164, "grad_norm": 0.9530521035194397, "learning_rate": 8.698022755133957e-06, "loss": 0.576, "num_input_tokens_seen": 58747128, "step": 101180 }, { "epoch": 15.070747691391123, "grad_norm": 1.7005069255828857, "learning_rate": 8.695559340885387e-06, "loss": 0.5207, "num_input_tokens_seen": 58750200, "step": 101185 }, { "epoch": 15.071492403932082, "grad_norm": 2.8774197101593018, "learning_rate": 8.693096202083773e-06, "loss": 0.5862, "num_input_tokens_seen": 58753304, "step": 101190 }, { "epoch": 15.072237116473042, "grad_norm": 1.082772970199585, "learning_rate": 8.69063333877072e-06, "loss": 0.4431, "num_input_tokens_seen": 58756344, "step": 101195 }, { "epoch": 15.072981829014001, "grad_norm": 1.2539057731628418, "learning_rate": 8.688170750987836e-06, "loss": 0.5762, "num_input_tokens_seen": 58759032, "step": 101200 }, { "epoch": 15.07372654155496, "grad_norm": 1.4285902976989746, "learning_rate": 8.685708438776739e-06, "loss": 0.5895, "num_input_tokens_seen": 58762072, "step": 101205 }, { "epoch": 15.074471254095918, "grad_norm": 1.8421388864517212, "learning_rate": 8.683246402179013e-06, "loss": 0.6044, "num_input_tokens_seen": 58764472, "step": 101210 }, { "epoch": 15.075215966636879, "grad_norm": 1.0173383951187134, "learning_rate": 8.680784641236248e-06, "loss": 0.6481, "num_input_tokens_seen": 58767128, "step": 101215 }, { "epoch": 15.075960679177838, "grad_norm": 2.3354392051696777, "learning_rate": 8.678323155990047e-06, "loss": 0.7194, "num_input_tokens_seen": 58770040, "step": 101220 }, { "epoch": 15.076705391718797, "grad_norm": 1.4606832265853882, "learning_rate": 8.67586194648198e-06, "loss": 0.5032, "num_input_tokens_seen": 58772600, "step": 101225 }, { "epoch": 15.077450104259755, "grad_norm": 1.6003563404083252, "learning_rate": 8.673401012753646e-06, "loss": 0.584, "num_input_tokens_seen": 58775608, "step": 101230 }, { "epoch": 15.078194816800714, "grad_norm": 1.8103033304214478, "learning_rate": 8.670940354846596e-06, "loss": 0.435, "num_input_tokens_seen": 58778456, "step": 101235 }, { "epoch": 15.078939529341675, "grad_norm": 1.596954584121704, "learning_rate": 8.668479972802423e-06, "loss": 0.5566, "num_input_tokens_seen": 58781080, "step": 101240 }, { "epoch": 15.079684241882633, "grad_norm": 2.7882015705108643, "learning_rate": 8.666019866662683e-06, "loss": 0.5863, "num_input_tokens_seen": 58784120, "step": 101245 }, { "epoch": 15.080428954423592, "grad_norm": 1.4227083921432495, "learning_rate": 8.663560036468926e-06, "loss": 0.5122, "num_input_tokens_seen": 58787032, "step": 101250 }, { "epoch": 15.08117366696455, "grad_norm": 1.1493602991104126, "learning_rate": 8.661100482262729e-06, "loss": 0.606, "num_input_tokens_seen": 58789880, "step": 101255 }, { "epoch": 15.081918379505511, "grad_norm": 1.3226158618927002, "learning_rate": 8.658641204085632e-06, "loss": 0.6491, "num_input_tokens_seen": 58793016, "step": 101260 }, { "epoch": 15.08266309204647, "grad_norm": 1.414312481880188, "learning_rate": 8.656182201979181e-06, "loss": 0.6026, "num_input_tokens_seen": 58796024, "step": 101265 }, { "epoch": 15.083407804587429, "grad_norm": 1.3810391426086426, "learning_rate": 8.653723475984916e-06, "loss": 0.564, "num_input_tokens_seen": 58798840, "step": 101270 }, { "epoch": 15.084152517128388, "grad_norm": 1.5459797382354736, "learning_rate": 8.651265026144387e-06, "loss": 0.5722, "num_input_tokens_seen": 58801592, "step": 101275 }, { "epoch": 15.084897229669348, "grad_norm": 1.4450472593307495, "learning_rate": 8.648806852499109e-06, "loss": 0.5785, "num_input_tokens_seen": 58804440, "step": 101280 }, { "epoch": 15.085641942210307, "grad_norm": 1.0201057195663452, "learning_rate": 8.64634895509063e-06, "loss": 0.6778, "num_input_tokens_seen": 58807544, "step": 101285 }, { "epoch": 15.086386654751266, "grad_norm": 3.635856866836548, "learning_rate": 8.643891333960464e-06, "loss": 0.4624, "num_input_tokens_seen": 58810424, "step": 101290 }, { "epoch": 15.087131367292224, "grad_norm": 1.5032570362091064, "learning_rate": 8.641433989150123e-06, "loss": 0.6214, "num_input_tokens_seen": 58813496, "step": 101295 }, { "epoch": 15.087876079833185, "grad_norm": 2.182159185409546, "learning_rate": 8.638976920701137e-06, "loss": 0.5465, "num_input_tokens_seen": 58816536, "step": 101300 }, { "epoch": 15.088620792374144, "grad_norm": 0.9771220684051514, "learning_rate": 8.636520128654995e-06, "loss": 0.4977, "num_input_tokens_seen": 58819672, "step": 101305 }, { "epoch": 15.089365504915103, "grad_norm": 2.845468282699585, "learning_rate": 8.634063613053228e-06, "loss": 0.6233, "num_input_tokens_seen": 58822904, "step": 101310 }, { "epoch": 15.090110217456061, "grad_norm": 1.89010751247406, "learning_rate": 8.631607373937319e-06, "loss": 0.6198, "num_input_tokens_seen": 58825688, "step": 101315 }, { "epoch": 15.090854929997022, "grad_norm": 3.1263535022735596, "learning_rate": 8.62915141134877e-06, "loss": 0.5767, "num_input_tokens_seen": 58828568, "step": 101320 }, { "epoch": 15.09159964253798, "grad_norm": 1.0112597942352295, "learning_rate": 8.626695725329059e-06, "loss": 0.6335, "num_input_tokens_seen": 58831448, "step": 101325 }, { "epoch": 15.09234435507894, "grad_norm": 1.1290591955184937, "learning_rate": 8.624240315919693e-06, "loss": 0.5664, "num_input_tokens_seen": 58834136, "step": 101330 }, { "epoch": 15.093089067619898, "grad_norm": 1.740691065788269, "learning_rate": 8.62178518316214e-06, "loss": 0.5269, "num_input_tokens_seen": 58837336, "step": 101335 }, { "epoch": 15.093833780160859, "grad_norm": 1.1111148595809937, "learning_rate": 8.619330327097874e-06, "loss": 0.5838, "num_input_tokens_seen": 58840376, "step": 101340 }, { "epoch": 15.094578492701817, "grad_norm": 1.4457241296768188, "learning_rate": 8.616875747768382e-06, "loss": 0.587, "num_input_tokens_seen": 58843608, "step": 101345 }, { "epoch": 15.095323205242776, "grad_norm": 1.3719538450241089, "learning_rate": 8.614421445215116e-06, "loss": 0.5736, "num_input_tokens_seen": 58846488, "step": 101350 }, { "epoch": 15.096067917783735, "grad_norm": 1.7568092346191406, "learning_rate": 8.611967419479553e-06, "loss": 0.5405, "num_input_tokens_seen": 58849624, "step": 101355 }, { "epoch": 15.096812630324695, "grad_norm": 1.815545916557312, "learning_rate": 8.609513670603137e-06, "loss": 0.5554, "num_input_tokens_seen": 58852696, "step": 101360 }, { "epoch": 15.097557342865654, "grad_norm": 2.7591664791107178, "learning_rate": 8.607060198627337e-06, "loss": 0.7921, "num_input_tokens_seen": 58855288, "step": 101365 }, { "epoch": 15.098302055406613, "grad_norm": 2.7778162956237793, "learning_rate": 8.604607003593593e-06, "loss": 0.4672, "num_input_tokens_seen": 58858008, "step": 101370 }, { "epoch": 15.099046767947572, "grad_norm": 1.7352569103240967, "learning_rate": 8.602154085543341e-06, "loss": 0.633, "num_input_tokens_seen": 58860696, "step": 101375 }, { "epoch": 15.099791480488532, "grad_norm": 1.527187466621399, "learning_rate": 8.59970144451804e-06, "loss": 0.4503, "num_input_tokens_seen": 58863448, "step": 101380 }, { "epoch": 15.100536193029491, "grad_norm": 1.5248677730560303, "learning_rate": 8.597249080559114e-06, "loss": 0.4581, "num_input_tokens_seen": 58866104, "step": 101385 }, { "epoch": 15.10128090557045, "grad_norm": 1.640901803970337, "learning_rate": 8.594796993707993e-06, "loss": 0.4724, "num_input_tokens_seen": 58868952, "step": 101390 }, { "epoch": 15.102025618111409, "grad_norm": 2.07893443107605, "learning_rate": 8.592345184006096e-06, "loss": 0.5773, "num_input_tokens_seen": 58871800, "step": 101395 }, { "epoch": 15.102770330652369, "grad_norm": 1.1805120706558228, "learning_rate": 8.58989365149486e-06, "loss": 0.4291, "num_input_tokens_seen": 58874488, "step": 101400 }, { "epoch": 15.103515043193328, "grad_norm": 1.5555140972137451, "learning_rate": 8.58744239621568e-06, "loss": 0.7086, "num_input_tokens_seen": 58877528, "step": 101405 }, { "epoch": 15.104259755734287, "grad_norm": 1.3272678852081299, "learning_rate": 8.584991418209992e-06, "loss": 0.4712, "num_input_tokens_seen": 58880632, "step": 101410 }, { "epoch": 15.105004468275245, "grad_norm": 2.0446040630340576, "learning_rate": 8.582540717519191e-06, "loss": 0.6262, "num_input_tokens_seen": 58883480, "step": 101415 }, { "epoch": 15.105749180816204, "grad_norm": 1.3076859712600708, "learning_rate": 8.580090294184667e-06, "loss": 0.4293, "num_input_tokens_seen": 58886392, "step": 101420 }, { "epoch": 15.106493893357165, "grad_norm": 1.5067812204360962, "learning_rate": 8.57764014824784e-06, "loss": 0.4768, "num_input_tokens_seen": 58889272, "step": 101425 }, { "epoch": 15.107238605898123, "grad_norm": 1.4178309440612793, "learning_rate": 8.575190279750085e-06, "loss": 0.4345, "num_input_tokens_seen": 58892216, "step": 101430 }, { "epoch": 15.107983318439082, "grad_norm": 1.5572770833969116, "learning_rate": 8.5727406887328e-06, "loss": 0.4893, "num_input_tokens_seen": 58895096, "step": 101435 }, { "epoch": 15.108728030980041, "grad_norm": 0.8659759163856506, "learning_rate": 8.570291375237361e-06, "loss": 0.6169, "num_input_tokens_seen": 58897880, "step": 101440 }, { "epoch": 15.109472743521001, "grad_norm": 1.600771427154541, "learning_rate": 8.567842339305157e-06, "loss": 0.559, "num_input_tokens_seen": 58900440, "step": 101445 }, { "epoch": 15.11021745606196, "grad_norm": 1.8023566007614136, "learning_rate": 8.565393580977558e-06, "loss": 0.6519, "num_input_tokens_seen": 58903352, "step": 101450 }, { "epoch": 15.110962168602919, "grad_norm": 1.368633508682251, "learning_rate": 8.562945100295927e-06, "loss": 0.5893, "num_input_tokens_seen": 58906360, "step": 101455 }, { "epoch": 15.111706881143878, "grad_norm": 1.1139949560165405, "learning_rate": 8.560496897301637e-06, "loss": 0.5249, "num_input_tokens_seen": 58909272, "step": 101460 }, { "epoch": 15.112451593684838, "grad_norm": 3.30171537399292, "learning_rate": 8.558048972036031e-06, "loss": 0.4724, "num_input_tokens_seen": 58911896, "step": 101465 }, { "epoch": 15.113196306225797, "grad_norm": 1.5111274719238281, "learning_rate": 8.555601324540488e-06, "loss": 0.5111, "num_input_tokens_seen": 58914680, "step": 101470 }, { "epoch": 15.113941018766756, "grad_norm": 1.733544111251831, "learning_rate": 8.553153954856338e-06, "loss": 0.7472, "num_input_tokens_seen": 58917688, "step": 101475 }, { "epoch": 15.114685731307715, "grad_norm": 2.620738983154297, "learning_rate": 8.550706863024945e-06, "loss": 0.6814, "num_input_tokens_seen": 58920792, "step": 101480 }, { "epoch": 15.115430443848675, "grad_norm": 1.53373384475708, "learning_rate": 8.548260049087634e-06, "loss": 0.5968, "num_input_tokens_seen": 58923576, "step": 101485 }, { "epoch": 15.116175156389634, "grad_norm": 1.7969688177108765, "learning_rate": 8.545813513085757e-06, "loss": 0.5995, "num_input_tokens_seen": 58926584, "step": 101490 }, { "epoch": 15.116919868930593, "grad_norm": 2.4887311458587646, "learning_rate": 8.543367255060636e-06, "loss": 0.5836, "num_input_tokens_seen": 58929432, "step": 101495 }, { "epoch": 15.117664581471551, "grad_norm": 1.4123132228851318, "learning_rate": 8.54092127505359e-06, "loss": 0.6039, "num_input_tokens_seen": 58932088, "step": 101500 }, { "epoch": 15.118409294012512, "grad_norm": 1.3300102949142456, "learning_rate": 8.538475573105961e-06, "loss": 0.5109, "num_input_tokens_seen": 58935224, "step": 101505 }, { "epoch": 15.11915400655347, "grad_norm": 1.5329645872116089, "learning_rate": 8.536030149259046e-06, "loss": 0.4939, "num_input_tokens_seen": 58938296, "step": 101510 }, { "epoch": 15.11989871909443, "grad_norm": 2.581298589706421, "learning_rate": 8.533585003554179e-06, "loss": 0.5306, "num_input_tokens_seen": 58941336, "step": 101515 }, { "epoch": 15.120643431635388, "grad_norm": 2.6432852745056152, "learning_rate": 8.53114013603266e-06, "loss": 0.6822, "num_input_tokens_seen": 58944216, "step": 101520 }, { "epoch": 15.121388144176349, "grad_norm": 1.2590099573135376, "learning_rate": 8.528695546735784e-06, "loss": 0.5264, "num_input_tokens_seen": 58947000, "step": 101525 }, { "epoch": 15.122132856717307, "grad_norm": 1.7657710313796997, "learning_rate": 8.52625123570486e-06, "loss": 0.6898, "num_input_tokens_seen": 58949496, "step": 101530 }, { "epoch": 15.122877569258266, "grad_norm": 3.150141477584839, "learning_rate": 8.523807202981168e-06, "loss": 0.5388, "num_input_tokens_seen": 58952376, "step": 101535 }, { "epoch": 15.123622281799225, "grad_norm": 1.5632413625717163, "learning_rate": 8.521363448606018e-06, "loss": 0.7434, "num_input_tokens_seen": 58955480, "step": 101540 }, { "epoch": 15.124366994340185, "grad_norm": 2.6657187938690186, "learning_rate": 8.518919972620675e-06, "loss": 0.9613, "num_input_tokens_seen": 58958552, "step": 101545 }, { "epoch": 15.125111706881144, "grad_norm": 1.364551067352295, "learning_rate": 8.516476775066438e-06, "loss": 0.5519, "num_input_tokens_seen": 58961752, "step": 101550 }, { "epoch": 15.125856419422103, "grad_norm": 1.0520962476730347, "learning_rate": 8.514033855984563e-06, "loss": 0.4577, "num_input_tokens_seen": 58964632, "step": 101555 }, { "epoch": 15.126601131963062, "grad_norm": 1.3672410249710083, "learning_rate": 8.51159121541634e-06, "loss": 0.7295, "num_input_tokens_seen": 58967832, "step": 101560 }, { "epoch": 15.127345844504022, "grad_norm": 1.4984204769134521, "learning_rate": 8.509148853403015e-06, "loss": 0.6017, "num_input_tokens_seen": 58970776, "step": 101565 }, { "epoch": 15.128090557044981, "grad_norm": 1.3585963249206543, "learning_rate": 8.50670676998587e-06, "loss": 0.602, "num_input_tokens_seen": 58974008, "step": 101570 }, { "epoch": 15.12883526958594, "grad_norm": 3.378155469894409, "learning_rate": 8.504264965206148e-06, "loss": 0.6298, "num_input_tokens_seen": 58976632, "step": 101575 }, { "epoch": 15.129579982126899, "grad_norm": 1.6360682249069214, "learning_rate": 8.5018234391051e-06, "loss": 0.5123, "num_input_tokens_seen": 58979352, "step": 101580 }, { "epoch": 15.130324694667857, "grad_norm": 2.2729074954986572, "learning_rate": 8.499382191723981e-06, "loss": 0.619, "num_input_tokens_seen": 58982264, "step": 101585 }, { "epoch": 15.131069407208818, "grad_norm": 2.3383877277374268, "learning_rate": 8.496941223104032e-06, "loss": 0.6299, "num_input_tokens_seen": 58985112, "step": 101590 }, { "epoch": 15.131814119749777, "grad_norm": 2.907953977584839, "learning_rate": 8.494500533286487e-06, "loss": 0.6806, "num_input_tokens_seen": 58988024, "step": 101595 }, { "epoch": 15.132558832290735, "grad_norm": 1.6298357248306274, "learning_rate": 8.492060122312572e-06, "loss": 0.5295, "num_input_tokens_seen": 58991192, "step": 101600 }, { "epoch": 15.133303544831694, "grad_norm": 2.132080078125, "learning_rate": 8.489619990223533e-06, "loss": 0.718, "num_input_tokens_seen": 58994040, "step": 101605 }, { "epoch": 15.134048257372655, "grad_norm": 2.495140314102173, "learning_rate": 8.487180137060582e-06, "loss": 0.5353, "num_input_tokens_seen": 58997176, "step": 101610 }, { "epoch": 15.134792969913613, "grad_norm": 2.219491958618164, "learning_rate": 8.484740562864931e-06, "loss": 0.6888, "num_input_tokens_seen": 58999736, "step": 101615 }, { "epoch": 15.135537682454572, "grad_norm": 1.4187037944793701, "learning_rate": 8.482301267677813e-06, "loss": 0.6563, "num_input_tokens_seen": 59002776, "step": 101620 }, { "epoch": 15.136282394995531, "grad_norm": 2.3723502159118652, "learning_rate": 8.47986225154042e-06, "loss": 0.5472, "num_input_tokens_seen": 59005656, "step": 101625 }, { "epoch": 15.137027107536491, "grad_norm": 1.5387332439422607, "learning_rate": 8.477423514493967e-06, "loss": 0.6585, "num_input_tokens_seen": 59008728, "step": 101630 }, { "epoch": 15.13777182007745, "grad_norm": 1.5020583868026733, "learning_rate": 8.474985056579648e-06, "loss": 0.5696, "num_input_tokens_seen": 59011704, "step": 101635 }, { "epoch": 15.138516532618409, "grad_norm": 1.6092842817306519, "learning_rate": 8.47254687783867e-06, "loss": 0.6972, "num_input_tokens_seen": 59014616, "step": 101640 }, { "epoch": 15.139261245159368, "grad_norm": 2.281158447265625, "learning_rate": 8.470108978312211e-06, "loss": 0.5569, "num_input_tokens_seen": 59017368, "step": 101645 }, { "epoch": 15.140005957700328, "grad_norm": 1.941763162612915, "learning_rate": 8.46767135804146e-06, "loss": 0.8925, "num_input_tokens_seen": 59020120, "step": 101650 }, { "epoch": 15.140750670241287, "grad_norm": 1.2782349586486816, "learning_rate": 8.465234017067595e-06, "loss": 0.6966, "num_input_tokens_seen": 59023064, "step": 101655 }, { "epoch": 15.141495382782246, "grad_norm": 2.6173555850982666, "learning_rate": 8.462796955431801e-06, "loss": 0.5718, "num_input_tokens_seen": 59025944, "step": 101660 }, { "epoch": 15.142240095323205, "grad_norm": 1.136749029159546, "learning_rate": 8.460360173175244e-06, "loss": 0.6043, "num_input_tokens_seen": 59028696, "step": 101665 }, { "epoch": 15.142984807864165, "grad_norm": 1.507158637046814, "learning_rate": 8.457923670339085e-06, "loss": 0.4705, "num_input_tokens_seen": 59031704, "step": 101670 }, { "epoch": 15.143729520405124, "grad_norm": 1.4391307830810547, "learning_rate": 8.455487446964502e-06, "loss": 0.7386, "num_input_tokens_seen": 59034520, "step": 101675 }, { "epoch": 15.144474232946083, "grad_norm": 1.780329942703247, "learning_rate": 8.453051503092632e-06, "loss": 0.6598, "num_input_tokens_seen": 59037368, "step": 101680 }, { "epoch": 15.145218945487041, "grad_norm": 1.4713430404663086, "learning_rate": 8.450615838764653e-06, "loss": 0.5733, "num_input_tokens_seen": 59039960, "step": 101685 }, { "epoch": 15.145963658028002, "grad_norm": 2.6615397930145264, "learning_rate": 8.448180454021695e-06, "loss": 0.7144, "num_input_tokens_seen": 59042808, "step": 101690 }, { "epoch": 15.14670837056896, "grad_norm": 2.2829513549804688, "learning_rate": 8.445745348904898e-06, "loss": 0.6244, "num_input_tokens_seen": 59045592, "step": 101695 }, { "epoch": 15.14745308310992, "grad_norm": 1.0699857473373413, "learning_rate": 8.443310523455416e-06, "loss": 0.4175, "num_input_tokens_seen": 59048216, "step": 101700 }, { "epoch": 15.148197795650878, "grad_norm": 1.5547335147857666, "learning_rate": 8.440875977714368e-06, "loss": 0.5702, "num_input_tokens_seen": 59050936, "step": 101705 }, { "epoch": 15.148942508191839, "grad_norm": 1.5035550594329834, "learning_rate": 8.4384417117229e-06, "loss": 0.588, "num_input_tokens_seen": 59053784, "step": 101710 }, { "epoch": 15.149687220732797, "grad_norm": 1.5151076316833496, "learning_rate": 8.436007725522127e-06, "loss": 0.4582, "num_input_tokens_seen": 59056568, "step": 101715 }, { "epoch": 15.150431933273756, "grad_norm": 1.844602346420288, "learning_rate": 8.433574019153167e-06, "loss": 0.6903, "num_input_tokens_seen": 59059576, "step": 101720 }, { "epoch": 15.151176645814715, "grad_norm": 1.4775646924972534, "learning_rate": 8.43114059265713e-06, "loss": 0.5736, "num_input_tokens_seen": 59062296, "step": 101725 }, { "epoch": 15.151921358355676, "grad_norm": 1.3877149820327759, "learning_rate": 8.428707446075138e-06, "loss": 0.582, "num_input_tokens_seen": 59065304, "step": 101730 }, { "epoch": 15.152666070896634, "grad_norm": 1.3182024955749512, "learning_rate": 8.426274579448293e-06, "loss": 0.5846, "num_input_tokens_seen": 59068184, "step": 101735 }, { "epoch": 15.153410783437593, "grad_norm": 3.245844841003418, "learning_rate": 8.423841992817688e-06, "loss": 0.8117, "num_input_tokens_seen": 59070840, "step": 101740 }, { "epoch": 15.154155495978552, "grad_norm": 2.2303431034088135, "learning_rate": 8.42140968622443e-06, "loss": 0.5554, "num_input_tokens_seen": 59073624, "step": 101745 }, { "epoch": 15.15490020851951, "grad_norm": 1.1091527938842773, "learning_rate": 8.4189776597096e-06, "loss": 0.6117, "num_input_tokens_seen": 59076664, "step": 101750 }, { "epoch": 15.155644921060471, "grad_norm": 2.431534767150879, "learning_rate": 8.416545913314296e-06, "loss": 0.5089, "num_input_tokens_seen": 59079416, "step": 101755 }, { "epoch": 15.15638963360143, "grad_norm": 1.5371257066726685, "learning_rate": 8.414114447079588e-06, "loss": 0.6167, "num_input_tokens_seen": 59082296, "step": 101760 }, { "epoch": 15.157134346142389, "grad_norm": 1.3336308002471924, "learning_rate": 8.411683261046569e-06, "loss": 0.5868, "num_input_tokens_seen": 59085272, "step": 101765 }, { "epoch": 15.157879058683347, "grad_norm": 1.5844131708145142, "learning_rate": 8.409252355256297e-06, "loss": 0.632, "num_input_tokens_seen": 59088216, "step": 101770 }, { "epoch": 15.158623771224308, "grad_norm": 3.349597454071045, "learning_rate": 8.40682172974984e-06, "loss": 0.5132, "num_input_tokens_seen": 59091000, "step": 101775 }, { "epoch": 15.159368483765267, "grad_norm": 1.6135681867599487, "learning_rate": 8.404391384568271e-06, "loss": 0.7115, "num_input_tokens_seen": 59093880, "step": 101780 }, { "epoch": 15.160113196306225, "grad_norm": 2.031370162963867, "learning_rate": 8.401961319752646e-06, "loss": 0.6287, "num_input_tokens_seen": 59096984, "step": 101785 }, { "epoch": 15.160857908847184, "grad_norm": 2.475111961364746, "learning_rate": 8.399531535344013e-06, "loss": 0.6172, "num_input_tokens_seen": 59100184, "step": 101790 }, { "epoch": 15.161602621388145, "grad_norm": 1.3098222017288208, "learning_rate": 8.397102031383414e-06, "loss": 0.8176, "num_input_tokens_seen": 59103224, "step": 101795 }, { "epoch": 15.162347333929103, "grad_norm": 1.0207290649414062, "learning_rate": 8.39467280791191e-06, "loss": 0.478, "num_input_tokens_seen": 59106424, "step": 101800 }, { "epoch": 15.163092046470062, "grad_norm": 1.6301286220550537, "learning_rate": 8.392243864970525e-06, "loss": 0.7439, "num_input_tokens_seen": 59109080, "step": 101805 }, { "epoch": 15.163836759011021, "grad_norm": 1.2105361223220825, "learning_rate": 8.389815202600306e-06, "loss": 0.4687, "num_input_tokens_seen": 59111672, "step": 101810 }, { "epoch": 15.164581471551982, "grad_norm": 1.1729815006256104, "learning_rate": 8.38738682084228e-06, "loss": 0.4766, "num_input_tokens_seen": 59114456, "step": 101815 }, { "epoch": 15.16532618409294, "grad_norm": 0.8721812963485718, "learning_rate": 8.38495871973746e-06, "loss": 0.7134, "num_input_tokens_seen": 59117432, "step": 101820 }, { "epoch": 15.166070896633899, "grad_norm": 1.2015612125396729, "learning_rate": 8.382530899326885e-06, "loss": 0.6034, "num_input_tokens_seen": 59120184, "step": 101825 }, { "epoch": 15.166815609174858, "grad_norm": 1.973867654800415, "learning_rate": 8.380103359651553e-06, "loss": 0.7304, "num_input_tokens_seen": 59123256, "step": 101830 }, { "epoch": 15.167560321715818, "grad_norm": 2.269467830657959, "learning_rate": 8.377676100752491e-06, "loss": 0.5212, "num_input_tokens_seen": 59126008, "step": 101835 }, { "epoch": 15.168305034256777, "grad_norm": 1.0243945121765137, "learning_rate": 8.375249122670686e-06, "loss": 0.4784, "num_input_tokens_seen": 59129368, "step": 101840 }, { "epoch": 15.169049746797736, "grad_norm": 1.7543308734893799, "learning_rate": 8.372822425447164e-06, "loss": 0.547, "num_input_tokens_seen": 59132056, "step": 101845 }, { "epoch": 15.169794459338695, "grad_norm": 4.431387901306152, "learning_rate": 8.370396009122902e-06, "loss": 0.753, "num_input_tokens_seen": 59134872, "step": 101850 }, { "epoch": 15.170539171879655, "grad_norm": 1.8874820470809937, "learning_rate": 8.3679698737389e-06, "loss": 0.5476, "num_input_tokens_seen": 59137752, "step": 101855 }, { "epoch": 15.171283884420614, "grad_norm": 1.911959171295166, "learning_rate": 8.365544019336146e-06, "loss": 0.5692, "num_input_tokens_seen": 59140696, "step": 101860 }, { "epoch": 15.172028596961573, "grad_norm": 1.2387796640396118, "learning_rate": 8.363118445955609e-06, "loss": 0.5397, "num_input_tokens_seen": 59144312, "step": 101865 }, { "epoch": 15.172773309502531, "grad_norm": 1.456356406211853, "learning_rate": 8.360693153638285e-06, "loss": 0.6381, "num_input_tokens_seen": 59147512, "step": 101870 }, { "epoch": 15.173518022043492, "grad_norm": 1.622832179069519, "learning_rate": 8.35826814242513e-06, "loss": 0.6555, "num_input_tokens_seen": 59150360, "step": 101875 }, { "epoch": 15.17426273458445, "grad_norm": 1.28842031955719, "learning_rate": 8.355843412357131e-06, "loss": 0.6684, "num_input_tokens_seen": 59153208, "step": 101880 }, { "epoch": 15.17500744712541, "grad_norm": 1.6050891876220703, "learning_rate": 8.353418963475232e-06, "loss": 0.5016, "num_input_tokens_seen": 59156088, "step": 101885 }, { "epoch": 15.175752159666368, "grad_norm": 1.3186167478561401, "learning_rate": 8.350994795820407e-06, "loss": 0.5477, "num_input_tokens_seen": 59159000, "step": 101890 }, { "epoch": 15.176496872207329, "grad_norm": 2.206974983215332, "learning_rate": 8.348570909433607e-06, "loss": 0.6611, "num_input_tokens_seen": 59162072, "step": 101895 }, { "epoch": 15.177241584748288, "grad_norm": 1.4123408794403076, "learning_rate": 8.346147304355767e-06, "loss": 0.5322, "num_input_tokens_seen": 59164792, "step": 101900 }, { "epoch": 15.177986297289246, "grad_norm": 2.672029972076416, "learning_rate": 8.343723980627848e-06, "loss": 0.6692, "num_input_tokens_seen": 59167640, "step": 101905 }, { "epoch": 15.178731009830205, "grad_norm": 2.190626621246338, "learning_rate": 8.34130093829078e-06, "loss": 0.7022, "num_input_tokens_seen": 59170648, "step": 101910 }, { "epoch": 15.179475722371166, "grad_norm": 0.8140240907669067, "learning_rate": 8.338878177385508e-06, "loss": 0.3936, "num_input_tokens_seen": 59173432, "step": 101915 }, { "epoch": 15.180220434912124, "grad_norm": 1.0315033197402954, "learning_rate": 8.336455697952956e-06, "loss": 0.5917, "num_input_tokens_seen": 59176120, "step": 101920 }, { "epoch": 15.180965147453083, "grad_norm": 1.858373999595642, "learning_rate": 8.33403350003405e-06, "loss": 0.5649, "num_input_tokens_seen": 59178904, "step": 101925 }, { "epoch": 15.181709859994042, "grad_norm": 1.8101160526275635, "learning_rate": 8.3316115836697e-06, "loss": 0.6872, "num_input_tokens_seen": 59181624, "step": 101930 }, { "epoch": 15.182454572535, "grad_norm": 1.8796391487121582, "learning_rate": 8.32918994890084e-06, "loss": 0.4939, "num_input_tokens_seen": 59184504, "step": 101935 }, { "epoch": 15.183199285075961, "grad_norm": 1.8317347764968872, "learning_rate": 8.32676859576837e-06, "loss": 0.546, "num_input_tokens_seen": 59187128, "step": 101940 }, { "epoch": 15.18394399761692, "grad_norm": 1.4216431379318237, "learning_rate": 8.324347524313192e-06, "loss": 0.7155, "num_input_tokens_seen": 59189944, "step": 101945 }, { "epoch": 15.184688710157879, "grad_norm": 2.4415769577026367, "learning_rate": 8.321926734576223e-06, "loss": 0.669, "num_input_tokens_seen": 59192952, "step": 101950 }, { "epoch": 15.185433422698837, "grad_norm": 2.2461929321289062, "learning_rate": 8.319506226598342e-06, "loss": 0.7294, "num_input_tokens_seen": 59195672, "step": 101955 }, { "epoch": 15.186178135239798, "grad_norm": 2.3793904781341553, "learning_rate": 8.317086000420459e-06, "loss": 0.72, "num_input_tokens_seen": 59198616, "step": 101960 }, { "epoch": 15.186922847780757, "grad_norm": 2.051017999649048, "learning_rate": 8.314666056083444e-06, "loss": 0.6138, "num_input_tokens_seen": 59201464, "step": 101965 }, { "epoch": 15.187667560321715, "grad_norm": 1.441918134689331, "learning_rate": 8.312246393628195e-06, "loss": 0.5818, "num_input_tokens_seen": 59204792, "step": 101970 }, { "epoch": 15.188412272862674, "grad_norm": 1.5167800188064575, "learning_rate": 8.309827013095584e-06, "loss": 0.7861, "num_input_tokens_seen": 59207864, "step": 101975 }, { "epoch": 15.189156985403635, "grad_norm": 2.2765886783599854, "learning_rate": 8.30740791452648e-06, "loss": 0.524, "num_input_tokens_seen": 59210680, "step": 101980 }, { "epoch": 15.189901697944594, "grad_norm": 1.398830771446228, "learning_rate": 8.304989097961748e-06, "loss": 0.4254, "num_input_tokens_seen": 59213464, "step": 101985 }, { "epoch": 15.190646410485552, "grad_norm": 1.5046418905258179, "learning_rate": 8.302570563442263e-06, "loss": 0.5904, "num_input_tokens_seen": 59216376, "step": 101990 }, { "epoch": 15.191391123026511, "grad_norm": 1.7144774198532104, "learning_rate": 8.300152311008883e-06, "loss": 0.5423, "num_input_tokens_seen": 59219320, "step": 101995 }, { "epoch": 15.192135835567472, "grad_norm": 1.8494188785552979, "learning_rate": 8.297734340702443e-06, "loss": 0.6294, "num_input_tokens_seen": 59222072, "step": 102000 }, { "epoch": 15.19288054810843, "grad_norm": 0.7727224826812744, "learning_rate": 8.295316652563817e-06, "loss": 0.4681, "num_input_tokens_seen": 59225304, "step": 102005 }, { "epoch": 15.19362526064939, "grad_norm": 1.8958680629730225, "learning_rate": 8.292899246633828e-06, "loss": 0.4873, "num_input_tokens_seen": 59228120, "step": 102010 }, { "epoch": 15.194369973190348, "grad_norm": 1.0109143257141113, "learning_rate": 8.290482122953336e-06, "loss": 0.6835, "num_input_tokens_seen": 59230904, "step": 102015 }, { "epoch": 15.195114685731308, "grad_norm": 0.7306501865386963, "learning_rate": 8.288065281563164e-06, "loss": 0.5658, "num_input_tokens_seen": 59233752, "step": 102020 }, { "epoch": 15.195859398272267, "grad_norm": 0.9543352723121643, "learning_rate": 8.285648722504136e-06, "loss": 0.5557, "num_input_tokens_seen": 59236888, "step": 102025 }, { "epoch": 15.196604110813226, "grad_norm": 1.8255056142807007, "learning_rate": 8.283232445817094e-06, "loss": 0.5402, "num_input_tokens_seen": 59239864, "step": 102030 }, { "epoch": 15.197348823354185, "grad_norm": 3.311473846435547, "learning_rate": 8.280816451542841e-06, "loss": 0.6631, "num_input_tokens_seen": 59242648, "step": 102035 }, { "epoch": 15.198093535895145, "grad_norm": 1.7640351057052612, "learning_rate": 8.278400739722211e-06, "loss": 0.473, "num_input_tokens_seen": 59245592, "step": 102040 }, { "epoch": 15.198838248436104, "grad_norm": 1.2451103925704956, "learning_rate": 8.275985310396003e-06, "loss": 0.71, "num_input_tokens_seen": 59248344, "step": 102045 }, { "epoch": 15.199582960977063, "grad_norm": 2.8875982761383057, "learning_rate": 8.273570163605026e-06, "loss": 0.7625, "num_input_tokens_seen": 59251224, "step": 102050 }, { "epoch": 15.200327673518021, "grad_norm": 2.0246598720550537, "learning_rate": 8.271155299390082e-06, "loss": 0.7294, "num_input_tokens_seen": 59253880, "step": 102055 }, { "epoch": 15.201072386058982, "grad_norm": 1.1442973613739014, "learning_rate": 8.26874071779196e-06, "loss": 0.6294, "num_input_tokens_seen": 59256664, "step": 102060 }, { "epoch": 15.20181709859994, "grad_norm": 1.5721200704574585, "learning_rate": 8.266326418851467e-06, "loss": 0.5463, "num_input_tokens_seen": 59259384, "step": 102065 }, { "epoch": 15.2025618111409, "grad_norm": 2.6205198764801025, "learning_rate": 8.26391240260937e-06, "loss": 0.7069, "num_input_tokens_seen": 59262392, "step": 102070 }, { "epoch": 15.203306523681858, "grad_norm": 1.040968418121338, "learning_rate": 8.261498669106473e-06, "loss": 0.6228, "num_input_tokens_seen": 59265432, "step": 102075 }, { "epoch": 15.204051236222819, "grad_norm": 1.1538819074630737, "learning_rate": 8.259085218383536e-06, "loss": 0.5887, "num_input_tokens_seen": 59268184, "step": 102080 }, { "epoch": 15.204795948763778, "grad_norm": 2.134791135787964, "learning_rate": 8.256672050481348e-06, "loss": 0.4614, "num_input_tokens_seen": 59271000, "step": 102085 }, { "epoch": 15.205540661304736, "grad_norm": 2.1778712272644043, "learning_rate": 8.254259165440662e-06, "loss": 0.6703, "num_input_tokens_seen": 59273720, "step": 102090 }, { "epoch": 15.206285373845695, "grad_norm": 1.7688387632369995, "learning_rate": 8.251846563302253e-06, "loss": 0.5573, "num_input_tokens_seen": 59276856, "step": 102095 }, { "epoch": 15.207030086386654, "grad_norm": 1.4745945930480957, "learning_rate": 8.249434244106875e-06, "loss": 0.651, "num_input_tokens_seen": 59279736, "step": 102100 }, { "epoch": 15.207774798927614, "grad_norm": 2.075526237487793, "learning_rate": 8.247022207895271e-06, "loss": 0.5218, "num_input_tokens_seen": 59282648, "step": 102105 }, { "epoch": 15.208519511468573, "grad_norm": 1.8575820922851562, "learning_rate": 8.244610454708213e-06, "loss": 0.5895, "num_input_tokens_seen": 59285592, "step": 102110 }, { "epoch": 15.209264224009532, "grad_norm": 2.135209560394287, "learning_rate": 8.242198984586427e-06, "loss": 0.5358, "num_input_tokens_seen": 59288664, "step": 102115 }, { "epoch": 15.21000893655049, "grad_norm": 1.6298696994781494, "learning_rate": 8.239787797570661e-06, "loss": 0.326, "num_input_tokens_seen": 59291512, "step": 102120 }, { "epoch": 15.210753649091451, "grad_norm": 1.5205445289611816, "learning_rate": 8.237376893701635e-06, "loss": 0.6901, "num_input_tokens_seen": 59294232, "step": 102125 }, { "epoch": 15.21149836163241, "grad_norm": 2.946409225463867, "learning_rate": 8.2349662730201e-06, "loss": 0.5644, "num_input_tokens_seen": 59297208, "step": 102130 }, { "epoch": 15.212243074173369, "grad_norm": 1.8309754133224487, "learning_rate": 8.232555935566769e-06, "loss": 0.6513, "num_input_tokens_seen": 59300024, "step": 102135 }, { "epoch": 15.212987786714327, "grad_norm": 1.2680554389953613, "learning_rate": 8.230145881382357e-06, "loss": 0.6476, "num_input_tokens_seen": 59302712, "step": 102140 }, { "epoch": 15.213732499255288, "grad_norm": 2.3496837615966797, "learning_rate": 8.227736110507592e-06, "loss": 0.7241, "num_input_tokens_seen": 59305496, "step": 102145 }, { "epoch": 15.214477211796247, "grad_norm": 3.78204345703125, "learning_rate": 8.225326622983173e-06, "loss": 0.8554, "num_input_tokens_seen": 59308472, "step": 102150 }, { "epoch": 15.215221924337206, "grad_norm": 1.2899843454360962, "learning_rate": 8.222917418849819e-06, "loss": 0.647, "num_input_tokens_seen": 59311544, "step": 102155 }, { "epoch": 15.215966636878164, "grad_norm": 1.3068453073501587, "learning_rate": 8.220508498148213e-06, "loss": 0.4413, "num_input_tokens_seen": 59314648, "step": 102160 }, { "epoch": 15.216711349419125, "grad_norm": 1.3699030876159668, "learning_rate": 8.218099860919074e-06, "loss": 0.5242, "num_input_tokens_seen": 59317176, "step": 102165 }, { "epoch": 15.217456061960084, "grad_norm": 2.3619589805603027, "learning_rate": 8.215691507203072e-06, "loss": 0.6364, "num_input_tokens_seen": 59320248, "step": 102170 }, { "epoch": 15.218200774501042, "grad_norm": 2.0315945148468018, "learning_rate": 8.213283437040911e-06, "loss": 0.4988, "num_input_tokens_seen": 59323096, "step": 102175 }, { "epoch": 15.218945487042001, "grad_norm": 3.9542105197906494, "learning_rate": 8.210875650473266e-06, "loss": 0.6223, "num_input_tokens_seen": 59326232, "step": 102180 }, { "epoch": 15.219690199582962, "grad_norm": 1.715775966644287, "learning_rate": 8.208468147540812e-06, "loss": 0.5078, "num_input_tokens_seen": 59329240, "step": 102185 }, { "epoch": 15.22043491212392, "grad_norm": 3.494581460952759, "learning_rate": 8.206060928284223e-06, "loss": 0.7299, "num_input_tokens_seen": 59332056, "step": 102190 }, { "epoch": 15.22117962466488, "grad_norm": 1.1537983417510986, "learning_rate": 8.20365399274416e-06, "loss": 0.7212, "num_input_tokens_seen": 59335096, "step": 102195 }, { "epoch": 15.221924337205838, "grad_norm": 1.7500754594802856, "learning_rate": 8.201247340961296e-06, "loss": 0.5853, "num_input_tokens_seen": 59337976, "step": 102200 }, { "epoch": 15.222669049746798, "grad_norm": 3.555840492248535, "learning_rate": 8.19884097297628e-06, "loss": 0.6082, "num_input_tokens_seen": 59340696, "step": 102205 }, { "epoch": 15.223413762287757, "grad_norm": 2.8041458129882812, "learning_rate": 8.196434888829774e-06, "loss": 0.6697, "num_input_tokens_seen": 59343416, "step": 102210 }, { "epoch": 15.224158474828716, "grad_norm": 1.4723352193832397, "learning_rate": 8.194029088562425e-06, "loss": 0.5968, "num_input_tokens_seen": 59346168, "step": 102215 }, { "epoch": 15.224903187369675, "grad_norm": 1.7064850330352783, "learning_rate": 8.191623572214865e-06, "loss": 0.4888, "num_input_tokens_seen": 59348696, "step": 102220 }, { "epoch": 15.225647899910635, "grad_norm": 1.3627007007598877, "learning_rate": 8.18921833982775e-06, "loss": 0.7361, "num_input_tokens_seen": 59351736, "step": 102225 }, { "epoch": 15.226392612451594, "grad_norm": 1.2587652206420898, "learning_rate": 8.186813391441697e-06, "loss": 0.5538, "num_input_tokens_seen": 59354680, "step": 102230 }, { "epoch": 15.227137324992553, "grad_norm": 3.572809934616089, "learning_rate": 8.184408727097354e-06, "loss": 0.7431, "num_input_tokens_seen": 59357720, "step": 102235 }, { "epoch": 15.227882037533512, "grad_norm": 3.046858549118042, "learning_rate": 8.182004346835323e-06, "loss": 0.7132, "num_input_tokens_seen": 59360376, "step": 102240 }, { "epoch": 15.228626750074472, "grad_norm": 2.9569122791290283, "learning_rate": 8.179600250696245e-06, "loss": 0.7853, "num_input_tokens_seen": 59363256, "step": 102245 }, { "epoch": 15.22937146261543, "grad_norm": 1.6895105838775635, "learning_rate": 8.177196438720724e-06, "loss": 0.5466, "num_input_tokens_seen": 59366200, "step": 102250 }, { "epoch": 15.23011617515639, "grad_norm": 1.7636210918426514, "learning_rate": 8.174792910949376e-06, "loss": 0.4718, "num_input_tokens_seen": 59368824, "step": 102255 }, { "epoch": 15.230860887697348, "grad_norm": 1.5286729335784912, "learning_rate": 8.172389667422797e-06, "loss": 0.5075, "num_input_tokens_seen": 59371704, "step": 102260 }, { "epoch": 15.231605600238307, "grad_norm": 1.4435510635375977, "learning_rate": 8.169986708181584e-06, "loss": 0.5028, "num_input_tokens_seen": 59374616, "step": 102265 }, { "epoch": 15.232350312779268, "grad_norm": 1.1640307903289795, "learning_rate": 8.167584033266349e-06, "loss": 0.3431, "num_input_tokens_seen": 59377432, "step": 102270 }, { "epoch": 15.233095025320226, "grad_norm": 1.0674070119857788, "learning_rate": 8.165181642717668e-06, "loss": 0.5572, "num_input_tokens_seen": 59380312, "step": 102275 }, { "epoch": 15.233839737861185, "grad_norm": 1.5333420038223267, "learning_rate": 8.162779536576138e-06, "loss": 0.7267, "num_input_tokens_seen": 59383448, "step": 102280 }, { "epoch": 15.234584450402144, "grad_norm": 2.3510982990264893, "learning_rate": 8.160377714882327e-06, "loss": 0.5266, "num_input_tokens_seen": 59386360, "step": 102285 }, { "epoch": 15.235329162943104, "grad_norm": 1.515794038772583, "learning_rate": 8.15797617767683e-06, "loss": 0.5939, "num_input_tokens_seen": 59389208, "step": 102290 }, { "epoch": 15.236073875484063, "grad_norm": 1.8923389911651611, "learning_rate": 8.155574925000207e-06, "loss": 0.5304, "num_input_tokens_seen": 59392184, "step": 102295 }, { "epoch": 15.236818588025022, "grad_norm": 2.0454463958740234, "learning_rate": 8.153173956893018e-06, "loss": 0.4661, "num_input_tokens_seen": 59395352, "step": 102300 }, { "epoch": 15.23756330056598, "grad_norm": 0.9964738488197327, "learning_rate": 8.15077327339584e-06, "loss": 0.5877, "num_input_tokens_seen": 59398232, "step": 102305 }, { "epoch": 15.238308013106941, "grad_norm": 1.5792444944381714, "learning_rate": 8.148372874549224e-06, "loss": 0.6369, "num_input_tokens_seen": 59401016, "step": 102310 }, { "epoch": 15.2390527256479, "grad_norm": 0.671059250831604, "learning_rate": 8.145972760393711e-06, "loss": 0.5943, "num_input_tokens_seen": 59403800, "step": 102315 }, { "epoch": 15.239797438188859, "grad_norm": 0.7318406105041504, "learning_rate": 8.143572930969866e-06, "loss": 0.4846, "num_input_tokens_seen": 59406456, "step": 102320 }, { "epoch": 15.240542150729818, "grad_norm": 3.2330119609832764, "learning_rate": 8.141173386318226e-06, "loss": 0.839, "num_input_tokens_seen": 59409880, "step": 102325 }, { "epoch": 15.241286863270778, "grad_norm": 1.3684626817703247, "learning_rate": 8.13877412647932e-06, "loss": 0.6917, "num_input_tokens_seen": 59412632, "step": 102330 }, { "epoch": 15.242031575811737, "grad_norm": 1.882730484008789, "learning_rate": 8.136375151493695e-06, "loss": 0.6629, "num_input_tokens_seen": 59415192, "step": 102335 }, { "epoch": 15.242776288352696, "grad_norm": 1.468977689743042, "learning_rate": 8.13397646140187e-06, "loss": 0.5148, "num_input_tokens_seen": 59417784, "step": 102340 }, { "epoch": 15.243521000893654, "grad_norm": 1.3203673362731934, "learning_rate": 8.131578056244365e-06, "loss": 0.544, "num_input_tokens_seen": 59420728, "step": 102345 }, { "epoch": 15.244265713434615, "grad_norm": 2.8210785388946533, "learning_rate": 8.129179936061715e-06, "loss": 0.7188, "num_input_tokens_seen": 59423448, "step": 102350 }, { "epoch": 15.245010425975574, "grad_norm": 1.0321745872497559, "learning_rate": 8.126782100894411e-06, "loss": 0.5831, "num_input_tokens_seen": 59426392, "step": 102355 }, { "epoch": 15.245755138516532, "grad_norm": 4.101057529449463, "learning_rate": 8.124384550782985e-06, "loss": 0.7548, "num_input_tokens_seen": 59429336, "step": 102360 }, { "epoch": 15.246499851057491, "grad_norm": 1.7174999713897705, "learning_rate": 8.12198728576792e-06, "loss": 0.5266, "num_input_tokens_seen": 59432472, "step": 102365 }, { "epoch": 15.247244563598452, "grad_norm": 1.6824145317077637, "learning_rate": 8.119590305889737e-06, "loss": 0.8829, "num_input_tokens_seen": 59435352, "step": 102370 }, { "epoch": 15.24798927613941, "grad_norm": 0.7513353824615479, "learning_rate": 8.117193611188917e-06, "loss": 0.2895, "num_input_tokens_seen": 59438072, "step": 102375 }, { "epoch": 15.24873398868037, "grad_norm": 1.0000325441360474, "learning_rate": 8.114797201705954e-06, "loss": 0.4779, "num_input_tokens_seen": 59441432, "step": 102380 }, { "epoch": 15.249478701221328, "grad_norm": 1.3252190351486206, "learning_rate": 8.112401077481329e-06, "loss": 0.5967, "num_input_tokens_seen": 59444152, "step": 102385 }, { "epoch": 15.250223413762289, "grad_norm": 1.1789478063583374, "learning_rate": 8.110005238555517e-06, "loss": 0.5725, "num_input_tokens_seen": 59447032, "step": 102390 }, { "epoch": 15.250968126303247, "grad_norm": 1.5108381509780884, "learning_rate": 8.107609684969008e-06, "loss": 0.6078, "num_input_tokens_seen": 59449624, "step": 102395 }, { "epoch": 15.251712838844206, "grad_norm": 2.314242124557495, "learning_rate": 8.105214416762255e-06, "loss": 0.6428, "num_input_tokens_seen": 59452760, "step": 102400 }, { "epoch": 15.252457551385165, "grad_norm": 1.9953802824020386, "learning_rate": 8.102819433975745e-06, "loss": 0.6076, "num_input_tokens_seen": 59456152, "step": 102405 }, { "epoch": 15.253202263926125, "grad_norm": 1.7324661016464233, "learning_rate": 8.100424736649918e-06, "loss": 0.3802, "num_input_tokens_seen": 59459000, "step": 102410 }, { "epoch": 15.253946976467084, "grad_norm": 1.372040033340454, "learning_rate": 8.098030324825246e-06, "loss": 0.5125, "num_input_tokens_seen": 59461656, "step": 102415 }, { "epoch": 15.254691689008043, "grad_norm": 2.8203043937683105, "learning_rate": 8.095636198542173e-06, "loss": 0.8057, "num_input_tokens_seen": 59464408, "step": 102420 }, { "epoch": 15.255436401549002, "grad_norm": 1.3142297267913818, "learning_rate": 8.093242357841136e-06, "loss": 0.6255, "num_input_tokens_seen": 59467288, "step": 102425 }, { "epoch": 15.256181114089962, "grad_norm": 1.1456215381622314, "learning_rate": 8.090848802762596e-06, "loss": 0.5521, "num_input_tokens_seen": 59470136, "step": 102430 }, { "epoch": 15.256925826630921, "grad_norm": 1.0843982696533203, "learning_rate": 8.088455533346973e-06, "loss": 0.5897, "num_input_tokens_seen": 59472568, "step": 102435 }, { "epoch": 15.25767053917188, "grad_norm": 1.5867547988891602, "learning_rate": 8.086062549634712e-06, "loss": 0.5153, "num_input_tokens_seen": 59475576, "step": 102440 }, { "epoch": 15.258415251712838, "grad_norm": 2.5688350200653076, "learning_rate": 8.083669851666235e-06, "loss": 0.7331, "num_input_tokens_seen": 59478232, "step": 102445 }, { "epoch": 15.259159964253797, "grad_norm": 2.397775650024414, "learning_rate": 8.081277439481961e-06, "loss": 0.6924, "num_input_tokens_seen": 59481336, "step": 102450 }, { "epoch": 15.259904676794758, "grad_norm": 1.185407042503357, "learning_rate": 8.078885313122311e-06, "loss": 0.5042, "num_input_tokens_seen": 59484216, "step": 102455 }, { "epoch": 15.260649389335716, "grad_norm": 1.1429386138916016, "learning_rate": 8.076493472627687e-06, "loss": 0.6903, "num_input_tokens_seen": 59487384, "step": 102460 }, { "epoch": 15.261394101876675, "grad_norm": 2.003403663635254, "learning_rate": 8.074101918038512e-06, "loss": 0.6028, "num_input_tokens_seen": 59490328, "step": 102465 }, { "epoch": 15.262138814417634, "grad_norm": 1.8067071437835693, "learning_rate": 8.071710649395178e-06, "loss": 0.5657, "num_input_tokens_seen": 59493304, "step": 102470 }, { "epoch": 15.262883526958595, "grad_norm": 1.5142487287521362, "learning_rate": 8.069319666738093e-06, "loss": 0.5285, "num_input_tokens_seen": 59496184, "step": 102475 }, { "epoch": 15.263628239499553, "grad_norm": 1.310112476348877, "learning_rate": 8.066928970107638e-06, "loss": 0.5907, "num_input_tokens_seen": 59499128, "step": 102480 }, { "epoch": 15.264372952040512, "grad_norm": 1.6375900506973267, "learning_rate": 8.064538559544213e-06, "loss": 0.5854, "num_input_tokens_seen": 59502104, "step": 102485 }, { "epoch": 15.26511766458147, "grad_norm": 2.0599112510681152, "learning_rate": 8.06214843508819e-06, "loss": 0.691, "num_input_tokens_seen": 59504824, "step": 102490 }, { "epoch": 15.265862377122431, "grad_norm": 2.312702178955078, "learning_rate": 8.059758596779965e-06, "loss": 0.4554, "num_input_tokens_seen": 59507896, "step": 102495 }, { "epoch": 15.26660708966339, "grad_norm": 1.1963289976119995, "learning_rate": 8.057369044659899e-06, "loss": 0.5337, "num_input_tokens_seen": 59511032, "step": 102500 }, { "epoch": 15.267351802204349, "grad_norm": 1.2421972751617432, "learning_rate": 8.054979778768354e-06, "loss": 0.6201, "num_input_tokens_seen": 59513688, "step": 102505 }, { "epoch": 15.268096514745308, "grad_norm": 1.0505694150924683, "learning_rate": 8.052590799145715e-06, "loss": 0.4911, "num_input_tokens_seen": 59516824, "step": 102510 }, { "epoch": 15.268841227286268, "grad_norm": 1.5758870840072632, "learning_rate": 8.050202105832327e-06, "loss": 0.5969, "num_input_tokens_seen": 59519608, "step": 102515 }, { "epoch": 15.269585939827227, "grad_norm": 2.1839020252227783, "learning_rate": 8.047813698868548e-06, "loss": 0.4839, "num_input_tokens_seen": 59522520, "step": 102520 }, { "epoch": 15.270330652368186, "grad_norm": 1.2002493143081665, "learning_rate": 8.045425578294719e-06, "loss": 0.5153, "num_input_tokens_seen": 59525848, "step": 102525 }, { "epoch": 15.271075364909144, "grad_norm": 1.120916724205017, "learning_rate": 8.043037744151203e-06, "loss": 0.4794, "num_input_tokens_seen": 59529016, "step": 102530 }, { "epoch": 15.271820077450105, "grad_norm": 1.0106632709503174, "learning_rate": 8.040650196478319e-06, "loss": 0.466, "num_input_tokens_seen": 59531640, "step": 102535 }, { "epoch": 15.272564789991064, "grad_norm": 1.3620561361312866, "learning_rate": 8.038262935316423e-06, "loss": 0.5577, "num_input_tokens_seen": 59534360, "step": 102540 }, { "epoch": 15.273309502532022, "grad_norm": 1.0714328289031982, "learning_rate": 8.035875960705835e-06, "loss": 0.4325, "num_input_tokens_seen": 59537016, "step": 102545 }, { "epoch": 15.274054215072981, "grad_norm": 1.7557299137115479, "learning_rate": 8.033489272686872e-06, "loss": 0.5632, "num_input_tokens_seen": 59539672, "step": 102550 }, { "epoch": 15.274798927613942, "grad_norm": 1.820859432220459, "learning_rate": 8.031102871299876e-06, "loss": 0.6189, "num_input_tokens_seen": 59542552, "step": 102555 }, { "epoch": 15.2755436401549, "grad_norm": 1.3478645086288452, "learning_rate": 8.02871675658514e-06, "loss": 0.5341, "num_input_tokens_seen": 59545688, "step": 102560 }, { "epoch": 15.27628835269586, "grad_norm": 1.5295460224151611, "learning_rate": 8.026330928582993e-06, "loss": 0.595, "num_input_tokens_seen": 59548376, "step": 102565 }, { "epoch": 15.277033065236818, "grad_norm": 0.8549254536628723, "learning_rate": 8.023945387333722e-06, "loss": 0.4506, "num_input_tokens_seen": 59551576, "step": 102570 }, { "epoch": 15.277777777777779, "grad_norm": 3.0118634700775146, "learning_rate": 8.021560132877653e-06, "loss": 0.5875, "num_input_tokens_seen": 59554552, "step": 102575 }, { "epoch": 15.278522490318737, "grad_norm": 1.3554885387420654, "learning_rate": 8.019175165255069e-06, "loss": 0.4827, "num_input_tokens_seen": 59557368, "step": 102580 }, { "epoch": 15.279267202859696, "grad_norm": 1.3495439291000366, "learning_rate": 8.016790484506261e-06, "loss": 0.5347, "num_input_tokens_seen": 59560120, "step": 102585 }, { "epoch": 15.280011915400655, "grad_norm": 1.8533447980880737, "learning_rate": 8.014406090671516e-06, "loss": 0.8938, "num_input_tokens_seen": 59563032, "step": 102590 }, { "epoch": 15.280756627941615, "grad_norm": 0.7115100622177124, "learning_rate": 8.012021983791112e-06, "loss": 0.6768, "num_input_tokens_seen": 59565976, "step": 102595 }, { "epoch": 15.281501340482574, "grad_norm": 1.7821286916732788, "learning_rate": 8.009638163905337e-06, "loss": 0.6872, "num_input_tokens_seen": 59568760, "step": 102600 }, { "epoch": 15.282246053023533, "grad_norm": 2.094353199005127, "learning_rate": 8.007254631054448e-06, "loss": 0.7104, "num_input_tokens_seen": 59571672, "step": 102605 }, { "epoch": 15.282990765564492, "grad_norm": 1.5687739849090576, "learning_rate": 8.00487138527873e-06, "loss": 0.5715, "num_input_tokens_seen": 59574488, "step": 102610 }, { "epoch": 15.283735478105452, "grad_norm": 1.126154899597168, "learning_rate": 8.002488426618429e-06, "loss": 0.5088, "num_input_tokens_seen": 59577176, "step": 102615 }, { "epoch": 15.284480190646411, "grad_norm": 1.150240421295166, "learning_rate": 8.000105755113818e-06, "loss": 0.6644, "num_input_tokens_seen": 59580472, "step": 102620 }, { "epoch": 15.28522490318737, "grad_norm": 1.9301643371582031, "learning_rate": 7.997723370805143e-06, "loss": 0.4982, "num_input_tokens_seen": 59583192, "step": 102625 }, { "epoch": 15.285969615728328, "grad_norm": 2.418982982635498, "learning_rate": 7.995341273732642e-06, "loss": 0.544, "num_input_tokens_seen": 59586232, "step": 102630 }, { "epoch": 15.286714328269287, "grad_norm": 2.01224684715271, "learning_rate": 7.992959463936578e-06, "loss": 0.7044, "num_input_tokens_seen": 59589144, "step": 102635 }, { "epoch": 15.287459040810248, "grad_norm": 1.514593243598938, "learning_rate": 7.990577941457175e-06, "loss": 0.5487, "num_input_tokens_seen": 59592056, "step": 102640 }, { "epoch": 15.288203753351207, "grad_norm": 1.4114686250686646, "learning_rate": 7.988196706334666e-06, "loss": 0.5495, "num_input_tokens_seen": 59594936, "step": 102645 }, { "epoch": 15.288948465892165, "grad_norm": 1.8878788948059082, "learning_rate": 7.985815758609289e-06, "loss": 0.5958, "num_input_tokens_seen": 59597560, "step": 102650 }, { "epoch": 15.289693178433124, "grad_norm": 1.4986094236373901, "learning_rate": 7.983435098321263e-06, "loss": 0.6047, "num_input_tokens_seen": 59600344, "step": 102655 }, { "epoch": 15.290437890974085, "grad_norm": 2.5247840881347656, "learning_rate": 7.981054725510805e-06, "loss": 0.5881, "num_input_tokens_seen": 59604216, "step": 102660 }, { "epoch": 15.291182603515043, "grad_norm": 1.2765543460845947, "learning_rate": 7.978674640218126e-06, "loss": 0.4633, "num_input_tokens_seen": 59607032, "step": 102665 }, { "epoch": 15.291927316056002, "grad_norm": 2.402090549468994, "learning_rate": 7.976294842483446e-06, "loss": 0.5553, "num_input_tokens_seen": 59609912, "step": 102670 }, { "epoch": 15.29267202859696, "grad_norm": 2.642665147781372, "learning_rate": 7.97391533234695e-06, "loss": 0.7635, "num_input_tokens_seen": 59613016, "step": 102675 }, { "epoch": 15.293416741137921, "grad_norm": 1.2773209810256958, "learning_rate": 7.971536109848862e-06, "loss": 0.4744, "num_input_tokens_seen": 59616184, "step": 102680 }, { "epoch": 15.29416145367888, "grad_norm": 3.2518796920776367, "learning_rate": 7.969157175029354e-06, "loss": 0.5756, "num_input_tokens_seen": 59618968, "step": 102685 }, { "epoch": 15.294906166219839, "grad_norm": 2.547454357147217, "learning_rate": 7.966778527928637e-06, "loss": 0.6589, "num_input_tokens_seen": 59621784, "step": 102690 }, { "epoch": 15.295650878760798, "grad_norm": 1.8384052515029907, "learning_rate": 7.964400168586875e-06, "loss": 0.5996, "num_input_tokens_seen": 59624824, "step": 102695 }, { "epoch": 15.296395591301758, "grad_norm": 0.7008944153785706, "learning_rate": 7.962022097044266e-06, "loss": 0.5154, "num_input_tokens_seen": 59627480, "step": 102700 }, { "epoch": 15.297140303842717, "grad_norm": 1.9283584356307983, "learning_rate": 7.959644313340978e-06, "loss": 0.5467, "num_input_tokens_seen": 59630552, "step": 102705 }, { "epoch": 15.297885016383676, "grad_norm": 2.1160075664520264, "learning_rate": 7.95726681751718e-06, "loss": 0.7322, "num_input_tokens_seen": 59633304, "step": 102710 }, { "epoch": 15.298629728924634, "grad_norm": 1.5952281951904297, "learning_rate": 7.95488960961304e-06, "loss": 0.5885, "num_input_tokens_seen": 59636120, "step": 102715 }, { "epoch": 15.299374441465595, "grad_norm": 1.483080506324768, "learning_rate": 7.952512689668703e-06, "loss": 0.5816, "num_input_tokens_seen": 59639000, "step": 102720 }, { "epoch": 15.300119154006554, "grad_norm": 1.4228469133377075, "learning_rate": 7.95013605772435e-06, "loss": 0.7063, "num_input_tokens_seen": 59641912, "step": 102725 }, { "epoch": 15.300863866547513, "grad_norm": 1.719026803970337, "learning_rate": 7.947759713820111e-06, "loss": 0.7804, "num_input_tokens_seen": 59645048, "step": 102730 }, { "epoch": 15.301608579088471, "grad_norm": 1.3220683336257935, "learning_rate": 7.945383657996148e-06, "loss": 0.4593, "num_input_tokens_seen": 59647896, "step": 102735 }, { "epoch": 15.302353291629432, "grad_norm": 1.5220065116882324, "learning_rate": 7.943007890292593e-06, "loss": 0.586, "num_input_tokens_seen": 59650840, "step": 102740 }, { "epoch": 15.30309800417039, "grad_norm": 1.665761113166809, "learning_rate": 7.940632410749577e-06, "loss": 0.6123, "num_input_tokens_seen": 59654040, "step": 102745 }, { "epoch": 15.30384271671135, "grad_norm": 1.2067551612854004, "learning_rate": 7.938257219407246e-06, "loss": 0.5594, "num_input_tokens_seen": 59657240, "step": 102750 }, { "epoch": 15.304587429252308, "grad_norm": 0.9863471984863281, "learning_rate": 7.93588231630571e-06, "loss": 0.4499, "num_input_tokens_seen": 59660248, "step": 102755 }, { "epoch": 15.305332141793269, "grad_norm": 1.8327709436416626, "learning_rate": 7.933507701485108e-06, "loss": 0.5709, "num_input_tokens_seen": 59663064, "step": 102760 }, { "epoch": 15.306076854334227, "grad_norm": 2.9499478340148926, "learning_rate": 7.93113337498554e-06, "loss": 0.7205, "num_input_tokens_seen": 59665816, "step": 102765 }, { "epoch": 15.306821566875186, "grad_norm": 1.2866244316101074, "learning_rate": 7.928759336847133e-06, "loss": 0.5257, "num_input_tokens_seen": 59668792, "step": 102770 }, { "epoch": 15.307566279416145, "grad_norm": 1.7990491390228271, "learning_rate": 7.926385587109986e-06, "loss": 0.4277, "num_input_tokens_seen": 59671768, "step": 102775 }, { "epoch": 15.308310991957104, "grad_norm": 2.8474984169006348, "learning_rate": 7.924012125814203e-06, "loss": 0.7122, "num_input_tokens_seen": 59674616, "step": 102780 }, { "epoch": 15.309055704498064, "grad_norm": 1.6591788530349731, "learning_rate": 7.92163895299988e-06, "loss": 0.6507, "num_input_tokens_seen": 59677528, "step": 102785 }, { "epoch": 15.309800417039023, "grad_norm": 1.5466108322143555, "learning_rate": 7.919266068707099e-06, "loss": 0.6779, "num_input_tokens_seen": 59680760, "step": 102790 }, { "epoch": 15.310545129579982, "grad_norm": 1.1138659715652466, "learning_rate": 7.916893472975967e-06, "loss": 0.7031, "num_input_tokens_seen": 59683768, "step": 102795 }, { "epoch": 15.31128984212094, "grad_norm": 1.0576443672180176, "learning_rate": 7.91452116584655e-06, "loss": 0.432, "num_input_tokens_seen": 59686744, "step": 102800 }, { "epoch": 15.312034554661901, "grad_norm": 1.4102888107299805, "learning_rate": 7.912149147358938e-06, "loss": 0.4447, "num_input_tokens_seen": 59689816, "step": 102805 }, { "epoch": 15.31277926720286, "grad_norm": 2.9999160766601562, "learning_rate": 7.909777417553193e-06, "loss": 0.5834, "num_input_tokens_seen": 59692600, "step": 102810 }, { "epoch": 15.313523979743819, "grad_norm": 1.2072398662567139, "learning_rate": 7.907405976469397e-06, "loss": 0.6661, "num_input_tokens_seen": 59695480, "step": 102815 }, { "epoch": 15.314268692284777, "grad_norm": 1.8037201166152954, "learning_rate": 7.905034824147605e-06, "loss": 0.6331, "num_input_tokens_seen": 59698360, "step": 102820 }, { "epoch": 15.315013404825738, "grad_norm": 1.6233336925506592, "learning_rate": 7.902663960627869e-06, "loss": 0.4712, "num_input_tokens_seen": 59701368, "step": 102825 }, { "epoch": 15.315758117366697, "grad_norm": 1.836380958557129, "learning_rate": 7.900293385950253e-06, "loss": 0.6215, "num_input_tokens_seen": 59703992, "step": 102830 }, { "epoch": 15.316502829907655, "grad_norm": 1.6716605424880981, "learning_rate": 7.897923100154794e-06, "loss": 0.6751, "num_input_tokens_seen": 59706968, "step": 102835 }, { "epoch": 15.317247542448614, "grad_norm": 1.0875401496887207, "learning_rate": 7.895553103281552e-06, "loss": 0.565, "num_input_tokens_seen": 59710008, "step": 102840 }, { "epoch": 15.317992254989575, "grad_norm": 1.3834095001220703, "learning_rate": 7.893183395370554e-06, "loss": 0.6386, "num_input_tokens_seen": 59712728, "step": 102845 }, { "epoch": 15.318736967530533, "grad_norm": 1.8912748098373413, "learning_rate": 7.890813976461836e-06, "loss": 0.6846, "num_input_tokens_seen": 59715864, "step": 102850 }, { "epoch": 15.319481680071492, "grad_norm": 1.2593655586242676, "learning_rate": 7.888444846595422e-06, "loss": 0.5634, "num_input_tokens_seen": 59718616, "step": 102855 }, { "epoch": 15.320226392612451, "grad_norm": 0.9600899815559387, "learning_rate": 7.886076005811346e-06, "loss": 0.5874, "num_input_tokens_seen": 59721400, "step": 102860 }, { "epoch": 15.320971105153411, "grad_norm": 2.1455094814300537, "learning_rate": 7.883707454149621e-06, "loss": 0.4658, "num_input_tokens_seen": 59724440, "step": 102865 }, { "epoch": 15.32171581769437, "grad_norm": 1.707273006439209, "learning_rate": 7.881339191650256e-06, "loss": 0.5701, "num_input_tokens_seen": 59727352, "step": 102870 }, { "epoch": 15.322460530235329, "grad_norm": 1.5387171506881714, "learning_rate": 7.878971218353275e-06, "loss": 0.5984, "num_input_tokens_seen": 59730200, "step": 102875 }, { "epoch": 15.323205242776288, "grad_norm": 1.3367559909820557, "learning_rate": 7.876603534298666e-06, "loss": 0.6937, "num_input_tokens_seen": 59732824, "step": 102880 }, { "epoch": 15.323949955317248, "grad_norm": 1.650806188583374, "learning_rate": 7.874236139526445e-06, "loss": 0.5216, "num_input_tokens_seen": 59735864, "step": 102885 }, { "epoch": 15.324694667858207, "grad_norm": 1.3824070692062378, "learning_rate": 7.87186903407659e-06, "loss": 0.54, "num_input_tokens_seen": 59739192, "step": 102890 }, { "epoch": 15.325439380399166, "grad_norm": 3.209904432296753, "learning_rate": 7.869502217989108e-06, "loss": 0.6557, "num_input_tokens_seen": 59742168, "step": 102895 }, { "epoch": 15.326184092940125, "grad_norm": 1.7568286657333374, "learning_rate": 7.867135691303975e-06, "loss": 0.6642, "num_input_tokens_seen": 59745432, "step": 102900 }, { "epoch": 15.326928805481085, "grad_norm": 2.3045425415039062, "learning_rate": 7.864769454061163e-06, "loss": 0.572, "num_input_tokens_seen": 59748760, "step": 102905 }, { "epoch": 15.327673518022044, "grad_norm": 1.463969111442566, "learning_rate": 7.862403506300664e-06, "loss": 0.6402, "num_input_tokens_seen": 59751544, "step": 102910 }, { "epoch": 15.328418230563003, "grad_norm": 2.829582691192627, "learning_rate": 7.86003784806244e-06, "loss": 0.6696, "num_input_tokens_seen": 59754392, "step": 102915 }, { "epoch": 15.329162943103961, "grad_norm": 1.9416954517364502, "learning_rate": 7.857672479386458e-06, "loss": 0.7403, "num_input_tokens_seen": 59757208, "step": 102920 }, { "epoch": 15.329907655644922, "grad_norm": 1.301131010055542, "learning_rate": 7.855307400312667e-06, "loss": 0.6093, "num_input_tokens_seen": 59760472, "step": 102925 }, { "epoch": 15.33065236818588, "grad_norm": 1.2779979705810547, "learning_rate": 7.85294261088104e-06, "loss": 0.5951, "num_input_tokens_seen": 59763288, "step": 102930 }, { "epoch": 15.33139708072684, "grad_norm": 0.8481202721595764, "learning_rate": 7.850578111131513e-06, "loss": 0.5399, "num_input_tokens_seen": 59766168, "step": 102935 }, { "epoch": 15.332141793267798, "grad_norm": 3.5258712768554688, "learning_rate": 7.848213901104045e-06, "loss": 0.7301, "num_input_tokens_seen": 59769112, "step": 102940 }, { "epoch": 15.332886505808759, "grad_norm": 1.7035579681396484, "learning_rate": 7.845849980838574e-06, "loss": 0.5021, "num_input_tokens_seen": 59772568, "step": 102945 }, { "epoch": 15.333631218349717, "grad_norm": 1.6861268281936646, "learning_rate": 7.843486350375023e-06, "loss": 0.5276, "num_input_tokens_seen": 59775384, "step": 102950 }, { "epoch": 15.334375930890676, "grad_norm": 0.8323277831077576, "learning_rate": 7.84112300975334e-06, "loss": 0.6042, "num_input_tokens_seen": 59778488, "step": 102955 }, { "epoch": 15.335120643431635, "grad_norm": 2.2162301540374756, "learning_rate": 7.838759959013439e-06, "loss": 0.6981, "num_input_tokens_seen": 59781272, "step": 102960 }, { "epoch": 15.335865355972594, "grad_norm": 0.8940776586532593, "learning_rate": 7.836397198195252e-06, "loss": 0.5062, "num_input_tokens_seen": 59784504, "step": 102965 }, { "epoch": 15.336610068513554, "grad_norm": 2.1078848838806152, "learning_rate": 7.83403472733869e-06, "loss": 0.4738, "num_input_tokens_seen": 59787448, "step": 102970 }, { "epoch": 15.337354781054513, "grad_norm": 2.496253252029419, "learning_rate": 7.83167254648366e-06, "loss": 0.7929, "num_input_tokens_seen": 59790296, "step": 102975 }, { "epoch": 15.338099493595472, "grad_norm": 0.9525500535964966, "learning_rate": 7.829310655670077e-06, "loss": 0.5089, "num_input_tokens_seen": 59792984, "step": 102980 }, { "epoch": 15.33884420613643, "grad_norm": 1.8927624225616455, "learning_rate": 7.82694905493784e-06, "loss": 0.7615, "num_input_tokens_seen": 59796120, "step": 102985 }, { "epoch": 15.339588918677391, "grad_norm": 1.9983947277069092, "learning_rate": 7.824587744326847e-06, "loss": 0.7467, "num_input_tokens_seen": 59799352, "step": 102990 }, { "epoch": 15.34033363121835, "grad_norm": 2.0881338119506836, "learning_rate": 7.822226723876976e-06, "loss": 0.5382, "num_input_tokens_seen": 59802264, "step": 102995 }, { "epoch": 15.341078343759309, "grad_norm": 2.7451982498168945, "learning_rate": 7.819865993628139e-06, "loss": 0.564, "num_input_tokens_seen": 59805112, "step": 103000 }, { "epoch": 15.341823056300267, "grad_norm": 2.014981746673584, "learning_rate": 7.817505553620194e-06, "loss": 0.6154, "num_input_tokens_seen": 59807832, "step": 103005 }, { "epoch": 15.342567768841228, "grad_norm": 1.1985048055648804, "learning_rate": 7.815145403893037e-06, "loss": 0.5007, "num_input_tokens_seen": 59810712, "step": 103010 }, { "epoch": 15.343312481382187, "grad_norm": 1.8681395053863525, "learning_rate": 7.812785544486526e-06, "loss": 0.5246, "num_input_tokens_seen": 59813432, "step": 103015 }, { "epoch": 15.344057193923145, "grad_norm": 1.7168869972229004, "learning_rate": 7.81042597544054e-06, "loss": 0.5412, "num_input_tokens_seen": 59816280, "step": 103020 }, { "epoch": 15.344801906464104, "grad_norm": 1.1020630598068237, "learning_rate": 7.808066696794938e-06, "loss": 0.5208, "num_input_tokens_seen": 59819160, "step": 103025 }, { "epoch": 15.345546619005065, "grad_norm": 1.49748957157135, "learning_rate": 7.805707708589569e-06, "loss": 0.7223, "num_input_tokens_seen": 59822104, "step": 103030 }, { "epoch": 15.346291331546023, "grad_norm": 2.6745755672454834, "learning_rate": 7.8033490108643e-06, "loss": 0.7247, "num_input_tokens_seen": 59825080, "step": 103035 }, { "epoch": 15.347036044086982, "grad_norm": 1.2067067623138428, "learning_rate": 7.80099060365897e-06, "loss": 0.5501, "num_input_tokens_seen": 59827960, "step": 103040 }, { "epoch": 15.347780756627941, "grad_norm": 2.000229835510254, "learning_rate": 7.798632487013427e-06, "loss": 0.7228, "num_input_tokens_seen": 59831224, "step": 103045 }, { "epoch": 15.348525469168901, "grad_norm": 0.973653256893158, "learning_rate": 7.796274660967496e-06, "loss": 0.5156, "num_input_tokens_seen": 59834008, "step": 103050 }, { "epoch": 15.34927018170986, "grad_norm": 1.449717402458191, "learning_rate": 7.793917125561027e-06, "loss": 0.5697, "num_input_tokens_seen": 59836760, "step": 103055 }, { "epoch": 15.350014894250819, "grad_norm": 0.9612836837768555, "learning_rate": 7.79155988083384e-06, "loss": 0.6701, "num_input_tokens_seen": 59839448, "step": 103060 }, { "epoch": 15.350759606791778, "grad_norm": 1.6654210090637207, "learning_rate": 7.78920292682575e-06, "loss": 0.5577, "num_input_tokens_seen": 59842424, "step": 103065 }, { "epoch": 15.351504319332738, "grad_norm": 1.9835054874420166, "learning_rate": 7.786846263576594e-06, "loss": 0.6985, "num_input_tokens_seen": 59845304, "step": 103070 }, { "epoch": 15.352249031873697, "grad_norm": 3.9121129512786865, "learning_rate": 7.784489891126167e-06, "loss": 0.5913, "num_input_tokens_seen": 59848344, "step": 103075 }, { "epoch": 15.352993744414656, "grad_norm": 1.8696281909942627, "learning_rate": 7.782133809514297e-06, "loss": 0.8732, "num_input_tokens_seen": 59851224, "step": 103080 }, { "epoch": 15.353738456955615, "grad_norm": 2.1173036098480225, "learning_rate": 7.779778018780765e-06, "loss": 0.6674, "num_input_tokens_seen": 59854264, "step": 103085 }, { "epoch": 15.354483169496575, "grad_norm": 1.9410983324050903, "learning_rate": 7.77742251896539e-06, "loss": 0.7122, "num_input_tokens_seen": 59857240, "step": 103090 }, { "epoch": 15.355227882037534, "grad_norm": 1.2571009397506714, "learning_rate": 7.775067310107953e-06, "loss": 0.688, "num_input_tokens_seen": 59860504, "step": 103095 }, { "epoch": 15.355972594578493, "grad_norm": 1.230233907699585, "learning_rate": 7.772712392248251e-06, "loss": 0.5073, "num_input_tokens_seen": 59863480, "step": 103100 }, { "epoch": 15.356717307119451, "grad_norm": 0.7653052806854248, "learning_rate": 7.770357765426068e-06, "loss": 0.5834, "num_input_tokens_seen": 59866456, "step": 103105 }, { "epoch": 15.357462019660412, "grad_norm": 1.9766422510147095, "learning_rate": 7.768003429681175e-06, "loss": 0.405, "num_input_tokens_seen": 59868856, "step": 103110 }, { "epoch": 15.35820673220137, "grad_norm": 1.9709892272949219, "learning_rate": 7.765649385053353e-06, "loss": 0.6013, "num_input_tokens_seen": 59871864, "step": 103115 }, { "epoch": 15.35895144474233, "grad_norm": 0.9716086983680725, "learning_rate": 7.76329563158236e-06, "loss": 0.4469, "num_input_tokens_seen": 59874712, "step": 103120 }, { "epoch": 15.359696157283288, "grad_norm": 2.000262498855591, "learning_rate": 7.760942169307975e-06, "loss": 0.6216, "num_input_tokens_seen": 59877528, "step": 103125 }, { "epoch": 15.360440869824249, "grad_norm": 1.680259108543396, "learning_rate": 7.758588998269944e-06, "loss": 0.6142, "num_input_tokens_seen": 59880888, "step": 103130 }, { "epoch": 15.361185582365207, "grad_norm": 2.5451767444610596, "learning_rate": 7.756236118508036e-06, "loss": 0.7829, "num_input_tokens_seen": 59883672, "step": 103135 }, { "epoch": 15.361930294906166, "grad_norm": 1.4695695638656616, "learning_rate": 7.753883530061987e-06, "loss": 0.4984, "num_input_tokens_seen": 59886680, "step": 103140 }, { "epoch": 15.362675007447125, "grad_norm": 1.0723555088043213, "learning_rate": 7.751531232971554e-06, "loss": 0.5844, "num_input_tokens_seen": 59889912, "step": 103145 }, { "epoch": 15.363419719988084, "grad_norm": 2.2001774311065674, "learning_rate": 7.749179227276471e-06, "loss": 0.7024, "num_input_tokens_seen": 59892952, "step": 103150 }, { "epoch": 15.364164432529044, "grad_norm": 1.5114901065826416, "learning_rate": 7.746827513016464e-06, "loss": 0.4959, "num_input_tokens_seen": 59896056, "step": 103155 }, { "epoch": 15.364909145070003, "grad_norm": 1.3606864213943481, "learning_rate": 7.744476090231275e-06, "loss": 0.4276, "num_input_tokens_seen": 59898616, "step": 103160 }, { "epoch": 15.365653857610962, "grad_norm": 1.0396292209625244, "learning_rate": 7.742124958960622e-06, "loss": 0.5727, "num_input_tokens_seen": 59901912, "step": 103165 }, { "epoch": 15.36639857015192, "grad_norm": 0.7367584705352783, "learning_rate": 7.739774119244233e-06, "loss": 0.5057, "num_input_tokens_seen": 59904952, "step": 103170 }, { "epoch": 15.367143282692881, "grad_norm": 1.5927921533584595, "learning_rate": 7.737423571121818e-06, "loss": 0.3777, "num_input_tokens_seen": 59907704, "step": 103175 }, { "epoch": 15.36788799523384, "grad_norm": 1.8509658575057983, "learning_rate": 7.73507331463309e-06, "loss": 0.5091, "num_input_tokens_seen": 59910328, "step": 103180 }, { "epoch": 15.368632707774799, "grad_norm": 2.291271448135376, "learning_rate": 7.732723349817747e-06, "loss": 0.7046, "num_input_tokens_seen": 59913656, "step": 103185 }, { "epoch": 15.369377420315757, "grad_norm": 1.9213546514511108, "learning_rate": 7.730373676715488e-06, "loss": 0.5744, "num_input_tokens_seen": 59916600, "step": 103190 }, { "epoch": 15.370122132856718, "grad_norm": 2.5136566162109375, "learning_rate": 7.728024295366018e-06, "loss": 0.5968, "num_input_tokens_seen": 59919416, "step": 103195 }, { "epoch": 15.370866845397677, "grad_norm": 0.8612159490585327, "learning_rate": 7.725675205809019e-06, "loss": 0.4942, "num_input_tokens_seen": 59922424, "step": 103200 }, { "epoch": 15.371611557938635, "grad_norm": 0.8041795492172241, "learning_rate": 7.723326408084186e-06, "loss": 0.5329, "num_input_tokens_seen": 59925496, "step": 103205 }, { "epoch": 15.372356270479594, "grad_norm": 0.9410576820373535, "learning_rate": 7.720977902231189e-06, "loss": 0.5346, "num_input_tokens_seen": 59928152, "step": 103210 }, { "epoch": 15.373100983020555, "grad_norm": 1.8948040008544922, "learning_rate": 7.718629688289713e-06, "loss": 0.7105, "num_input_tokens_seen": 59931064, "step": 103215 }, { "epoch": 15.373845695561513, "grad_norm": 1.958000659942627, "learning_rate": 7.716281766299419e-06, "loss": 0.7673, "num_input_tokens_seen": 59934040, "step": 103220 }, { "epoch": 15.374590408102472, "grad_norm": 2.276855707168579, "learning_rate": 7.713934136299985e-06, "loss": 0.6328, "num_input_tokens_seen": 59936600, "step": 103225 }, { "epoch": 15.375335120643431, "grad_norm": 1.5380775928497314, "learning_rate": 7.711586798331066e-06, "loss": 0.6028, "num_input_tokens_seen": 59939320, "step": 103230 }, { "epoch": 15.376079833184392, "grad_norm": 1.642630696296692, "learning_rate": 7.70923975243231e-06, "loss": 0.6385, "num_input_tokens_seen": 59942232, "step": 103235 }, { "epoch": 15.37682454572535, "grad_norm": 1.5925458669662476, "learning_rate": 7.70689299864338e-06, "loss": 0.5737, "num_input_tokens_seen": 59945304, "step": 103240 }, { "epoch": 15.377569258266309, "grad_norm": 1.2115720510482788, "learning_rate": 7.704546537003918e-06, "loss": 0.5817, "num_input_tokens_seen": 59948344, "step": 103245 }, { "epoch": 15.378313970807268, "grad_norm": 1.6590003967285156, "learning_rate": 7.702200367553563e-06, "loss": 0.6417, "num_input_tokens_seen": 59951256, "step": 103250 }, { "epoch": 15.379058683348228, "grad_norm": 1.853133201599121, "learning_rate": 7.699854490331948e-06, "loss": 0.6138, "num_input_tokens_seen": 59954200, "step": 103255 }, { "epoch": 15.379803395889187, "grad_norm": 1.7833950519561768, "learning_rate": 7.697508905378712e-06, "loss": 0.7553, "num_input_tokens_seen": 59957048, "step": 103260 }, { "epoch": 15.380548108430146, "grad_norm": 1.5209808349609375, "learning_rate": 7.69516361273348e-06, "loss": 0.5034, "num_input_tokens_seen": 59959608, "step": 103265 }, { "epoch": 15.381292820971105, "grad_norm": 2.472781181335449, "learning_rate": 7.692818612435862e-06, "loss": 0.6725, "num_input_tokens_seen": 59962328, "step": 103270 }, { "epoch": 15.382037533512065, "grad_norm": 1.1797356605529785, "learning_rate": 7.690473904525491e-06, "loss": 0.5453, "num_input_tokens_seen": 59965560, "step": 103275 }, { "epoch": 15.382782246053024, "grad_norm": 1.5838603973388672, "learning_rate": 7.688129489041963e-06, "loss": 0.6026, "num_input_tokens_seen": 59968280, "step": 103280 }, { "epoch": 15.383526958593983, "grad_norm": 1.561077356338501, "learning_rate": 7.685785366024901e-06, "loss": 0.6997, "num_input_tokens_seen": 59971256, "step": 103285 }, { "epoch": 15.384271671134941, "grad_norm": 1.0926178693771362, "learning_rate": 7.683441535513888e-06, "loss": 0.5958, "num_input_tokens_seen": 59974168, "step": 103290 }, { "epoch": 15.3850163836759, "grad_norm": 1.4654039144515991, "learning_rate": 7.681097997548539e-06, "loss": 0.6235, "num_input_tokens_seen": 59977400, "step": 103295 }, { "epoch": 15.38576109621686, "grad_norm": 1.9865641593933105, "learning_rate": 7.678754752168438e-06, "loss": 0.4709, "num_input_tokens_seen": 59979960, "step": 103300 }, { "epoch": 15.38650580875782, "grad_norm": 1.8799947500228882, "learning_rate": 7.676411799413163e-06, "loss": 0.3985, "num_input_tokens_seen": 59982936, "step": 103305 }, { "epoch": 15.387250521298778, "grad_norm": 1.5492146015167236, "learning_rate": 7.674069139322312e-06, "loss": 0.6039, "num_input_tokens_seen": 59985592, "step": 103310 }, { "epoch": 15.387995233839739, "grad_norm": 1.5199003219604492, "learning_rate": 7.671726771935453e-06, "loss": 0.6446, "num_input_tokens_seen": 59988568, "step": 103315 }, { "epoch": 15.388739946380698, "grad_norm": 1.6748931407928467, "learning_rate": 7.669384697292158e-06, "loss": 0.5123, "num_input_tokens_seen": 59991352, "step": 103320 }, { "epoch": 15.389484658921656, "grad_norm": 2.508058547973633, "learning_rate": 7.667042915431987e-06, "loss": 0.3871, "num_input_tokens_seen": 59994360, "step": 103325 }, { "epoch": 15.390229371462615, "grad_norm": 0.8955451250076294, "learning_rate": 7.66470142639452e-06, "loss": 0.3731, "num_input_tokens_seen": 59997528, "step": 103330 }, { "epoch": 15.390974084003574, "grad_norm": 1.9129034280776978, "learning_rate": 7.662360230219293e-06, "loss": 0.4954, "num_input_tokens_seen": 60000248, "step": 103335 }, { "epoch": 15.391718796544534, "grad_norm": 1.046798825263977, "learning_rate": 7.660019326945874e-06, "loss": 0.4557, "num_input_tokens_seen": 60003128, "step": 103340 }, { "epoch": 15.392463509085493, "grad_norm": 1.5240660905838013, "learning_rate": 7.657678716613808e-06, "loss": 0.764, "num_input_tokens_seen": 60005784, "step": 103345 }, { "epoch": 15.393208221626452, "grad_norm": 2.950634479522705, "learning_rate": 7.655338399262627e-06, "loss": 0.6269, "num_input_tokens_seen": 60008952, "step": 103350 }, { "epoch": 15.39395293416741, "grad_norm": 0.7394258379936218, "learning_rate": 7.652998374931882e-06, "loss": 0.4335, "num_input_tokens_seen": 60011768, "step": 103355 }, { "epoch": 15.394697646708371, "grad_norm": 1.5887285470962524, "learning_rate": 7.65065864366109e-06, "loss": 0.5591, "num_input_tokens_seen": 60014680, "step": 103360 }, { "epoch": 15.39544235924933, "grad_norm": 1.1396011114120483, "learning_rate": 7.648319205489798e-06, "loss": 0.6693, "num_input_tokens_seen": 60017624, "step": 103365 }, { "epoch": 15.396187071790289, "grad_norm": 1.0734891891479492, "learning_rate": 7.645980060457517e-06, "loss": 0.5171, "num_input_tokens_seen": 60020600, "step": 103370 }, { "epoch": 15.396931784331247, "grad_norm": 1.0759451389312744, "learning_rate": 7.643641208603764e-06, "loss": 0.4754, "num_input_tokens_seen": 60023480, "step": 103375 }, { "epoch": 15.397676496872208, "grad_norm": 1.062624216079712, "learning_rate": 7.641302649968043e-06, "loss": 0.558, "num_input_tokens_seen": 60026520, "step": 103380 }, { "epoch": 15.398421209413167, "grad_norm": 2.2721848487854004, "learning_rate": 7.638964384589881e-06, "loss": 0.6057, "num_input_tokens_seen": 60029400, "step": 103385 }, { "epoch": 15.399165921954125, "grad_norm": 2.660992383956909, "learning_rate": 7.63662641250877e-06, "loss": 0.7448, "num_input_tokens_seen": 60032216, "step": 103390 }, { "epoch": 15.399910634495084, "grad_norm": 0.9981499910354614, "learning_rate": 7.6342887337642e-06, "loss": 0.7107, "num_input_tokens_seen": 60035416, "step": 103395 }, { "epoch": 15.400655347036045, "grad_norm": 1.6566659212112427, "learning_rate": 7.631951348395683e-06, "loss": 0.8033, "num_input_tokens_seen": 60038456, "step": 103400 }, { "epoch": 15.401400059577004, "grad_norm": 1.4442064762115479, "learning_rate": 7.629614256442685e-06, "loss": 0.6261, "num_input_tokens_seen": 60041496, "step": 103405 }, { "epoch": 15.402144772117962, "grad_norm": 2.4863433837890625, "learning_rate": 7.6272774579447065e-06, "loss": 0.628, "num_input_tokens_seen": 60044472, "step": 103410 }, { "epoch": 15.402889484658921, "grad_norm": 2.3486437797546387, "learning_rate": 7.6249409529412145e-06, "loss": 0.6519, "num_input_tokens_seen": 60047224, "step": 103415 }, { "epoch": 15.403634197199882, "grad_norm": 1.4770151376724243, "learning_rate": 7.622604741471692e-06, "loss": 0.5422, "num_input_tokens_seen": 60050200, "step": 103420 }, { "epoch": 15.40437890974084, "grad_norm": 1.6776832342147827, "learning_rate": 7.620268823575599e-06, "loss": 0.5982, "num_input_tokens_seen": 60053304, "step": 103425 }, { "epoch": 15.405123622281799, "grad_norm": 1.9012712240219116, "learning_rate": 7.617933199292396e-06, "loss": 0.6787, "num_input_tokens_seen": 60056280, "step": 103430 }, { "epoch": 15.405868334822758, "grad_norm": 2.0620265007019043, "learning_rate": 7.61559786866155e-06, "loss": 0.4322, "num_input_tokens_seen": 60059224, "step": 103435 }, { "epoch": 15.406613047363718, "grad_norm": 1.2857131958007812, "learning_rate": 7.613262831722509e-06, "loss": 0.6762, "num_input_tokens_seen": 60062072, "step": 103440 }, { "epoch": 15.407357759904677, "grad_norm": 1.652251958847046, "learning_rate": 7.610928088514724e-06, "loss": 0.3936, "num_input_tokens_seen": 60064696, "step": 103445 }, { "epoch": 15.408102472445636, "grad_norm": 1.6554864645004272, "learning_rate": 7.608593639077627e-06, "loss": 0.5666, "num_input_tokens_seen": 60067736, "step": 103450 }, { "epoch": 15.408847184986595, "grad_norm": 0.9221864342689514, "learning_rate": 7.60625948345067e-06, "loss": 0.6742, "num_input_tokens_seen": 60070392, "step": 103455 }, { "epoch": 15.409591897527555, "grad_norm": 1.8883512020111084, "learning_rate": 7.603925621673275e-06, "loss": 0.806, "num_input_tokens_seen": 60073080, "step": 103460 }, { "epoch": 15.410336610068514, "grad_norm": 1.57646906375885, "learning_rate": 7.601592053784884e-06, "loss": 0.6793, "num_input_tokens_seen": 60076120, "step": 103465 }, { "epoch": 15.411081322609473, "grad_norm": 2.4847798347473145, "learning_rate": 7.599258779824911e-06, "loss": 0.5904, "num_input_tokens_seen": 60079256, "step": 103470 }, { "epoch": 15.411826035150431, "grad_norm": 1.4731500148773193, "learning_rate": 7.596925799832769e-06, "loss": 0.8373, "num_input_tokens_seen": 60082072, "step": 103475 }, { "epoch": 15.41257074769139, "grad_norm": 1.245002031326294, "learning_rate": 7.594593113847887e-06, "loss": 0.5497, "num_input_tokens_seen": 60084920, "step": 103480 }, { "epoch": 15.41331546023235, "grad_norm": 1.4056742191314697, "learning_rate": 7.592260721909655e-06, "loss": 0.6985, "num_input_tokens_seen": 60087448, "step": 103485 }, { "epoch": 15.41406017277331, "grad_norm": 1.15715491771698, "learning_rate": 7.589928624057494e-06, "loss": 0.5622, "num_input_tokens_seen": 60090360, "step": 103490 }, { "epoch": 15.414804885314268, "grad_norm": 1.6351796388626099, "learning_rate": 7.587596820330783e-06, "loss": 0.6373, "num_input_tokens_seen": 60093336, "step": 103495 }, { "epoch": 15.415549597855227, "grad_norm": 1.0965838432312012, "learning_rate": 7.585265310768938e-06, "loss": 0.4875, "num_input_tokens_seen": 60096152, "step": 103500 }, { "epoch": 15.416294310396188, "grad_norm": 2.7608067989349365, "learning_rate": 7.582934095411337e-06, "loss": 0.6593, "num_input_tokens_seen": 60098840, "step": 103505 }, { "epoch": 15.417039022937146, "grad_norm": 1.8233453035354614, "learning_rate": 7.58060317429736e-06, "loss": 0.5641, "num_input_tokens_seen": 60101752, "step": 103510 }, { "epoch": 15.417783735478105, "grad_norm": 1.4923756122589111, "learning_rate": 7.5782725474663894e-06, "loss": 0.6052, "num_input_tokens_seen": 60104536, "step": 103515 }, { "epoch": 15.418528448019064, "grad_norm": 1.5594072341918945, "learning_rate": 7.575942214957787e-06, "loss": 0.6769, "num_input_tokens_seen": 60107576, "step": 103520 }, { "epoch": 15.419273160560024, "grad_norm": 2.0798511505126953, "learning_rate": 7.573612176810943e-06, "loss": 0.708, "num_input_tokens_seen": 60110776, "step": 103525 }, { "epoch": 15.420017873100983, "grad_norm": 0.9848253726959229, "learning_rate": 7.5712824330651995e-06, "loss": 0.6294, "num_input_tokens_seen": 60113592, "step": 103530 }, { "epoch": 15.420762585641942, "grad_norm": 1.9482680559158325, "learning_rate": 7.568952983759936e-06, "loss": 0.4601, "num_input_tokens_seen": 60116568, "step": 103535 }, { "epoch": 15.4215072981829, "grad_norm": 1.3547343015670776, "learning_rate": 7.566623828934485e-06, "loss": 0.5639, "num_input_tokens_seen": 60119320, "step": 103540 }, { "epoch": 15.422252010723861, "grad_norm": 2.6026523113250732, "learning_rate": 7.5642949686282165e-06, "loss": 0.6136, "num_input_tokens_seen": 60122168, "step": 103545 }, { "epoch": 15.42299672326482, "grad_norm": 2.3863420486450195, "learning_rate": 7.561966402880461e-06, "loss": 0.6162, "num_input_tokens_seen": 60125240, "step": 103550 }, { "epoch": 15.423741435805779, "grad_norm": 1.1452388763427734, "learning_rate": 7.559638131730554e-06, "loss": 0.5849, "num_input_tokens_seen": 60128312, "step": 103555 }, { "epoch": 15.424486148346737, "grad_norm": 1.6975417137145996, "learning_rate": 7.557310155217842e-06, "loss": 0.8249, "num_input_tokens_seen": 60131160, "step": 103560 }, { "epoch": 15.425230860887698, "grad_norm": 2.3889613151550293, "learning_rate": 7.554982473381639e-06, "loss": 0.4468, "num_input_tokens_seen": 60133656, "step": 103565 }, { "epoch": 15.425975573428657, "grad_norm": 0.9744742512702942, "learning_rate": 7.552655086261287e-06, "loss": 0.7356, "num_input_tokens_seen": 60136632, "step": 103570 }, { "epoch": 15.426720285969616, "grad_norm": 1.9686775207519531, "learning_rate": 7.550327993896092e-06, "loss": 0.5518, "num_input_tokens_seen": 60139320, "step": 103575 }, { "epoch": 15.427464998510574, "grad_norm": 2.675891637802124, "learning_rate": 7.548001196325372e-06, "loss": 0.5469, "num_input_tokens_seen": 60142200, "step": 103580 }, { "epoch": 15.428209711051535, "grad_norm": 1.3077223300933838, "learning_rate": 7.545674693588434e-06, "loss": 0.8371, "num_input_tokens_seen": 60145304, "step": 103585 }, { "epoch": 15.428954423592494, "grad_norm": 1.4053971767425537, "learning_rate": 7.543348485724572e-06, "loss": 0.7528, "num_input_tokens_seen": 60148248, "step": 103590 }, { "epoch": 15.429699136133452, "grad_norm": 1.7008320093154907, "learning_rate": 7.541022572773107e-06, "loss": 0.6149, "num_input_tokens_seen": 60151352, "step": 103595 }, { "epoch": 15.430443848674411, "grad_norm": 2.680933713912964, "learning_rate": 7.538696954773311e-06, "loss": 0.6453, "num_input_tokens_seen": 60154200, "step": 103600 }, { "epoch": 15.431188561215372, "grad_norm": 1.260423183441162, "learning_rate": 7.536371631764491e-06, "loss": 0.6118, "num_input_tokens_seen": 60156824, "step": 103605 }, { "epoch": 15.43193327375633, "grad_norm": 1.006807565689087, "learning_rate": 7.534046603785916e-06, "loss": 0.6117, "num_input_tokens_seen": 60159512, "step": 103610 }, { "epoch": 15.43267798629729, "grad_norm": 1.3906099796295166, "learning_rate": 7.531721870876879e-06, "loss": 0.643, "num_input_tokens_seen": 60162456, "step": 103615 }, { "epoch": 15.433422698838248, "grad_norm": 2.3384811878204346, "learning_rate": 7.529397433076638e-06, "loss": 0.5386, "num_input_tokens_seen": 60165144, "step": 103620 }, { "epoch": 15.434167411379208, "grad_norm": 1.8063998222351074, "learning_rate": 7.52707329042448e-06, "loss": 0.5548, "num_input_tokens_seen": 60168120, "step": 103625 }, { "epoch": 15.434912123920167, "grad_norm": 1.4138188362121582, "learning_rate": 7.524749442959661e-06, "loss": 0.5939, "num_input_tokens_seen": 60170680, "step": 103630 }, { "epoch": 15.435656836461126, "grad_norm": 1.5090749263763428, "learning_rate": 7.522425890721432e-06, "loss": 0.6487, "num_input_tokens_seen": 60173816, "step": 103635 }, { "epoch": 15.436401549002085, "grad_norm": 2.7176098823547363, "learning_rate": 7.52010263374906e-06, "loss": 0.9731, "num_input_tokens_seen": 60176728, "step": 103640 }, { "epoch": 15.437146261543045, "grad_norm": 1.7958251237869263, "learning_rate": 7.51777967208179e-06, "loss": 0.6445, "num_input_tokens_seen": 60179608, "step": 103645 }, { "epoch": 15.437890974084004, "grad_norm": 1.1609073877334595, "learning_rate": 7.515457005758864e-06, "loss": 0.4274, "num_input_tokens_seen": 60182808, "step": 103650 }, { "epoch": 15.438635686624963, "grad_norm": 1.51478111743927, "learning_rate": 7.5131346348195105e-06, "loss": 0.7284, "num_input_tokens_seen": 60186008, "step": 103655 }, { "epoch": 15.439380399165922, "grad_norm": 2.366051197052002, "learning_rate": 7.510812559302985e-06, "loss": 0.6336, "num_input_tokens_seen": 60188856, "step": 103660 }, { "epoch": 15.44012511170688, "grad_norm": 1.4134163856506348, "learning_rate": 7.508490779248506e-06, "loss": 0.6887, "num_input_tokens_seen": 60191864, "step": 103665 }, { "epoch": 15.44086982424784, "grad_norm": 1.175019383430481, "learning_rate": 7.5061692946952896e-06, "loss": 0.5366, "num_input_tokens_seen": 60194584, "step": 103670 }, { "epoch": 15.4416145367888, "grad_norm": 1.368337631225586, "learning_rate": 7.503848105682571e-06, "loss": 0.5644, "num_input_tokens_seen": 60197624, "step": 103675 }, { "epoch": 15.442359249329758, "grad_norm": 1.678591012954712, "learning_rate": 7.501527212249549e-06, "loss": 0.6595, "num_input_tokens_seen": 60200632, "step": 103680 }, { "epoch": 15.443103961870717, "grad_norm": 1.7763664722442627, "learning_rate": 7.4992066144354475e-06, "loss": 0.7367, "num_input_tokens_seen": 60203608, "step": 103685 }, { "epoch": 15.443848674411678, "grad_norm": 1.352327585220337, "learning_rate": 7.496886312279455e-06, "loss": 0.6776, "num_input_tokens_seen": 60206264, "step": 103690 }, { "epoch": 15.444593386952636, "grad_norm": 0.8741292953491211, "learning_rate": 7.494566305820788e-06, "loss": 0.6607, "num_input_tokens_seen": 60209176, "step": 103695 }, { "epoch": 15.445338099493595, "grad_norm": 1.2232534885406494, "learning_rate": 7.492246595098629e-06, "loss": 0.7758, "num_input_tokens_seen": 60212056, "step": 103700 }, { "epoch": 15.446082812034554, "grad_norm": 2.602034568786621, "learning_rate": 7.489927180152173e-06, "loss": 0.6808, "num_input_tokens_seen": 60214968, "step": 103705 }, { "epoch": 15.446827524575514, "grad_norm": 1.7545615434646606, "learning_rate": 7.487608061020599e-06, "loss": 0.6374, "num_input_tokens_seen": 60217880, "step": 103710 }, { "epoch": 15.447572237116473, "grad_norm": 1.5576306581497192, "learning_rate": 7.485289237743079e-06, "loss": 0.6202, "num_input_tokens_seen": 60221112, "step": 103715 }, { "epoch": 15.448316949657432, "grad_norm": 1.3734240531921387, "learning_rate": 7.482970710358806e-06, "loss": 0.52, "num_input_tokens_seen": 60223896, "step": 103720 }, { "epoch": 15.44906166219839, "grad_norm": 1.0311726331710815, "learning_rate": 7.48065247890693e-06, "loss": 0.5551, "num_input_tokens_seen": 60226616, "step": 103725 }, { "epoch": 15.449806374739351, "grad_norm": 0.9643334746360779, "learning_rate": 7.478334543426632e-06, "loss": 0.6268, "num_input_tokens_seen": 60229464, "step": 103730 }, { "epoch": 15.45055108728031, "grad_norm": 1.2226989269256592, "learning_rate": 7.476016903957058e-06, "loss": 0.4768, "num_input_tokens_seen": 60232536, "step": 103735 }, { "epoch": 15.451295799821269, "grad_norm": 1.2969450950622559, "learning_rate": 7.473699560537376e-06, "loss": 0.6772, "num_input_tokens_seen": 60235512, "step": 103740 }, { "epoch": 15.452040512362228, "grad_norm": 0.9386035203933716, "learning_rate": 7.471382513206718e-06, "loss": 0.5695, "num_input_tokens_seen": 60238328, "step": 103745 }, { "epoch": 15.452785224903188, "grad_norm": 1.9519362449645996, "learning_rate": 7.469065762004243e-06, "loss": 0.6075, "num_input_tokens_seen": 60240824, "step": 103750 }, { "epoch": 15.453529937444147, "grad_norm": 2.220186948776245, "learning_rate": 7.466749306969087e-06, "loss": 0.6157, "num_input_tokens_seen": 60243352, "step": 103755 }, { "epoch": 15.454274649985106, "grad_norm": 1.8879433870315552, "learning_rate": 7.464433148140371e-06, "loss": 0.5474, "num_input_tokens_seen": 60246328, "step": 103760 }, { "epoch": 15.455019362526064, "grad_norm": 1.4313549995422363, "learning_rate": 7.462117285557246e-06, "loss": 0.4225, "num_input_tokens_seen": 60249304, "step": 103765 }, { "epoch": 15.455764075067025, "grad_norm": 1.4338154792785645, "learning_rate": 7.459801719258821e-06, "loss": 0.627, "num_input_tokens_seen": 60252056, "step": 103770 }, { "epoch": 15.456508787607984, "grad_norm": 1.5774253606796265, "learning_rate": 7.457486449284221e-06, "loss": 0.5446, "num_input_tokens_seen": 60255064, "step": 103775 }, { "epoch": 15.457253500148942, "grad_norm": 1.1065804958343506, "learning_rate": 7.455171475672551e-06, "loss": 0.7221, "num_input_tokens_seen": 60257944, "step": 103780 }, { "epoch": 15.457998212689901, "grad_norm": 1.3038822412490845, "learning_rate": 7.4528567984629344e-06, "loss": 0.4624, "num_input_tokens_seen": 60260696, "step": 103785 }, { "epoch": 15.458742925230862, "grad_norm": 1.1766941547393799, "learning_rate": 7.450542417694467e-06, "loss": 0.7151, "num_input_tokens_seen": 60263800, "step": 103790 }, { "epoch": 15.45948763777182, "grad_norm": 0.870553731918335, "learning_rate": 7.448228333406241e-06, "loss": 0.5365, "num_input_tokens_seen": 60266840, "step": 103795 }, { "epoch": 15.46023235031278, "grad_norm": 0.7961137294769287, "learning_rate": 7.445914545637367e-06, "loss": 0.7977, "num_input_tokens_seen": 60270040, "step": 103800 }, { "epoch": 15.460977062853738, "grad_norm": 2.096864700317383, "learning_rate": 7.443601054426919e-06, "loss": 0.5717, "num_input_tokens_seen": 60272824, "step": 103805 }, { "epoch": 15.461721775394698, "grad_norm": 1.493090271949768, "learning_rate": 7.441287859813995e-06, "loss": 0.5614, "num_input_tokens_seen": 60275960, "step": 103810 }, { "epoch": 15.462466487935657, "grad_norm": 1.2854278087615967, "learning_rate": 7.438974961837655e-06, "loss": 0.4492, "num_input_tokens_seen": 60278456, "step": 103815 }, { "epoch": 15.463211200476616, "grad_norm": 1.3591327667236328, "learning_rate": 7.436662360536997e-06, "loss": 0.5702, "num_input_tokens_seen": 60281464, "step": 103820 }, { "epoch": 15.463955913017575, "grad_norm": 1.0392401218414307, "learning_rate": 7.43435005595107e-06, "loss": 0.6255, "num_input_tokens_seen": 60284440, "step": 103825 }, { "epoch": 15.464700625558535, "grad_norm": 1.1287405490875244, "learning_rate": 7.432038048118953e-06, "loss": 0.459, "num_input_tokens_seen": 60287256, "step": 103830 }, { "epoch": 15.465445338099494, "grad_norm": 1.5469971895217896, "learning_rate": 7.429726337079695e-06, "loss": 0.5858, "num_input_tokens_seen": 60289976, "step": 103835 }, { "epoch": 15.466190050640453, "grad_norm": 1.8786373138427734, "learning_rate": 7.427414922872356e-06, "loss": 0.5984, "num_input_tokens_seen": 60292920, "step": 103840 }, { "epoch": 15.466934763181412, "grad_norm": 1.2288000583648682, "learning_rate": 7.4251038055359825e-06, "loss": 0.5728, "num_input_tokens_seen": 60296184, "step": 103845 }, { "epoch": 15.46767947572237, "grad_norm": 2.096466064453125, "learning_rate": 7.422792985109608e-06, "loss": 0.5952, "num_input_tokens_seen": 60299128, "step": 103850 }, { "epoch": 15.46842418826333, "grad_norm": 2.642749786376953, "learning_rate": 7.420482461632289e-06, "loss": 0.6536, "num_input_tokens_seen": 60301944, "step": 103855 }, { "epoch": 15.46916890080429, "grad_norm": 0.9963042140007019, "learning_rate": 7.418172235143045e-06, "loss": 0.7092, "num_input_tokens_seen": 60304600, "step": 103860 }, { "epoch": 15.469913613345248, "grad_norm": 1.146174430847168, "learning_rate": 7.415862305680921e-06, "loss": 0.539, "num_input_tokens_seen": 60307320, "step": 103865 }, { "epoch": 15.470658325886207, "grad_norm": 1.9219026565551758, "learning_rate": 7.413552673284929e-06, "loss": 0.7158, "num_input_tokens_seen": 60309976, "step": 103870 }, { "epoch": 15.471403038427168, "grad_norm": 1.609778881072998, "learning_rate": 7.411243337994084e-06, "loss": 0.6118, "num_input_tokens_seen": 60312824, "step": 103875 }, { "epoch": 15.472147750968126, "grad_norm": 1.4095345735549927, "learning_rate": 7.4089342998474145e-06, "loss": 0.5772, "num_input_tokens_seen": 60315480, "step": 103880 }, { "epoch": 15.472892463509085, "grad_norm": 0.6842436194419861, "learning_rate": 7.406625558883912e-06, "loss": 0.817, "num_input_tokens_seen": 60318456, "step": 103885 }, { "epoch": 15.473637176050044, "grad_norm": 1.8207483291625977, "learning_rate": 7.404317115142598e-06, "loss": 0.3557, "num_input_tokens_seen": 60321144, "step": 103890 }, { "epoch": 15.474381888591004, "grad_norm": 1.7116910219192505, "learning_rate": 7.402008968662455e-06, "loss": 0.6083, "num_input_tokens_seen": 60324312, "step": 103895 }, { "epoch": 15.475126601131963, "grad_norm": 1.731688380241394, "learning_rate": 7.399701119482494e-06, "loss": 0.6354, "num_input_tokens_seen": 60327160, "step": 103900 }, { "epoch": 15.475871313672922, "grad_norm": 1.279152750968933, "learning_rate": 7.397393567641694e-06, "loss": 0.6182, "num_input_tokens_seen": 60329848, "step": 103905 }, { "epoch": 15.47661602621388, "grad_norm": 1.7890921831130981, "learning_rate": 7.395086313179037e-06, "loss": 0.6883, "num_input_tokens_seen": 60332632, "step": 103910 }, { "epoch": 15.477360738754841, "grad_norm": 1.5225001573562622, "learning_rate": 7.392779356133506e-06, "loss": 0.4998, "num_input_tokens_seen": 60335544, "step": 103915 }, { "epoch": 15.4781054512958, "grad_norm": 1.0606523752212524, "learning_rate": 7.390472696544065e-06, "loss": 0.5593, "num_input_tokens_seen": 60338392, "step": 103920 }, { "epoch": 15.478850163836759, "grad_norm": 1.1963778734207153, "learning_rate": 7.388166334449697e-06, "loss": 0.5729, "num_input_tokens_seen": 60341048, "step": 103925 }, { "epoch": 15.479594876377718, "grad_norm": 2.649277687072754, "learning_rate": 7.3858602698893495e-06, "loss": 0.6359, "num_input_tokens_seen": 60343896, "step": 103930 }, { "epoch": 15.480339588918678, "grad_norm": 1.7879905700683594, "learning_rate": 7.383554502902001e-06, "loss": 0.6377, "num_input_tokens_seen": 60346744, "step": 103935 }, { "epoch": 15.481084301459637, "grad_norm": 1.4806387424468994, "learning_rate": 7.381249033526585e-06, "loss": 0.8024, "num_input_tokens_seen": 60349784, "step": 103940 }, { "epoch": 15.481829014000596, "grad_norm": 1.3368618488311768, "learning_rate": 7.37894386180207e-06, "loss": 0.4749, "num_input_tokens_seen": 60352728, "step": 103945 }, { "epoch": 15.482573726541554, "grad_norm": 2.327613115310669, "learning_rate": 7.376638987767387e-06, "loss": 0.6707, "num_input_tokens_seen": 60355672, "step": 103950 }, { "epoch": 15.483318439082515, "grad_norm": 1.047951102256775, "learning_rate": 7.37433441146147e-06, "loss": 0.6114, "num_input_tokens_seen": 60358552, "step": 103955 }, { "epoch": 15.484063151623474, "grad_norm": 0.8291263580322266, "learning_rate": 7.372030132923266e-06, "loss": 0.5368, "num_input_tokens_seen": 60361656, "step": 103960 }, { "epoch": 15.484807864164432, "grad_norm": 1.2477853298187256, "learning_rate": 7.369726152191692e-06, "loss": 0.4954, "num_input_tokens_seen": 60364504, "step": 103965 }, { "epoch": 15.485552576705391, "grad_norm": 1.2397019863128662, "learning_rate": 7.367422469305679e-06, "loss": 0.4735, "num_input_tokens_seen": 60367288, "step": 103970 }, { "epoch": 15.486297289246352, "grad_norm": 2.09063458442688, "learning_rate": 7.365119084304145e-06, "loss": 0.6827, "num_input_tokens_seen": 60370168, "step": 103975 }, { "epoch": 15.48704200178731, "grad_norm": 0.9640238881111145, "learning_rate": 7.362815997226e-06, "loss": 0.6057, "num_input_tokens_seen": 60373016, "step": 103980 }, { "epoch": 15.48778671432827, "grad_norm": 2.2732961177825928, "learning_rate": 7.360513208110148e-06, "loss": 0.6842, "num_input_tokens_seen": 60376088, "step": 103985 }, { "epoch": 15.488531426869228, "grad_norm": 2.029848098754883, "learning_rate": 7.3582107169955005e-06, "loss": 0.6531, "num_input_tokens_seen": 60378968, "step": 103990 }, { "epoch": 15.489276139410187, "grad_norm": 3.172356128692627, "learning_rate": 7.355908523920957e-06, "loss": 0.7515, "num_input_tokens_seen": 60381752, "step": 103995 }, { "epoch": 15.490020851951147, "grad_norm": 1.8704169988632202, "learning_rate": 7.353606628925397e-06, "loss": 0.6566, "num_input_tokens_seen": 60384600, "step": 104000 }, { "epoch": 15.490765564492106, "grad_norm": 1.5535142421722412, "learning_rate": 7.351305032047726e-06, "loss": 0.5652, "num_input_tokens_seen": 60387416, "step": 104005 }, { "epoch": 15.491510277033065, "grad_norm": 1.7354516983032227, "learning_rate": 7.349003733326809e-06, "loss": 0.5274, "num_input_tokens_seen": 60390584, "step": 104010 }, { "epoch": 15.492254989574024, "grad_norm": 1.698019027709961, "learning_rate": 7.346702732801544e-06, "loss": 0.7277, "num_input_tokens_seen": 60393624, "step": 104015 }, { "epoch": 15.492999702114984, "grad_norm": 1.8486593961715698, "learning_rate": 7.344402030510786e-06, "loss": 0.6993, "num_input_tokens_seen": 60396600, "step": 104020 }, { "epoch": 15.493744414655943, "grad_norm": 1.42878258228302, "learning_rate": 7.34210162649342e-06, "loss": 0.5357, "num_input_tokens_seen": 60399160, "step": 104025 }, { "epoch": 15.494489127196902, "grad_norm": 3.5156304836273193, "learning_rate": 7.3398015207883006e-06, "loss": 0.4255, "num_input_tokens_seen": 60401976, "step": 104030 }, { "epoch": 15.49523383973786, "grad_norm": 1.3112516403198242, "learning_rate": 7.337501713434283e-06, "loss": 0.4522, "num_input_tokens_seen": 60404760, "step": 104035 }, { "epoch": 15.495978552278821, "grad_norm": 1.0722756385803223, "learning_rate": 7.3352022044702266e-06, "loss": 0.5538, "num_input_tokens_seen": 60407896, "step": 104040 }, { "epoch": 15.49672326481978, "grad_norm": 1.671642780303955, "learning_rate": 7.332902993934965e-06, "loss": 0.7688, "num_input_tokens_seen": 60410712, "step": 104045 }, { "epoch": 15.497467977360738, "grad_norm": 1.1359885931015015, "learning_rate": 7.33060408186736e-06, "loss": 0.5854, "num_input_tokens_seen": 60413752, "step": 104050 }, { "epoch": 15.498212689901697, "grad_norm": 1.0259981155395508, "learning_rate": 7.328305468306229e-06, "loss": 0.5637, "num_input_tokens_seen": 60416568, "step": 104055 }, { "epoch": 15.498957402442658, "grad_norm": 1.7349978685379028, "learning_rate": 7.326007153290429e-06, "loss": 0.6811, "num_input_tokens_seen": 60419640, "step": 104060 }, { "epoch": 15.499702114983616, "grad_norm": 1.691891074180603, "learning_rate": 7.323709136858764e-06, "loss": 0.7061, "num_input_tokens_seen": 60422712, "step": 104065 }, { "epoch": 15.500446827524575, "grad_norm": 1.3997679948806763, "learning_rate": 7.321411419050078e-06, "loss": 0.5725, "num_input_tokens_seen": 60425688, "step": 104070 }, { "epoch": 15.501191540065534, "grad_norm": 1.3136550188064575, "learning_rate": 7.319113999903176e-06, "loss": 0.4224, "num_input_tokens_seen": 60428472, "step": 104075 }, { "epoch": 15.501936252606495, "grad_norm": 1.445168375968933, "learning_rate": 7.31681687945687e-06, "loss": 0.435, "num_input_tokens_seen": 60431640, "step": 104080 }, { "epoch": 15.502680965147453, "grad_norm": 1.1281569004058838, "learning_rate": 7.314520057749974e-06, "loss": 0.5724, "num_input_tokens_seen": 60434360, "step": 104085 }, { "epoch": 15.503425677688412, "grad_norm": 2.2210474014282227, "learning_rate": 7.312223534821281e-06, "loss": 0.6801, "num_input_tokens_seen": 60437208, "step": 104090 }, { "epoch": 15.50417039022937, "grad_norm": 1.4735685586929321, "learning_rate": 7.3099273107096e-06, "loss": 0.801, "num_input_tokens_seen": 60440184, "step": 104095 }, { "epoch": 15.504915102770331, "grad_norm": 1.8540955781936646, "learning_rate": 7.307631385453717e-06, "loss": 0.5883, "num_input_tokens_seen": 60442968, "step": 104100 }, { "epoch": 15.50565981531129, "grad_norm": 1.437998652458191, "learning_rate": 7.305335759092424e-06, "loss": 0.6916, "num_input_tokens_seen": 60446200, "step": 104105 }, { "epoch": 15.506404527852249, "grad_norm": 1.9988731145858765, "learning_rate": 7.303040431664496e-06, "loss": 0.7105, "num_input_tokens_seen": 60448920, "step": 104110 }, { "epoch": 15.507149240393208, "grad_norm": 1.9427320957183838, "learning_rate": 7.300745403208705e-06, "loss": 0.4938, "num_input_tokens_seen": 60451416, "step": 104115 }, { "epoch": 15.507893952934168, "grad_norm": 2.025059938430786, "learning_rate": 7.298450673763843e-06, "loss": 0.6357, "num_input_tokens_seen": 60454456, "step": 104120 }, { "epoch": 15.508638665475127, "grad_norm": 1.1129517555236816, "learning_rate": 7.296156243368657e-06, "loss": 0.4974, "num_input_tokens_seen": 60457368, "step": 104125 }, { "epoch": 15.509383378016086, "grad_norm": 2.1088461875915527, "learning_rate": 7.293862112061925e-06, "loss": 0.6563, "num_input_tokens_seen": 60459992, "step": 104130 }, { "epoch": 15.510128090557044, "grad_norm": 1.396593451499939, "learning_rate": 7.291568279882388e-06, "loss": 0.5057, "num_input_tokens_seen": 60462840, "step": 104135 }, { "epoch": 15.510872803098005, "grad_norm": 2.193354368209839, "learning_rate": 7.289274746868818e-06, "loss": 0.6027, "num_input_tokens_seen": 60465816, "step": 104140 }, { "epoch": 15.511617515638964, "grad_norm": 1.571214199066162, "learning_rate": 7.28698151305994e-06, "loss": 0.437, "num_input_tokens_seen": 60468568, "step": 104145 }, { "epoch": 15.512362228179922, "grad_norm": 2.647355556488037, "learning_rate": 7.284688578494514e-06, "loss": 0.7237, "num_input_tokens_seen": 60471256, "step": 104150 }, { "epoch": 15.513106940720881, "grad_norm": 2.489952564239502, "learning_rate": 7.2823959432112705e-06, "loss": 0.5541, "num_input_tokens_seen": 60474456, "step": 104155 }, { "epoch": 15.513851653261842, "grad_norm": 1.2153047323226929, "learning_rate": 7.280103607248934e-06, "loss": 0.5181, "num_input_tokens_seen": 60476952, "step": 104160 }, { "epoch": 15.5145963658028, "grad_norm": 1.6513221263885498, "learning_rate": 7.277811570646242e-06, "loss": 0.5604, "num_input_tokens_seen": 60479864, "step": 104165 }, { "epoch": 15.51534107834376, "grad_norm": 1.7339122295379639, "learning_rate": 7.275519833441915e-06, "loss": 0.4448, "num_input_tokens_seen": 60482904, "step": 104170 }, { "epoch": 15.516085790884718, "grad_norm": 1.518048644065857, "learning_rate": 7.273228395674664e-06, "loss": 0.6293, "num_input_tokens_seen": 60485496, "step": 104175 }, { "epoch": 15.516830503425677, "grad_norm": 2.1827378273010254, "learning_rate": 7.270937257383195e-06, "loss": 0.649, "num_input_tokens_seen": 60488408, "step": 104180 }, { "epoch": 15.517575215966637, "grad_norm": 1.5646830797195435, "learning_rate": 7.268646418606229e-06, "loss": 0.6763, "num_input_tokens_seen": 60491288, "step": 104185 }, { "epoch": 15.518319928507596, "grad_norm": 2.315966844558716, "learning_rate": 7.266355879382461e-06, "loss": 0.6442, "num_input_tokens_seen": 60494200, "step": 104190 }, { "epoch": 15.519064641048555, "grad_norm": 2.0812528133392334, "learning_rate": 7.2640656397505805e-06, "loss": 0.6155, "num_input_tokens_seen": 60496952, "step": 104195 }, { "epoch": 15.519809353589514, "grad_norm": 1.3737711906433105, "learning_rate": 7.26177569974929e-06, "loss": 0.6544, "num_input_tokens_seen": 60499928, "step": 104200 }, { "epoch": 15.520554066130474, "grad_norm": 2.947627305984497, "learning_rate": 7.259486059417265e-06, "loss": 0.8526, "num_input_tokens_seen": 60503128, "step": 104205 }, { "epoch": 15.521298778671433, "grad_norm": 1.647225022315979, "learning_rate": 7.2571967187932e-06, "loss": 0.5385, "num_input_tokens_seen": 60506264, "step": 104210 }, { "epoch": 15.522043491212392, "grad_norm": 2.7633063793182373, "learning_rate": 7.2549076779157565e-06, "loss": 0.6555, "num_input_tokens_seen": 60509176, "step": 104215 }, { "epoch": 15.52278820375335, "grad_norm": 1.9151610136032104, "learning_rate": 7.252618936823618e-06, "loss": 0.6989, "num_input_tokens_seen": 60511960, "step": 104220 }, { "epoch": 15.523532916294311, "grad_norm": 1.2926740646362305, "learning_rate": 7.250330495555438e-06, "loss": 0.494, "num_input_tokens_seen": 60514904, "step": 104225 }, { "epoch": 15.52427762883527, "grad_norm": 2.378143310546875, "learning_rate": 7.248042354149892e-06, "loss": 0.5749, "num_input_tokens_seen": 60517752, "step": 104230 }, { "epoch": 15.525022341376228, "grad_norm": 2.029508113861084, "learning_rate": 7.2457545126456275e-06, "loss": 0.5952, "num_input_tokens_seen": 60520824, "step": 104235 }, { "epoch": 15.525767053917187, "grad_norm": 1.318902611732483, "learning_rate": 7.243466971081297e-06, "loss": 0.5022, "num_input_tokens_seen": 60523512, "step": 104240 }, { "epoch": 15.526511766458148, "grad_norm": 1.4432320594787598, "learning_rate": 7.2411797294955455e-06, "loss": 0.6452, "num_input_tokens_seen": 60526456, "step": 104245 }, { "epoch": 15.527256478999107, "grad_norm": 1.027783751487732, "learning_rate": 7.238892787927004e-06, "loss": 0.5715, "num_input_tokens_seen": 60529048, "step": 104250 }, { "epoch": 15.528001191540065, "grad_norm": 0.9839670062065125, "learning_rate": 7.2366061464143265e-06, "loss": 0.7524, "num_input_tokens_seen": 60532024, "step": 104255 }, { "epoch": 15.528745904081024, "grad_norm": 2.1258130073547363, "learning_rate": 7.234319804996126e-06, "loss": 0.7071, "num_input_tokens_seen": 60534744, "step": 104260 }, { "epoch": 15.529490616621985, "grad_norm": 1.416965365409851, "learning_rate": 7.232033763711044e-06, "loss": 0.5269, "num_input_tokens_seen": 60537688, "step": 104265 }, { "epoch": 15.530235329162943, "grad_norm": 1.5534178018569946, "learning_rate": 7.229748022597693e-06, "loss": 0.6942, "num_input_tokens_seen": 60540600, "step": 104270 }, { "epoch": 15.530980041703902, "grad_norm": 4.696823596954346, "learning_rate": 7.22746258169468e-06, "loss": 0.7535, "num_input_tokens_seen": 60543192, "step": 104275 }, { "epoch": 15.53172475424486, "grad_norm": 1.1625264883041382, "learning_rate": 7.225177441040632e-06, "loss": 0.5558, "num_input_tokens_seen": 60545880, "step": 104280 }, { "epoch": 15.532469466785821, "grad_norm": 2.1535837650299072, "learning_rate": 7.2228926006741385e-06, "loss": 0.7934, "num_input_tokens_seen": 60548792, "step": 104285 }, { "epoch": 15.53321417932678, "grad_norm": 1.2937217950820923, "learning_rate": 7.220608060633813e-06, "loss": 0.6645, "num_input_tokens_seen": 60551864, "step": 104290 }, { "epoch": 15.533958891867739, "grad_norm": 1.1814656257629395, "learning_rate": 7.218323820958237e-06, "loss": 0.5929, "num_input_tokens_seen": 60554808, "step": 104295 }, { "epoch": 15.534703604408698, "grad_norm": 1.9421755075454712, "learning_rate": 7.2160398816860155e-06, "loss": 0.611, "num_input_tokens_seen": 60557784, "step": 104300 }, { "epoch": 15.535448316949658, "grad_norm": 1.6128615140914917, "learning_rate": 7.213756242855724e-06, "loss": 0.6296, "num_input_tokens_seen": 60560696, "step": 104305 }, { "epoch": 15.536193029490617, "grad_norm": 2.239699602127075, "learning_rate": 7.211472904505945e-06, "loss": 0.6258, "num_input_tokens_seen": 60563512, "step": 104310 }, { "epoch": 15.536937742031576, "grad_norm": 1.5276237726211548, "learning_rate": 7.20918986667525e-06, "loss": 0.5327, "num_input_tokens_seen": 60566552, "step": 104315 }, { "epoch": 15.537682454572534, "grad_norm": 1.531021237373352, "learning_rate": 7.206907129402205e-06, "loss": 0.6031, "num_input_tokens_seen": 60569464, "step": 104320 }, { "epoch": 15.538427167113493, "grad_norm": 1.425186038017273, "learning_rate": 7.204624692725387e-06, "loss": 0.4898, "num_input_tokens_seen": 60572280, "step": 104325 }, { "epoch": 15.539171879654454, "grad_norm": 1.9473706483840942, "learning_rate": 7.202342556683339e-06, "loss": 0.7523, "num_input_tokens_seen": 60575192, "step": 104330 }, { "epoch": 15.539916592195413, "grad_norm": 1.3826044797897339, "learning_rate": 7.200060721314636e-06, "loss": 0.5865, "num_input_tokens_seen": 60577912, "step": 104335 }, { "epoch": 15.540661304736371, "grad_norm": 1.4241108894348145, "learning_rate": 7.1977791866578045e-06, "loss": 0.5524, "num_input_tokens_seen": 60580664, "step": 104340 }, { "epoch": 15.541406017277332, "grad_norm": 1.1755290031433105, "learning_rate": 7.195497952751409e-06, "loss": 0.536, "num_input_tokens_seen": 60583640, "step": 104345 }, { "epoch": 15.54215072981829, "grad_norm": 1.7495462894439697, "learning_rate": 7.1932170196339745e-06, "loss": 0.6896, "num_input_tokens_seen": 60586552, "step": 104350 }, { "epoch": 15.54289544235925, "grad_norm": 1.4208617210388184, "learning_rate": 7.190936387344047e-06, "loss": 0.5591, "num_input_tokens_seen": 60589272, "step": 104355 }, { "epoch": 15.543640154900208, "grad_norm": 1.4867970943450928, "learning_rate": 7.188656055920149e-06, "loss": 0.5852, "num_input_tokens_seen": 60592376, "step": 104360 }, { "epoch": 15.544384867441167, "grad_norm": 1.5595574378967285, "learning_rate": 7.186376025400804e-06, "loss": 0.6058, "num_input_tokens_seen": 60595160, "step": 104365 }, { "epoch": 15.545129579982127, "grad_norm": 1.347455620765686, "learning_rate": 7.18409629582453e-06, "loss": 0.7251, "num_input_tokens_seen": 60597912, "step": 104370 }, { "epoch": 15.545874292523086, "grad_norm": 1.5938950777053833, "learning_rate": 7.181816867229835e-06, "loss": 0.718, "num_input_tokens_seen": 60600792, "step": 104375 }, { "epoch": 15.546619005064045, "grad_norm": 1.309899926185608, "learning_rate": 7.179537739655243e-06, "loss": 0.6534, "num_input_tokens_seen": 60603864, "step": 104380 }, { "epoch": 15.547363717605004, "grad_norm": 1.9501571655273438, "learning_rate": 7.17725891313924e-06, "loss": 0.6163, "num_input_tokens_seen": 60606392, "step": 104385 }, { "epoch": 15.548108430145964, "grad_norm": 1.5829039812088013, "learning_rate": 7.17498038772034e-06, "loss": 0.6495, "num_input_tokens_seen": 60609464, "step": 104390 }, { "epoch": 15.548853142686923, "grad_norm": 1.566791296005249, "learning_rate": 7.172702163437034e-06, "loss": 0.6067, "num_input_tokens_seen": 60612376, "step": 104395 }, { "epoch": 15.549597855227882, "grad_norm": 4.611825466156006, "learning_rate": 7.170424240327794e-06, "loss": 0.6725, "num_input_tokens_seen": 60615384, "step": 104400 }, { "epoch": 15.55034256776884, "grad_norm": 1.5759057998657227, "learning_rate": 7.168146618431127e-06, "loss": 0.6567, "num_input_tokens_seen": 60618424, "step": 104405 }, { "epoch": 15.551087280309801, "grad_norm": 1.609615445137024, "learning_rate": 7.165869297785488e-06, "loss": 0.6795, "num_input_tokens_seen": 60621048, "step": 104410 }, { "epoch": 15.55183199285076, "grad_norm": 1.2594207525253296, "learning_rate": 7.163592278429371e-06, "loss": 0.6054, "num_input_tokens_seen": 60623896, "step": 104415 }, { "epoch": 15.552576705391719, "grad_norm": 2.318721294403076, "learning_rate": 7.161315560401224e-06, "loss": 0.6606, "num_input_tokens_seen": 60626616, "step": 104420 }, { "epoch": 15.553321417932677, "grad_norm": 2.454582691192627, "learning_rate": 7.159039143739532e-06, "loss": 0.6763, "num_input_tokens_seen": 60629240, "step": 104425 }, { "epoch": 15.554066130473638, "grad_norm": 1.8491137027740479, "learning_rate": 7.1567630284827384e-06, "loss": 0.5772, "num_input_tokens_seen": 60632248, "step": 104430 }, { "epoch": 15.554810843014597, "grad_norm": 0.7284321188926697, "learning_rate": 7.1544872146693e-06, "loss": 0.632, "num_input_tokens_seen": 60635192, "step": 104435 }, { "epoch": 15.555555555555555, "grad_norm": 1.401545763015747, "learning_rate": 7.1522117023376606e-06, "loss": 0.5717, "num_input_tokens_seen": 60637784, "step": 104440 }, { "epoch": 15.556300268096514, "grad_norm": 1.8584426641464233, "learning_rate": 7.149936491526258e-06, "loss": 0.5872, "num_input_tokens_seen": 60640664, "step": 104445 }, { "epoch": 15.557044980637475, "grad_norm": 1.8278135061264038, "learning_rate": 7.147661582273546e-06, "loss": 0.6748, "num_input_tokens_seen": 60643448, "step": 104450 }, { "epoch": 15.557789693178433, "grad_norm": 2.2432351112365723, "learning_rate": 7.145386974617937e-06, "loss": 0.627, "num_input_tokens_seen": 60646168, "step": 104455 }, { "epoch": 15.558534405719392, "grad_norm": 1.1146420240402222, "learning_rate": 7.143112668597876e-06, "loss": 0.5333, "num_input_tokens_seen": 60649208, "step": 104460 }, { "epoch": 15.559279118260351, "grad_norm": 1.746858835220337, "learning_rate": 7.140838664251773e-06, "loss": 0.569, "num_input_tokens_seen": 60651928, "step": 104465 }, { "epoch": 15.560023830801311, "grad_norm": 3.6536996364593506, "learning_rate": 7.138564961618055e-06, "loss": 0.7391, "num_input_tokens_seen": 60655000, "step": 104470 }, { "epoch": 15.56076854334227, "grad_norm": 1.4719740152359009, "learning_rate": 7.1362915607351285e-06, "loss": 0.6814, "num_input_tokens_seen": 60657752, "step": 104475 }, { "epoch": 15.561513255883229, "grad_norm": 1.2357254028320312, "learning_rate": 7.1340184616413926e-06, "loss": 0.5307, "num_input_tokens_seen": 60660408, "step": 104480 }, { "epoch": 15.562257968424188, "grad_norm": 1.3259960412979126, "learning_rate": 7.131745664375264e-06, "loss": 0.6691, "num_input_tokens_seen": 60663256, "step": 104485 }, { "epoch": 15.563002680965148, "grad_norm": 1.391434669494629, "learning_rate": 7.129473168975123e-06, "loss": 0.5429, "num_input_tokens_seen": 60666136, "step": 104490 }, { "epoch": 15.563747393506107, "grad_norm": 1.5221911668777466, "learning_rate": 7.127200975479381e-06, "loss": 0.4523, "num_input_tokens_seen": 60668824, "step": 104495 }, { "epoch": 15.564492106047066, "grad_norm": 1.1588900089263916, "learning_rate": 7.12492908392641e-06, "loss": 0.6307, "num_input_tokens_seen": 60671480, "step": 104500 }, { "epoch": 15.565236818588025, "grad_norm": 2.9091081619262695, "learning_rate": 7.122657494354596e-06, "loss": 0.448, "num_input_tokens_seen": 60674072, "step": 104505 }, { "epoch": 15.565981531128983, "grad_norm": 1.307065486907959, "learning_rate": 7.120386206802307e-06, "loss": 0.5683, "num_input_tokens_seen": 60677752, "step": 104510 }, { "epoch": 15.566726243669944, "grad_norm": 1.8193548917770386, "learning_rate": 7.1181152213079275e-06, "loss": 0.5057, "num_input_tokens_seen": 60680824, "step": 104515 }, { "epoch": 15.567470956210903, "grad_norm": 1.4315637350082397, "learning_rate": 7.115844537909819e-06, "loss": 0.7173, "num_input_tokens_seen": 60684408, "step": 104520 }, { "epoch": 15.568215668751861, "grad_norm": 0.9958362579345703, "learning_rate": 7.11357415664633e-06, "loss": 0.5651, "num_input_tokens_seen": 60687192, "step": 104525 }, { "epoch": 15.568960381292822, "grad_norm": 1.1549268960952759, "learning_rate": 7.111304077555836e-06, "loss": 0.669, "num_input_tokens_seen": 60690296, "step": 104530 }, { "epoch": 15.56970509383378, "grad_norm": 2.9127309322357178, "learning_rate": 7.1090343006766704e-06, "loss": 0.6445, "num_input_tokens_seen": 60693240, "step": 104535 }, { "epoch": 15.57044980637474, "grad_norm": 1.1438606977462769, "learning_rate": 7.106764826047196e-06, "loss": 0.519, "num_input_tokens_seen": 60696056, "step": 104540 }, { "epoch": 15.571194518915698, "grad_norm": 1.3546284437179565, "learning_rate": 7.104495653705734e-06, "loss": 0.6988, "num_input_tokens_seen": 60698840, "step": 104545 }, { "epoch": 15.571939231456657, "grad_norm": 1.3607336282730103, "learning_rate": 7.102226783690638e-06, "loss": 0.618, "num_input_tokens_seen": 60701464, "step": 104550 }, { "epoch": 15.572683943997617, "grad_norm": 2.7903835773468018, "learning_rate": 7.099958216040231e-06, "loss": 0.6256, "num_input_tokens_seen": 60704856, "step": 104555 }, { "epoch": 15.573428656538576, "grad_norm": 1.968706488609314, "learning_rate": 7.09768995079283e-06, "loss": 0.7053, "num_input_tokens_seen": 60707640, "step": 104560 }, { "epoch": 15.574173369079535, "grad_norm": 1.4768424034118652, "learning_rate": 7.095421987986766e-06, "loss": 0.4374, "num_input_tokens_seen": 60710264, "step": 104565 }, { "epoch": 15.574918081620494, "grad_norm": 1.5899547338485718, "learning_rate": 7.093154327660354e-06, "loss": 0.6516, "num_input_tokens_seen": 60713048, "step": 104570 }, { "epoch": 15.575662794161454, "grad_norm": 2.7318572998046875, "learning_rate": 7.090886969851898e-06, "loss": 0.5848, "num_input_tokens_seen": 60716120, "step": 104575 }, { "epoch": 15.576407506702413, "grad_norm": 1.254357933998108, "learning_rate": 7.088619914599698e-06, "loss": 0.4977, "num_input_tokens_seen": 60719032, "step": 104580 }, { "epoch": 15.577152219243372, "grad_norm": 1.596994400024414, "learning_rate": 7.086353161942066e-06, "loss": 0.4792, "num_input_tokens_seen": 60721560, "step": 104585 }, { "epoch": 15.57789693178433, "grad_norm": 2.10274338722229, "learning_rate": 7.084086711917287e-06, "loss": 0.5246, "num_input_tokens_seen": 60724408, "step": 104590 }, { "epoch": 15.578641644325291, "grad_norm": 1.7010935544967651, "learning_rate": 7.081820564563657e-06, "loss": 0.6038, "num_input_tokens_seen": 60727224, "step": 104595 }, { "epoch": 15.57938635686625, "grad_norm": 1.5778391361236572, "learning_rate": 7.0795547199194624e-06, "loss": 0.7895, "num_input_tokens_seen": 60731064, "step": 104600 }, { "epoch": 15.580131069407209, "grad_norm": 2.8705224990844727, "learning_rate": 7.077289178022967e-06, "loss": 0.6417, "num_input_tokens_seen": 60733944, "step": 104605 }, { "epoch": 15.580875781948167, "grad_norm": 0.9500035643577576, "learning_rate": 7.075023938912461e-06, "loss": 0.4514, "num_input_tokens_seen": 60736856, "step": 104610 }, { "epoch": 15.581620494489128, "grad_norm": 1.1993048191070557, "learning_rate": 7.0727590026262e-06, "loss": 0.7039, "num_input_tokens_seen": 60739928, "step": 104615 }, { "epoch": 15.582365207030087, "grad_norm": 2.708828926086426, "learning_rate": 7.070494369202465e-06, "loss": 0.7533, "num_input_tokens_seen": 60742552, "step": 104620 }, { "epoch": 15.583109919571045, "grad_norm": 0.8434017896652222, "learning_rate": 7.068230038679496e-06, "loss": 0.6091, "num_input_tokens_seen": 60745400, "step": 104625 }, { "epoch": 15.583854632112004, "grad_norm": 1.1317663192749023, "learning_rate": 7.065966011095565e-06, "loss": 0.6003, "num_input_tokens_seen": 60748536, "step": 104630 }, { "epoch": 15.584599344652965, "grad_norm": 1.5715043544769287, "learning_rate": 7.06370228648891e-06, "loss": 0.5957, "num_input_tokens_seen": 60751672, "step": 104635 }, { "epoch": 15.585344057193923, "grad_norm": 1.3210806846618652, "learning_rate": 7.061438864897774e-06, "loss": 0.6545, "num_input_tokens_seen": 60754456, "step": 104640 }, { "epoch": 15.586088769734882, "grad_norm": 1.5777714252471924, "learning_rate": 7.059175746360397e-06, "loss": 0.7174, "num_input_tokens_seen": 60757592, "step": 104645 }, { "epoch": 15.586833482275841, "grad_norm": 3.5191431045532227, "learning_rate": 7.056912930915005e-06, "loss": 0.7499, "num_input_tokens_seen": 60760216, "step": 104650 }, { "epoch": 15.587578194816802, "grad_norm": 2.7285027503967285, "learning_rate": 7.054650418599837e-06, "loss": 0.6382, "num_input_tokens_seen": 60763128, "step": 104655 }, { "epoch": 15.58832290735776, "grad_norm": 0.9170138239860535, "learning_rate": 7.052388209453106e-06, "loss": 0.478, "num_input_tokens_seen": 60765944, "step": 104660 }, { "epoch": 15.589067619898719, "grad_norm": 1.3591840267181396, "learning_rate": 7.0501263035130435e-06, "loss": 0.6178, "num_input_tokens_seen": 60768856, "step": 104665 }, { "epoch": 15.589812332439678, "grad_norm": 3.0187265872955322, "learning_rate": 7.0478647008178435e-06, "loss": 0.7657, "num_input_tokens_seen": 60771672, "step": 104670 }, { "epoch": 15.590557044980638, "grad_norm": 1.4965330362319946, "learning_rate": 7.045603401405735e-06, "loss": 0.4275, "num_input_tokens_seen": 60774648, "step": 104675 }, { "epoch": 15.591301757521597, "grad_norm": 0.8157917857170105, "learning_rate": 7.043342405314907e-06, "loss": 0.7198, "num_input_tokens_seen": 60777464, "step": 104680 }, { "epoch": 15.592046470062556, "grad_norm": 1.3239933252334595, "learning_rate": 7.04108171258355e-06, "loss": 0.4507, "num_input_tokens_seen": 60780120, "step": 104685 }, { "epoch": 15.592791182603515, "grad_norm": 2.2798759937286377, "learning_rate": 7.038821323249875e-06, "loss": 0.7105, "num_input_tokens_seen": 60782904, "step": 104690 }, { "epoch": 15.593535895144473, "grad_norm": 2.4339561462402344, "learning_rate": 7.036561237352057e-06, "loss": 0.5129, "num_input_tokens_seen": 60785976, "step": 104695 }, { "epoch": 15.594280607685434, "grad_norm": 1.411135196685791, "learning_rate": 7.03430145492828e-06, "loss": 0.4607, "num_input_tokens_seen": 60788728, "step": 104700 }, { "epoch": 15.595025320226393, "grad_norm": 1.3588950634002686, "learning_rate": 7.032041976016712e-06, "loss": 0.5722, "num_input_tokens_seen": 60791608, "step": 104705 }, { "epoch": 15.595770032767351, "grad_norm": 1.1472196578979492, "learning_rate": 7.02978280065554e-06, "loss": 0.6244, "num_input_tokens_seen": 60794584, "step": 104710 }, { "epoch": 15.59651474530831, "grad_norm": 3.395928144454956, "learning_rate": 7.027523928882926e-06, "loss": 0.5775, "num_input_tokens_seen": 60797336, "step": 104715 }, { "epoch": 15.59725945784927, "grad_norm": 4.558818817138672, "learning_rate": 7.025265360737021e-06, "loss": 0.6274, "num_input_tokens_seen": 60799992, "step": 104720 }, { "epoch": 15.59800417039023, "grad_norm": 1.573513150215149, "learning_rate": 7.023007096255996e-06, "loss": 0.6406, "num_input_tokens_seen": 60803000, "step": 104725 }, { "epoch": 15.598748882931188, "grad_norm": 5.094999313354492, "learning_rate": 7.020749135477986e-06, "loss": 0.4427, "num_input_tokens_seen": 60805752, "step": 104730 }, { "epoch": 15.599493595472147, "grad_norm": 1.2723692655563354, "learning_rate": 7.0184914784411555e-06, "loss": 0.5023, "num_input_tokens_seen": 60808760, "step": 104735 }, { "epoch": 15.600238308013108, "grad_norm": 1.5463851690292358, "learning_rate": 7.0162341251836264e-06, "loss": 0.7115, "num_input_tokens_seen": 60811928, "step": 104740 }, { "epoch": 15.600983020554066, "grad_norm": 2.370028495788574, "learning_rate": 7.013977075743553e-06, "loss": 0.6642, "num_input_tokens_seen": 60814424, "step": 104745 }, { "epoch": 15.601727733095025, "grad_norm": 1.2252970933914185, "learning_rate": 7.01172033015905e-06, "loss": 0.5955, "num_input_tokens_seen": 60817368, "step": 104750 }, { "epoch": 15.602472445635984, "grad_norm": 1.7542481422424316, "learning_rate": 7.009463888468254e-06, "loss": 0.5133, "num_input_tokens_seen": 60820088, "step": 104755 }, { "epoch": 15.603217158176944, "grad_norm": 1.3029513359069824, "learning_rate": 7.0072077507092825e-06, "loss": 0.603, "num_input_tokens_seen": 60822776, "step": 104760 }, { "epoch": 15.603961870717903, "grad_norm": 1.066854476928711, "learning_rate": 7.004951916920249e-06, "loss": 0.5501, "num_input_tokens_seen": 60825720, "step": 104765 }, { "epoch": 15.604706583258862, "grad_norm": 1.1823605298995972, "learning_rate": 7.002696387139265e-06, "loss": 0.5003, "num_input_tokens_seen": 60828568, "step": 104770 }, { "epoch": 15.60545129579982, "grad_norm": 1.3521126508712769, "learning_rate": 7.000441161404425e-06, "loss": 0.6184, "num_input_tokens_seen": 60831640, "step": 104775 }, { "epoch": 15.606196008340781, "grad_norm": 0.8985639214515686, "learning_rate": 6.998186239753846e-06, "loss": 0.4965, "num_input_tokens_seen": 60834648, "step": 104780 }, { "epoch": 15.60694072088174, "grad_norm": 2.502213716506958, "learning_rate": 6.995931622225605e-06, "loss": 0.6949, "num_input_tokens_seen": 60837784, "step": 104785 }, { "epoch": 15.607685433422699, "grad_norm": 1.7779186964035034, "learning_rate": 6.99367730885781e-06, "loss": 0.4347, "num_input_tokens_seen": 60840632, "step": 104790 }, { "epoch": 15.608430145963657, "grad_norm": 2.57075834274292, "learning_rate": 6.991423299688535e-06, "loss": 0.5711, "num_input_tokens_seen": 60843608, "step": 104795 }, { "epoch": 15.609174858504618, "grad_norm": 1.9783002138137817, "learning_rate": 6.989169594755854e-06, "loss": 0.5663, "num_input_tokens_seen": 60846328, "step": 104800 }, { "epoch": 15.609919571045577, "grad_norm": 2.2036778926849365, "learning_rate": 6.9869161940978535e-06, "loss": 0.5634, "num_input_tokens_seen": 60849336, "step": 104805 }, { "epoch": 15.610664283586535, "grad_norm": 1.8677127361297607, "learning_rate": 6.984663097752589e-06, "loss": 0.5491, "num_input_tokens_seen": 60852248, "step": 104810 }, { "epoch": 15.611408996127494, "grad_norm": 4.060658931732178, "learning_rate": 6.982410305758138e-06, "loss": 0.6652, "num_input_tokens_seen": 60855512, "step": 104815 }, { "epoch": 15.612153708668455, "grad_norm": 1.2705271244049072, "learning_rate": 6.980157818152547e-06, "loss": 0.5677, "num_input_tokens_seen": 60858648, "step": 104820 }, { "epoch": 15.612898421209414, "grad_norm": 3.0393130779266357, "learning_rate": 6.97790563497388e-06, "loss": 0.6304, "num_input_tokens_seen": 60861624, "step": 104825 }, { "epoch": 15.613643133750372, "grad_norm": 2.85685396194458, "learning_rate": 6.9756537562601835e-06, "loss": 0.8212, "num_input_tokens_seen": 60864568, "step": 104830 }, { "epoch": 15.614387846291331, "grad_norm": 1.836337685585022, "learning_rate": 6.973402182049496e-06, "loss": 0.6677, "num_input_tokens_seen": 60867512, "step": 104835 }, { "epoch": 15.615132558832292, "grad_norm": 1.6821008920669556, "learning_rate": 6.971150912379859e-06, "loss": 0.5217, "num_input_tokens_seen": 60870328, "step": 104840 }, { "epoch": 15.61587727137325, "grad_norm": 0.9961541295051575, "learning_rate": 6.968899947289295e-06, "loss": 0.5359, "num_input_tokens_seen": 60872952, "step": 104845 }, { "epoch": 15.616621983914209, "grad_norm": 1.7132630348205566, "learning_rate": 6.966649286815846e-06, "loss": 0.5878, "num_input_tokens_seen": 60875928, "step": 104850 }, { "epoch": 15.617366696455168, "grad_norm": 1.7226144075393677, "learning_rate": 6.9643989309975235e-06, "loss": 0.6137, "num_input_tokens_seen": 60878872, "step": 104855 }, { "epoch": 15.618111408996128, "grad_norm": 1.0690093040466309, "learning_rate": 6.962148879872357e-06, "loss": 0.4727, "num_input_tokens_seen": 60881848, "step": 104860 }, { "epoch": 15.618856121537087, "grad_norm": 3.4686989784240723, "learning_rate": 6.9598991334783485e-06, "loss": 0.6328, "num_input_tokens_seen": 60884824, "step": 104865 }, { "epoch": 15.619600834078046, "grad_norm": 1.4637693166732788, "learning_rate": 6.957649691853513e-06, "loss": 0.4062, "num_input_tokens_seen": 60887416, "step": 104870 }, { "epoch": 15.620345546619005, "grad_norm": 1.9564425945281982, "learning_rate": 6.955400555035849e-06, "loss": 0.5583, "num_input_tokens_seen": 60890520, "step": 104875 }, { "epoch": 15.621090259159963, "grad_norm": 1.4852826595306396, "learning_rate": 6.953151723063345e-06, "loss": 0.639, "num_input_tokens_seen": 60893304, "step": 104880 }, { "epoch": 15.621834971700924, "grad_norm": 1.5351645946502686, "learning_rate": 6.95090319597401e-06, "loss": 0.5992, "num_input_tokens_seen": 60896344, "step": 104885 }, { "epoch": 15.622579684241883, "grad_norm": 1.2213945388793945, "learning_rate": 6.94865497380581e-06, "loss": 0.3974, "num_input_tokens_seen": 60899544, "step": 104890 }, { "epoch": 15.623324396782841, "grad_norm": 1.4344780445098877, "learning_rate": 6.9464070565967486e-06, "loss": 0.9012, "num_input_tokens_seen": 60902488, "step": 104895 }, { "epoch": 15.6240691093238, "grad_norm": 1.7720305919647217, "learning_rate": 6.94415944438479e-06, "loss": 0.496, "num_input_tokens_seen": 60905400, "step": 104900 }, { "epoch": 15.62481382186476, "grad_norm": 2.655397415161133, "learning_rate": 6.941912137207907e-06, "loss": 0.4659, "num_input_tokens_seen": 60908024, "step": 104905 }, { "epoch": 15.62555853440572, "grad_norm": 1.5238142013549805, "learning_rate": 6.939665135104056e-06, "loss": 0.5225, "num_input_tokens_seen": 60910936, "step": 104910 }, { "epoch": 15.626303246946678, "grad_norm": 1.0490086078643799, "learning_rate": 6.9374184381112155e-06, "loss": 0.5867, "num_input_tokens_seen": 60913624, "step": 104915 }, { "epoch": 15.627047959487637, "grad_norm": 3.129681348800659, "learning_rate": 6.935172046267333e-06, "loss": 0.553, "num_input_tokens_seen": 60916696, "step": 104920 }, { "epoch": 15.627792672028598, "grad_norm": 1.2268810272216797, "learning_rate": 6.932925959610351e-06, "loss": 0.429, "num_input_tokens_seen": 60919384, "step": 104925 }, { "epoch": 15.628537384569556, "grad_norm": 2.4664652347564697, "learning_rate": 6.930680178178228e-06, "loss": 0.9214, "num_input_tokens_seen": 60922168, "step": 104930 }, { "epoch": 15.629282097110515, "grad_norm": 0.9681996703147888, "learning_rate": 6.928434702008893e-06, "loss": 0.4923, "num_input_tokens_seen": 60925240, "step": 104935 }, { "epoch": 15.630026809651474, "grad_norm": 1.7738571166992188, "learning_rate": 6.9261895311402925e-06, "loss": 0.5405, "num_input_tokens_seen": 60927832, "step": 104940 }, { "epoch": 15.630771522192434, "grad_norm": 1.8085545301437378, "learning_rate": 6.923944665610344e-06, "loss": 0.6089, "num_input_tokens_seen": 60931032, "step": 104945 }, { "epoch": 15.631516234733393, "grad_norm": 0.9556962847709656, "learning_rate": 6.921700105456985e-06, "loss": 0.487, "num_input_tokens_seen": 60933560, "step": 104950 }, { "epoch": 15.632260947274352, "grad_norm": 2.540464162826538, "learning_rate": 6.919455850718123e-06, "loss": 0.7067, "num_input_tokens_seen": 60936376, "step": 104955 }, { "epoch": 15.63300565981531, "grad_norm": 3.2493984699249268, "learning_rate": 6.917211901431683e-06, "loss": 0.6277, "num_input_tokens_seen": 60939192, "step": 104960 }, { "epoch": 15.633750372356271, "grad_norm": 2.4843833446502686, "learning_rate": 6.914968257635573e-06, "loss": 0.6526, "num_input_tokens_seen": 60942008, "step": 104965 }, { "epoch": 15.63449508489723, "grad_norm": 1.8541548252105713, "learning_rate": 6.912724919367691e-06, "loss": 0.5828, "num_input_tokens_seen": 60944696, "step": 104970 }, { "epoch": 15.635239797438189, "grad_norm": 2.2432281970977783, "learning_rate": 6.91048188666594e-06, "loss": 0.6396, "num_input_tokens_seen": 60947416, "step": 104975 }, { "epoch": 15.635984509979147, "grad_norm": 1.40690016746521, "learning_rate": 6.908239159568203e-06, "loss": 0.3618, "num_input_tokens_seen": 60950488, "step": 104980 }, { "epoch": 15.636729222520108, "grad_norm": 2.2349460124969482, "learning_rate": 6.9059967381123854e-06, "loss": 0.5201, "num_input_tokens_seen": 60953272, "step": 104985 }, { "epoch": 15.637473935061067, "grad_norm": 2.153351068496704, "learning_rate": 6.903754622336358e-06, "loss": 0.5795, "num_input_tokens_seen": 60955928, "step": 104990 }, { "epoch": 15.638218647602026, "grad_norm": 1.3192970752716064, "learning_rate": 6.90151281227801e-06, "loss": 0.5079, "num_input_tokens_seen": 60958840, "step": 104995 }, { "epoch": 15.638963360142984, "grad_norm": 2.564934492111206, "learning_rate": 6.899271307975208e-06, "loss": 0.5273, "num_input_tokens_seen": 60961560, "step": 105000 }, { "epoch": 15.639708072683945, "grad_norm": 1.6230548620224, "learning_rate": 6.897030109465813e-06, "loss": 0.5191, "num_input_tokens_seen": 60964312, "step": 105005 }, { "epoch": 15.640452785224904, "grad_norm": 0.9527946710586548, "learning_rate": 6.894789216787703e-06, "loss": 0.4769, "num_input_tokens_seen": 60967576, "step": 105010 }, { "epoch": 15.641197497765862, "grad_norm": 1.0401487350463867, "learning_rate": 6.892548629978721e-06, "loss": 0.4336, "num_input_tokens_seen": 60970680, "step": 105015 }, { "epoch": 15.641942210306821, "grad_norm": 1.6328166723251343, "learning_rate": 6.890308349076732e-06, "loss": 0.4042, "num_input_tokens_seen": 60973528, "step": 105020 }, { "epoch": 15.64268692284778, "grad_norm": 2.820936918258667, "learning_rate": 6.88806837411958e-06, "loss": 0.6571, "num_input_tokens_seen": 60976376, "step": 105025 }, { "epoch": 15.64343163538874, "grad_norm": 1.6431649923324585, "learning_rate": 6.885828705145103e-06, "loss": 0.6791, "num_input_tokens_seen": 60979128, "step": 105030 }, { "epoch": 15.6441763479297, "grad_norm": 1.1189733743667603, "learning_rate": 6.883589342191132e-06, "loss": 0.6494, "num_input_tokens_seen": 60981848, "step": 105035 }, { "epoch": 15.644921060470658, "grad_norm": 2.385978937149048, "learning_rate": 6.881350285295515e-06, "loss": 0.6009, "num_input_tokens_seen": 60984856, "step": 105040 }, { "epoch": 15.645665773011618, "grad_norm": 0.7879773378372192, "learning_rate": 6.879111534496069e-06, "loss": 0.5412, "num_input_tokens_seen": 60987832, "step": 105045 }, { "epoch": 15.646410485552577, "grad_norm": 1.823938250541687, "learning_rate": 6.87687308983061e-06, "loss": 0.583, "num_input_tokens_seen": 60990584, "step": 105050 }, { "epoch": 15.647155198093536, "grad_norm": 1.0680218935012817, "learning_rate": 6.874634951336967e-06, "loss": 0.4869, "num_input_tokens_seen": 60993304, "step": 105055 }, { "epoch": 15.647899910634495, "grad_norm": 2.389124631881714, "learning_rate": 6.872397119052937e-06, "loss": 0.4913, "num_input_tokens_seen": 60996504, "step": 105060 }, { "epoch": 15.648644623175453, "grad_norm": 1.2890957593917847, "learning_rate": 6.870159593016343e-06, "loss": 0.7167, "num_input_tokens_seen": 60999544, "step": 105065 }, { "epoch": 15.649389335716414, "grad_norm": 1.5398365259170532, "learning_rate": 6.867922373264968e-06, "loss": 0.4802, "num_input_tokens_seen": 61002232, "step": 105070 }, { "epoch": 15.650134048257373, "grad_norm": 1.623233437538147, "learning_rate": 6.865685459836621e-06, "loss": 0.7626, "num_input_tokens_seen": 61004888, "step": 105075 }, { "epoch": 15.650878760798332, "grad_norm": 1.5157700777053833, "learning_rate": 6.8634488527690915e-06, "loss": 0.7089, "num_input_tokens_seen": 61007672, "step": 105080 }, { "epoch": 15.65162347333929, "grad_norm": 1.3343075513839722, "learning_rate": 6.861212552100149e-06, "loss": 0.6217, "num_input_tokens_seen": 61010680, "step": 105085 }, { "epoch": 15.65236818588025, "grad_norm": 1.4692744016647339, "learning_rate": 6.858976557867594e-06, "loss": 0.5881, "num_input_tokens_seen": 61014008, "step": 105090 }, { "epoch": 15.65311289842121, "grad_norm": 0.9582772254943848, "learning_rate": 6.856740870109194e-06, "loss": 0.6048, "num_input_tokens_seen": 61016920, "step": 105095 }, { "epoch": 15.653857610962168, "grad_norm": 1.710510015487671, "learning_rate": 6.854505488862714e-06, "loss": 0.5507, "num_input_tokens_seen": 61020120, "step": 105100 }, { "epoch": 15.654602323503127, "grad_norm": 1.8090606927871704, "learning_rate": 6.852270414165915e-06, "loss": 0.6847, "num_input_tokens_seen": 61023064, "step": 105105 }, { "epoch": 15.655347036044088, "grad_norm": 2.008213996887207, "learning_rate": 6.850035646056571e-06, "loss": 0.5571, "num_input_tokens_seen": 61026104, "step": 105110 }, { "epoch": 15.656091748585046, "grad_norm": 2.021148920059204, "learning_rate": 6.847801184572422e-06, "loss": 0.5446, "num_input_tokens_seen": 61029112, "step": 105115 }, { "epoch": 15.656836461126005, "grad_norm": 2.8128368854522705, "learning_rate": 6.845567029751229e-06, "loss": 0.7417, "num_input_tokens_seen": 61031896, "step": 105120 }, { "epoch": 15.657581173666964, "grad_norm": 1.5755620002746582, "learning_rate": 6.843333181630729e-06, "loss": 0.4625, "num_input_tokens_seen": 61034872, "step": 105125 }, { "epoch": 15.658325886207924, "grad_norm": 3.1406545639038086, "learning_rate": 6.841099640248655e-06, "loss": 0.5746, "num_input_tokens_seen": 61037528, "step": 105130 }, { "epoch": 15.659070598748883, "grad_norm": 1.6966278553009033, "learning_rate": 6.838866405642752e-06, "loss": 0.5718, "num_input_tokens_seen": 61040376, "step": 105135 }, { "epoch": 15.659815311289842, "grad_norm": 2.187964916229248, "learning_rate": 6.836633477850737e-06, "loss": 0.5249, "num_input_tokens_seen": 61043128, "step": 105140 }, { "epoch": 15.6605600238308, "grad_norm": 3.2084157466888428, "learning_rate": 6.834400856910348e-06, "loss": 0.6993, "num_input_tokens_seen": 61046424, "step": 105145 }, { "epoch": 15.661304736371761, "grad_norm": 0.9709337949752808, "learning_rate": 6.832168542859283e-06, "loss": 0.4678, "num_input_tokens_seen": 61049688, "step": 105150 }, { "epoch": 15.66204944891272, "grad_norm": 1.6564722061157227, "learning_rate": 6.829936535735273e-06, "loss": 0.6531, "num_input_tokens_seen": 61052280, "step": 105155 }, { "epoch": 15.662794161453679, "grad_norm": 1.7327549457550049, "learning_rate": 6.827704835576021e-06, "loss": 0.5547, "num_input_tokens_seen": 61055288, "step": 105160 }, { "epoch": 15.663538873994638, "grad_norm": 1.7388651371002197, "learning_rate": 6.8254734424192255e-06, "loss": 0.4038, "num_input_tokens_seen": 61058072, "step": 105165 }, { "epoch": 15.664283586535598, "grad_norm": 1.7491124868392944, "learning_rate": 6.823242356302584e-06, "loss": 0.4112, "num_input_tokens_seen": 61060920, "step": 105170 }, { "epoch": 15.665028299076557, "grad_norm": 1.1747190952301025, "learning_rate": 6.821011577263781e-06, "loss": 0.3277, "num_input_tokens_seen": 61063768, "step": 105175 }, { "epoch": 15.665773011617516, "grad_norm": 3.421098470687866, "learning_rate": 6.8187811053405185e-06, "loss": 0.6641, "num_input_tokens_seen": 61066808, "step": 105180 }, { "epoch": 15.666517724158474, "grad_norm": 1.8403855562210083, "learning_rate": 6.816550940570463e-06, "loss": 0.652, "num_input_tokens_seen": 61069976, "step": 105185 }, { "epoch": 15.667262436699435, "grad_norm": 1.7648675441741943, "learning_rate": 6.8143210829913065e-06, "loss": 0.5471, "num_input_tokens_seen": 61072792, "step": 105190 }, { "epoch": 15.668007149240394, "grad_norm": 2.362391710281372, "learning_rate": 6.812091532640705e-06, "loss": 0.5573, "num_input_tokens_seen": 61075896, "step": 105195 }, { "epoch": 15.668751861781352, "grad_norm": 2.230242967605591, "learning_rate": 6.80986228955634e-06, "loss": 0.6, "num_input_tokens_seen": 61078456, "step": 105200 }, { "epoch": 15.669496574322311, "grad_norm": 1.229987382888794, "learning_rate": 6.807633353775861e-06, "loss": 0.6515, "num_input_tokens_seen": 61081272, "step": 105205 }, { "epoch": 15.67024128686327, "grad_norm": 1.3364472389221191, "learning_rate": 6.805404725336923e-06, "loss": 0.5672, "num_input_tokens_seen": 61084088, "step": 105210 }, { "epoch": 15.67098599940423, "grad_norm": 1.7631254196166992, "learning_rate": 6.803176404277184e-06, "loss": 0.6817, "num_input_tokens_seen": 61087032, "step": 105215 }, { "epoch": 15.67173071194519, "grad_norm": 2.3289194107055664, "learning_rate": 6.800948390634279e-06, "loss": 0.6345, "num_input_tokens_seen": 61089752, "step": 105220 }, { "epoch": 15.672475424486148, "grad_norm": 1.346638560295105, "learning_rate": 6.798720684445861e-06, "loss": 0.739, "num_input_tokens_seen": 61092984, "step": 105225 }, { "epoch": 15.673220137027108, "grad_norm": 1.4669779539108276, "learning_rate": 6.796493285749561e-06, "loss": 0.4875, "num_input_tokens_seen": 61095864, "step": 105230 }, { "epoch": 15.673964849568067, "grad_norm": 1.6019859313964844, "learning_rate": 6.794266194583005e-06, "loss": 0.4142, "num_input_tokens_seen": 61098776, "step": 105235 }, { "epoch": 15.674709562109026, "grad_norm": 1.7358477115631104, "learning_rate": 6.792039410983817e-06, "loss": 0.7693, "num_input_tokens_seen": 61101752, "step": 105240 }, { "epoch": 15.675454274649985, "grad_norm": 1.41220223903656, "learning_rate": 6.789812934989609e-06, "loss": 0.6349, "num_input_tokens_seen": 61104792, "step": 105245 }, { "epoch": 15.676198987190944, "grad_norm": 2.221329927444458, "learning_rate": 6.78758676663801e-06, "loss": 0.5029, "num_input_tokens_seen": 61107512, "step": 105250 }, { "epoch": 15.676943699731904, "grad_norm": 0.8291958570480347, "learning_rate": 6.785360905966617e-06, "loss": 0.4728, "num_input_tokens_seen": 61110328, "step": 105255 }, { "epoch": 15.677688412272863, "grad_norm": 1.8301022052764893, "learning_rate": 6.783135353013045e-06, "loss": 0.7847, "num_input_tokens_seen": 61113528, "step": 105260 }, { "epoch": 15.678433124813822, "grad_norm": 1.5170156955718994, "learning_rate": 6.7809101078148805e-06, "loss": 0.5629, "num_input_tokens_seen": 61116248, "step": 105265 }, { "epoch": 15.67917783735478, "grad_norm": 1.3266875743865967, "learning_rate": 6.7786851704097295e-06, "loss": 0.5968, "num_input_tokens_seen": 61119032, "step": 105270 }, { "epoch": 15.67992254989574, "grad_norm": 1.504880428314209, "learning_rate": 6.776460540835167e-06, "loss": 0.513, "num_input_tokens_seen": 61121912, "step": 105275 }, { "epoch": 15.6806672624367, "grad_norm": 1.5750246047973633, "learning_rate": 6.774236219128788e-06, "loss": 0.6311, "num_input_tokens_seen": 61124664, "step": 105280 }, { "epoch": 15.681411974977658, "grad_norm": 1.4011945724487305, "learning_rate": 6.772012205328166e-06, "loss": 0.5338, "num_input_tokens_seen": 61127608, "step": 105285 }, { "epoch": 15.682156687518617, "grad_norm": 1.1150656938552856, "learning_rate": 6.769788499470861e-06, "loss": 0.6316, "num_input_tokens_seen": 61130424, "step": 105290 }, { "epoch": 15.682901400059578, "grad_norm": 1.9440206289291382, "learning_rate": 6.767565101594461e-06, "loss": 0.7059, "num_input_tokens_seen": 61133304, "step": 105295 }, { "epoch": 15.683646112600536, "grad_norm": 1.4275976419448853, "learning_rate": 6.765342011736517e-06, "loss": 0.6944, "num_input_tokens_seen": 61135960, "step": 105300 }, { "epoch": 15.684390825141495, "grad_norm": 1.0795199871063232, "learning_rate": 6.763119229934589e-06, "loss": 0.4669, "num_input_tokens_seen": 61138680, "step": 105305 }, { "epoch": 15.685135537682454, "grad_norm": 1.8380845785140991, "learning_rate": 6.760896756226215e-06, "loss": 0.5879, "num_input_tokens_seen": 61141336, "step": 105310 }, { "epoch": 15.685880250223414, "grad_norm": 0.7228028774261475, "learning_rate": 6.758674590648964e-06, "loss": 0.39, "num_input_tokens_seen": 61144280, "step": 105315 }, { "epoch": 15.686624962764373, "grad_norm": 1.4115670919418335, "learning_rate": 6.756452733240365e-06, "loss": 0.5769, "num_input_tokens_seen": 61147128, "step": 105320 }, { "epoch": 15.687369675305332, "grad_norm": 1.5666923522949219, "learning_rate": 6.7542311840379455e-06, "loss": 0.6355, "num_input_tokens_seen": 61150008, "step": 105325 }, { "epoch": 15.68811438784629, "grad_norm": 1.8475276231765747, "learning_rate": 6.752009943079257e-06, "loss": 0.4914, "num_input_tokens_seen": 61153112, "step": 105330 }, { "epoch": 15.688859100387251, "grad_norm": 2.7607152462005615, "learning_rate": 6.749789010401805e-06, "loss": 0.5446, "num_input_tokens_seen": 61156056, "step": 105335 }, { "epoch": 15.68960381292821, "grad_norm": 1.7978589534759521, "learning_rate": 6.7475683860431266e-06, "loss": 0.7674, "num_input_tokens_seen": 61159352, "step": 105340 }, { "epoch": 15.690348525469169, "grad_norm": 2.568624258041382, "learning_rate": 6.745348070040722e-06, "loss": 0.4399, "num_input_tokens_seen": 61162168, "step": 105345 }, { "epoch": 15.691093238010128, "grad_norm": 1.259403944015503, "learning_rate": 6.743128062432113e-06, "loss": 0.4824, "num_input_tokens_seen": 61165112, "step": 105350 }, { "epoch": 15.691837950551088, "grad_norm": 2.235438823699951, "learning_rate": 6.740908363254805e-06, "loss": 0.7673, "num_input_tokens_seen": 61167864, "step": 105355 }, { "epoch": 15.692582663092047, "grad_norm": 1.359164834022522, "learning_rate": 6.7386889725462894e-06, "loss": 0.5611, "num_input_tokens_seen": 61170936, "step": 105360 }, { "epoch": 15.693327375633006, "grad_norm": 0.8973144888877869, "learning_rate": 6.736469890344058e-06, "loss": 0.4748, "num_input_tokens_seen": 61173592, "step": 105365 }, { "epoch": 15.694072088173964, "grad_norm": 1.7376482486724854, "learning_rate": 6.734251116685611e-06, "loss": 0.5893, "num_input_tokens_seen": 61176248, "step": 105370 }, { "epoch": 15.694816800714925, "grad_norm": 0.7021514773368835, "learning_rate": 6.732032651608427e-06, "loss": 0.5544, "num_input_tokens_seen": 61179064, "step": 105375 }, { "epoch": 15.695561513255884, "grad_norm": 3.293299913406372, "learning_rate": 6.7298144951499774e-06, "loss": 0.6924, "num_input_tokens_seen": 61181592, "step": 105380 }, { "epoch": 15.696306225796842, "grad_norm": 2.602783679962158, "learning_rate": 6.727596647347753e-06, "loss": 0.5436, "num_input_tokens_seen": 61184088, "step": 105385 }, { "epoch": 15.697050938337801, "grad_norm": 2.3004136085510254, "learning_rate": 6.725379108239202e-06, "loss": 0.6249, "num_input_tokens_seen": 61186872, "step": 105390 }, { "epoch": 15.69779565087876, "grad_norm": 1.8596270084381104, "learning_rate": 6.723161877861805e-06, "loss": 0.4876, "num_input_tokens_seen": 61190136, "step": 105395 }, { "epoch": 15.69854036341972, "grad_norm": 1.3634880781173706, "learning_rate": 6.720944956253012e-06, "loss": 0.587, "num_input_tokens_seen": 61193048, "step": 105400 }, { "epoch": 15.69928507596068, "grad_norm": 1.111771583557129, "learning_rate": 6.71872834345027e-06, "loss": 0.6907, "num_input_tokens_seen": 61195992, "step": 105405 }, { "epoch": 15.700029788501638, "grad_norm": 1.325143814086914, "learning_rate": 6.716512039491038e-06, "loss": 0.5409, "num_input_tokens_seen": 61198808, "step": 105410 }, { "epoch": 15.700774501042597, "grad_norm": 1.2352147102355957, "learning_rate": 6.714296044412746e-06, "loss": 0.4662, "num_input_tokens_seen": 61201880, "step": 105415 }, { "epoch": 15.701519213583557, "grad_norm": 0.8281300663948059, "learning_rate": 6.712080358252845e-06, "loss": 0.5876, "num_input_tokens_seen": 61205208, "step": 105420 }, { "epoch": 15.702263926124516, "grad_norm": 1.2324422597885132, "learning_rate": 6.709864981048761e-06, "loss": 0.5236, "num_input_tokens_seen": 61207896, "step": 105425 }, { "epoch": 15.703008638665475, "grad_norm": 1.5912708044052124, "learning_rate": 6.707649912837919e-06, "loss": 0.4657, "num_input_tokens_seen": 61210744, "step": 105430 }, { "epoch": 15.703753351206434, "grad_norm": 2.1139321327209473, "learning_rate": 6.70543515365773e-06, "loss": 0.8274, "num_input_tokens_seen": 61213720, "step": 105435 }, { "epoch": 15.704498063747394, "grad_norm": 1.8231587409973145, "learning_rate": 6.703220703545629e-06, "loss": 0.4719, "num_input_tokens_seen": 61216408, "step": 105440 }, { "epoch": 15.705242776288353, "grad_norm": 1.5103731155395508, "learning_rate": 6.701006562539019e-06, "loss": 0.5267, "num_input_tokens_seen": 61219192, "step": 105445 }, { "epoch": 15.705987488829312, "grad_norm": 2.0754265785217285, "learning_rate": 6.698792730675296e-06, "loss": 0.6129, "num_input_tokens_seen": 61222488, "step": 105450 }, { "epoch": 15.70673220137027, "grad_norm": 2.1489169597625732, "learning_rate": 6.6965792079918765e-06, "loss": 0.6694, "num_input_tokens_seen": 61225656, "step": 105455 }, { "epoch": 15.707476913911231, "grad_norm": 1.1041074991226196, "learning_rate": 6.694365994526142e-06, "loss": 0.7888, "num_input_tokens_seen": 61228664, "step": 105460 }, { "epoch": 15.70822162645219, "grad_norm": 1.2158658504486084, "learning_rate": 6.692153090315498e-06, "loss": 0.466, "num_input_tokens_seen": 61231384, "step": 105465 }, { "epoch": 15.708966338993148, "grad_norm": 1.7564723491668701, "learning_rate": 6.689940495397309e-06, "loss": 0.5862, "num_input_tokens_seen": 61234296, "step": 105470 }, { "epoch": 15.709711051534107, "grad_norm": 1.694544792175293, "learning_rate": 6.687728209808977e-06, "loss": 0.4637, "num_input_tokens_seen": 61237208, "step": 105475 }, { "epoch": 15.710455764075068, "grad_norm": 1.9507414102554321, "learning_rate": 6.6855162335878626e-06, "loss": 0.6433, "num_input_tokens_seen": 61240312, "step": 105480 }, { "epoch": 15.711200476616026, "grad_norm": 1.2331559658050537, "learning_rate": 6.683304566771331e-06, "loss": 0.5564, "num_input_tokens_seen": 61243288, "step": 105485 }, { "epoch": 15.711945189156985, "grad_norm": 2.3107998371124268, "learning_rate": 6.68109320939676e-06, "loss": 0.4908, "num_input_tokens_seen": 61246392, "step": 105490 }, { "epoch": 15.712689901697944, "grad_norm": 1.455843448638916, "learning_rate": 6.678882161501502e-06, "loss": 0.5169, "num_input_tokens_seen": 61249624, "step": 105495 }, { "epoch": 15.713434614238905, "grad_norm": 1.2950458526611328, "learning_rate": 6.676671423122907e-06, "loss": 0.6562, "num_input_tokens_seen": 61252568, "step": 105500 }, { "epoch": 15.714179326779863, "grad_norm": 1.9942737817764282, "learning_rate": 6.674460994298317e-06, "loss": 0.6135, "num_input_tokens_seen": 61255672, "step": 105505 }, { "epoch": 15.714924039320822, "grad_norm": 0.8985573053359985, "learning_rate": 6.672250875065095e-06, "loss": 0.7008, "num_input_tokens_seen": 61258360, "step": 105510 }, { "epoch": 15.71566875186178, "grad_norm": 1.9918720722198486, "learning_rate": 6.670041065460555e-06, "loss": 0.5465, "num_input_tokens_seen": 61261336, "step": 105515 }, { "epoch": 15.716413464402741, "grad_norm": 3.5668375492095947, "learning_rate": 6.667831565522051e-06, "loss": 0.5232, "num_input_tokens_seen": 61264184, "step": 105520 }, { "epoch": 15.7171581769437, "grad_norm": 1.2708065509796143, "learning_rate": 6.665622375286901e-06, "loss": 0.5207, "num_input_tokens_seen": 61267128, "step": 105525 }, { "epoch": 15.717902889484659, "grad_norm": 1.7174930572509766, "learning_rate": 6.66341349479242e-06, "loss": 0.6604, "num_input_tokens_seen": 61270008, "step": 105530 }, { "epoch": 15.718647602025618, "grad_norm": 2.0907318592071533, "learning_rate": 6.661204924075937e-06, "loss": 0.7501, "num_input_tokens_seen": 61273272, "step": 105535 }, { "epoch": 15.719392314566576, "grad_norm": 1.1890382766723633, "learning_rate": 6.658996663174752e-06, "loss": 0.4766, "num_input_tokens_seen": 61276024, "step": 105540 }, { "epoch": 15.720137027107537, "grad_norm": 1.3045819997787476, "learning_rate": 6.656788712126183e-06, "loss": 0.5636, "num_input_tokens_seen": 61279128, "step": 105545 }, { "epoch": 15.720881739648496, "grad_norm": 2.3527164459228516, "learning_rate": 6.654581070967519e-06, "loss": 0.6675, "num_input_tokens_seen": 61281912, "step": 105550 }, { "epoch": 15.721626452189454, "grad_norm": 2.1148829460144043, "learning_rate": 6.6523737397360705e-06, "loss": 0.5451, "num_input_tokens_seen": 61284984, "step": 105555 }, { "epoch": 15.722371164730415, "grad_norm": 2.0426297187805176, "learning_rate": 6.65016671846912e-06, "loss": 0.5449, "num_input_tokens_seen": 61287928, "step": 105560 }, { "epoch": 15.723115877271374, "grad_norm": 1.3395593166351318, "learning_rate": 6.647960007203952e-06, "loss": 0.3342, "num_input_tokens_seen": 61290584, "step": 105565 }, { "epoch": 15.723860589812332, "grad_norm": 0.829874575138092, "learning_rate": 6.645753605977847e-06, "loss": 0.5916, "num_input_tokens_seen": 61293176, "step": 105570 }, { "epoch": 15.724605302353291, "grad_norm": 2.000112533569336, "learning_rate": 6.643547514828075e-06, "loss": 0.463, "num_input_tokens_seen": 61296280, "step": 105575 }, { "epoch": 15.72535001489425, "grad_norm": 1.1298933029174805, "learning_rate": 6.641341733791917e-06, "loss": 0.5519, "num_input_tokens_seen": 61299352, "step": 105580 }, { "epoch": 15.72609472743521, "grad_norm": 0.2882194221019745, "learning_rate": 6.639136262906625e-06, "loss": 0.2607, "num_input_tokens_seen": 61302104, "step": 105585 }, { "epoch": 15.72683943997617, "grad_norm": 2.018950939178467, "learning_rate": 6.636931102209471e-06, "loss": 0.6586, "num_input_tokens_seen": 61305144, "step": 105590 }, { "epoch": 15.727584152517128, "grad_norm": 2.7311227321624756, "learning_rate": 6.634726251737697e-06, "loss": 0.7609, "num_input_tokens_seen": 61308280, "step": 105595 }, { "epoch": 15.728328865058087, "grad_norm": 2.06417179107666, "learning_rate": 6.632521711528564e-06, "loss": 0.6002, "num_input_tokens_seen": 61311128, "step": 105600 }, { "epoch": 15.729073577599047, "grad_norm": 2.148872137069702, "learning_rate": 6.630317481619308e-06, "loss": 0.6267, "num_input_tokens_seen": 61314104, "step": 105605 }, { "epoch": 15.729818290140006, "grad_norm": 1.4277774095535278, "learning_rate": 6.628113562047161e-06, "loss": 0.4878, "num_input_tokens_seen": 61317208, "step": 105610 }, { "epoch": 15.730563002680965, "grad_norm": 2.633368730545044, "learning_rate": 6.625909952849368e-06, "loss": 0.4511, "num_input_tokens_seen": 61319928, "step": 105615 }, { "epoch": 15.731307715221924, "grad_norm": 1.0140440464019775, "learning_rate": 6.62370665406315e-06, "loss": 0.6275, "num_input_tokens_seen": 61322712, "step": 105620 }, { "epoch": 15.732052427762884, "grad_norm": 1.2726272344589233, "learning_rate": 6.621503665725734e-06, "loss": 0.5205, "num_input_tokens_seen": 61325688, "step": 105625 }, { "epoch": 15.732797140303843, "grad_norm": 1.6536351442337036, "learning_rate": 6.619300987874336e-06, "loss": 0.6031, "num_input_tokens_seen": 61328408, "step": 105630 }, { "epoch": 15.733541852844802, "grad_norm": 1.352799415588379, "learning_rate": 6.617098620546166e-06, "loss": 0.5426, "num_input_tokens_seen": 61330872, "step": 105635 }, { "epoch": 15.73428656538576, "grad_norm": 1.1760540008544922, "learning_rate": 6.614896563778425e-06, "loss": 0.5931, "num_input_tokens_seen": 61333752, "step": 105640 }, { "epoch": 15.735031277926721, "grad_norm": 1.6419460773468018, "learning_rate": 6.6126948176083284e-06, "loss": 0.4111, "num_input_tokens_seen": 61336600, "step": 105645 }, { "epoch": 15.73577599046768, "grad_norm": 1.537805199623108, "learning_rate": 6.610493382073063e-06, "loss": 0.6067, "num_input_tokens_seen": 61339704, "step": 105650 }, { "epoch": 15.736520703008638, "grad_norm": 0.9401296973228455, "learning_rate": 6.6082922572098135e-06, "loss": 0.7535, "num_input_tokens_seen": 61342296, "step": 105655 }, { "epoch": 15.737265415549597, "grad_norm": 1.6720399856567383, "learning_rate": 6.6060914430557845e-06, "loss": 0.5359, "num_input_tokens_seen": 61345304, "step": 105660 }, { "epoch": 15.738010128090558, "grad_norm": 2.0593225955963135, "learning_rate": 6.603890939648136e-06, "loss": 0.5759, "num_input_tokens_seen": 61348120, "step": 105665 }, { "epoch": 15.738754840631517, "grad_norm": 2.3774073123931885, "learning_rate": 6.601690747024061e-06, "loss": 0.721, "num_input_tokens_seen": 61351000, "step": 105670 }, { "epoch": 15.739499553172475, "grad_norm": 1.5763421058654785, "learning_rate": 6.599490865220714e-06, "loss": 0.6908, "num_input_tokens_seen": 61353720, "step": 105675 }, { "epoch": 15.740244265713434, "grad_norm": 1.57001793384552, "learning_rate": 6.597291294275276e-06, "loss": 0.6843, "num_input_tokens_seen": 61356728, "step": 105680 }, { "epoch": 15.740988978254395, "grad_norm": 1.7905818223953247, "learning_rate": 6.595092034224898e-06, "loss": 0.6089, "num_input_tokens_seen": 61359768, "step": 105685 }, { "epoch": 15.741733690795353, "grad_norm": 0.9679456949234009, "learning_rate": 6.592893085106733e-06, "loss": 0.5637, "num_input_tokens_seen": 61362808, "step": 105690 }, { "epoch": 15.742478403336312, "grad_norm": 1.3276538848876953, "learning_rate": 6.590694446957924e-06, "loss": 0.6138, "num_input_tokens_seen": 61365432, "step": 105695 }, { "epoch": 15.74322311587727, "grad_norm": 1.1842535734176636, "learning_rate": 6.588496119815629e-06, "loss": 0.5664, "num_input_tokens_seen": 61368536, "step": 105700 }, { "epoch": 15.743967828418231, "grad_norm": 1.5348254442214966, "learning_rate": 6.5862981037169816e-06, "loss": 0.6868, "num_input_tokens_seen": 61371608, "step": 105705 }, { "epoch": 15.74471254095919, "grad_norm": 1.5806269645690918, "learning_rate": 6.584100398699103e-06, "loss": 0.4422, "num_input_tokens_seen": 61374520, "step": 105710 }, { "epoch": 15.745457253500149, "grad_norm": 2.1965067386627197, "learning_rate": 6.581903004799139e-06, "loss": 0.6139, "num_input_tokens_seen": 61377176, "step": 105715 }, { "epoch": 15.746201966041108, "grad_norm": 1.8132816553115845, "learning_rate": 6.5797059220541965e-06, "loss": 0.5382, "num_input_tokens_seen": 61379992, "step": 105720 }, { "epoch": 15.746946678582066, "grad_norm": 2.761551856994629, "learning_rate": 6.57750915050141e-06, "loss": 0.5923, "num_input_tokens_seen": 61382872, "step": 105725 }, { "epoch": 15.747691391123027, "grad_norm": 1.8066378831863403, "learning_rate": 6.575312690177882e-06, "loss": 0.6593, "num_input_tokens_seen": 61385752, "step": 105730 }, { "epoch": 15.748436103663986, "grad_norm": 2.6139919757843018, "learning_rate": 6.573116541120714e-06, "loss": 0.834, "num_input_tokens_seen": 61388984, "step": 105735 }, { "epoch": 15.749180816204944, "grad_norm": 2.802462100982666, "learning_rate": 6.570920703367017e-06, "loss": 0.6489, "num_input_tokens_seen": 61391832, "step": 105740 }, { "epoch": 15.749925528745905, "grad_norm": 3.2313308715820312, "learning_rate": 6.5687251769538795e-06, "loss": 0.613, "num_input_tokens_seen": 61394648, "step": 105745 }, { "epoch": 15.750670241286864, "grad_norm": 1.3457568883895874, "learning_rate": 6.566529961918405e-06, "loss": 0.7776, "num_input_tokens_seen": 61397528, "step": 105750 }, { "epoch": 15.751414953827823, "grad_norm": 1.0877609252929688, "learning_rate": 6.564335058297674e-06, "loss": 0.5052, "num_input_tokens_seen": 61400536, "step": 105755 }, { "epoch": 15.752159666368781, "grad_norm": 1.8110367059707642, "learning_rate": 6.56214046612876e-06, "loss": 0.571, "num_input_tokens_seen": 61403416, "step": 105760 }, { "epoch": 15.75290437890974, "grad_norm": 1.1004979610443115, "learning_rate": 6.5599461854487485e-06, "loss": 0.4993, "num_input_tokens_seen": 61406104, "step": 105765 }, { "epoch": 15.7536490914507, "grad_norm": 1.2816427946090698, "learning_rate": 6.557752216294693e-06, "loss": 0.5176, "num_input_tokens_seen": 61408888, "step": 105770 }, { "epoch": 15.75439380399166, "grad_norm": 3.0265989303588867, "learning_rate": 6.555558558703681e-06, "loss": 0.5156, "num_input_tokens_seen": 61411896, "step": 105775 }, { "epoch": 15.755138516532618, "grad_norm": 1.6335597038269043, "learning_rate": 6.5533652127127505e-06, "loss": 0.7449, "num_input_tokens_seen": 61414968, "step": 105780 }, { "epoch": 15.755883229073577, "grad_norm": 1.3178551197052002, "learning_rate": 6.551172178358975e-06, "loss": 0.4741, "num_input_tokens_seen": 61417688, "step": 105785 }, { "epoch": 15.756627941614537, "grad_norm": 2.721437931060791, "learning_rate": 6.548979455679388e-06, "loss": 0.7737, "num_input_tokens_seen": 61420408, "step": 105790 }, { "epoch": 15.757372654155496, "grad_norm": 1.1850967407226562, "learning_rate": 6.5467870447110475e-06, "loss": 0.6174, "num_input_tokens_seen": 61423288, "step": 105795 }, { "epoch": 15.758117366696455, "grad_norm": 1.6574268341064453, "learning_rate": 6.544594945490978e-06, "loss": 0.4855, "num_input_tokens_seen": 61426040, "step": 105800 }, { "epoch": 15.758862079237414, "grad_norm": 0.8411419987678528, "learning_rate": 6.542403158056226e-06, "loss": 0.5287, "num_input_tokens_seen": 61428952, "step": 105805 }, { "epoch": 15.759606791778374, "grad_norm": 1.3342573642730713, "learning_rate": 6.540211682443814e-06, "loss": 0.8314, "num_input_tokens_seen": 61432280, "step": 105810 }, { "epoch": 15.760351504319333, "grad_norm": 1.1080971956253052, "learning_rate": 6.5380205186907545e-06, "loss": 0.6042, "num_input_tokens_seen": 61434744, "step": 105815 }, { "epoch": 15.761096216860292, "grad_norm": 0.792221188545227, "learning_rate": 6.535829666834084e-06, "loss": 0.4716, "num_input_tokens_seen": 61437496, "step": 105820 }, { "epoch": 15.76184092940125, "grad_norm": 1.3053514957427979, "learning_rate": 6.533639126910804e-06, "loss": 0.5226, "num_input_tokens_seen": 61440312, "step": 105825 }, { "epoch": 15.762585641942211, "grad_norm": 1.7261823415756226, "learning_rate": 6.531448898957923e-06, "loss": 0.726, "num_input_tokens_seen": 61443800, "step": 105830 }, { "epoch": 15.76333035448317, "grad_norm": 1.6097623109817505, "learning_rate": 6.529258983012437e-06, "loss": 0.8729, "num_input_tokens_seen": 61446808, "step": 105835 }, { "epoch": 15.764075067024129, "grad_norm": 3.137979030609131, "learning_rate": 6.527069379111353e-06, "loss": 0.5943, "num_input_tokens_seen": 61449848, "step": 105840 }, { "epoch": 15.764819779565087, "grad_norm": 1.7019622325897217, "learning_rate": 6.5248800872916574e-06, "loss": 0.5562, "num_input_tokens_seen": 61452440, "step": 105845 }, { "epoch": 15.765564492106048, "grad_norm": 1.3344329595565796, "learning_rate": 6.5226911075903255e-06, "loss": 0.4411, "num_input_tokens_seen": 61455160, "step": 105850 }, { "epoch": 15.766309204647007, "grad_norm": 1.0972398519515991, "learning_rate": 6.5205024400443575e-06, "loss": 0.5065, "num_input_tokens_seen": 61458072, "step": 105855 }, { "epoch": 15.767053917187965, "grad_norm": 3.8642101287841797, "learning_rate": 6.51831408469071e-06, "loss": 0.6124, "num_input_tokens_seen": 61461048, "step": 105860 }, { "epoch": 15.767798629728924, "grad_norm": 1.6110705137252808, "learning_rate": 6.516126041566373e-06, "loss": 0.6133, "num_input_tokens_seen": 61464248, "step": 105865 }, { "epoch": 15.768543342269885, "grad_norm": 0.8465328216552734, "learning_rate": 6.51393831070829e-06, "loss": 0.5179, "num_input_tokens_seen": 61467256, "step": 105870 }, { "epoch": 15.769288054810843, "grad_norm": 1.4046893119812012, "learning_rate": 6.511750892153439e-06, "loss": 0.6524, "num_input_tokens_seen": 61469720, "step": 105875 }, { "epoch": 15.770032767351802, "grad_norm": 0.9757773280143738, "learning_rate": 6.50956378593876e-06, "loss": 0.3901, "num_input_tokens_seen": 61472632, "step": 105880 }, { "epoch": 15.770777479892761, "grad_norm": 1.5931624174118042, "learning_rate": 6.507376992101214e-06, "loss": 0.544, "num_input_tokens_seen": 61475256, "step": 105885 }, { "epoch": 15.771522192433721, "grad_norm": 2.0160129070281982, "learning_rate": 6.505190510677739e-06, "loss": 0.638, "num_input_tokens_seen": 61477944, "step": 105890 }, { "epoch": 15.77226690497468, "grad_norm": 1.541849136352539, "learning_rate": 6.503004341705271e-06, "loss": 0.4129, "num_input_tokens_seen": 61480952, "step": 105895 }, { "epoch": 15.773011617515639, "grad_norm": 2.864513635635376, "learning_rate": 6.5008184852207475e-06, "loss": 0.5892, "num_input_tokens_seen": 61483800, "step": 105900 }, { "epoch": 15.773756330056598, "grad_norm": 1.2618144750595093, "learning_rate": 6.498632941261088e-06, "loss": 0.6503, "num_input_tokens_seen": 61486744, "step": 105905 }, { "epoch": 15.774501042597556, "grad_norm": 1.3126417398452759, "learning_rate": 6.496447709863227e-06, "loss": 0.4991, "num_input_tokens_seen": 61489912, "step": 105910 }, { "epoch": 15.775245755138517, "grad_norm": 1.292249083518982, "learning_rate": 6.494262791064065e-06, "loss": 0.5467, "num_input_tokens_seen": 61492760, "step": 105915 }, { "epoch": 15.775990467679476, "grad_norm": 1.2280904054641724, "learning_rate": 6.4920781849005355e-06, "loss": 0.5771, "num_input_tokens_seen": 61495704, "step": 105920 }, { "epoch": 15.776735180220435, "grad_norm": 1.4309996366500854, "learning_rate": 6.489893891409535e-06, "loss": 0.5929, "num_input_tokens_seen": 61498680, "step": 105925 }, { "epoch": 15.777479892761393, "grad_norm": 1.927780032157898, "learning_rate": 6.487709910627957e-06, "loss": 0.6651, "num_input_tokens_seen": 61501656, "step": 105930 }, { "epoch": 15.778224605302354, "grad_norm": 1.5388104915618896, "learning_rate": 6.48552624259271e-06, "loss": 0.6063, "num_input_tokens_seen": 61504600, "step": 105935 }, { "epoch": 15.778969317843313, "grad_norm": 1.2520664930343628, "learning_rate": 6.483342887340674e-06, "loss": 0.6489, "num_input_tokens_seen": 61507352, "step": 105940 }, { "epoch": 15.779714030384271, "grad_norm": 1.6929562091827393, "learning_rate": 6.481159844908746e-06, "loss": 0.6417, "num_input_tokens_seen": 61510072, "step": 105945 }, { "epoch": 15.78045874292523, "grad_norm": 2.4229891300201416, "learning_rate": 6.478977115333796e-06, "loss": 0.5862, "num_input_tokens_seen": 61513144, "step": 105950 }, { "epoch": 15.78120345546619, "grad_norm": 1.715206265449524, "learning_rate": 6.4767946986527105e-06, "loss": 0.4354, "num_input_tokens_seen": 61516024, "step": 105955 }, { "epoch": 15.78194816800715, "grad_norm": 1.2425308227539062, "learning_rate": 6.474612594902351e-06, "loss": 0.719, "num_input_tokens_seen": 61519320, "step": 105960 }, { "epoch": 15.782692880548108, "grad_norm": 1.566324234008789, "learning_rate": 6.472430804119584e-06, "loss": 0.5383, "num_input_tokens_seen": 61522424, "step": 105965 }, { "epoch": 15.783437593089067, "grad_norm": 1.3804924488067627, "learning_rate": 6.470249326341269e-06, "loss": 0.6178, "num_input_tokens_seen": 61525432, "step": 105970 }, { "epoch": 15.784182305630027, "grad_norm": 0.426608145236969, "learning_rate": 6.468068161604251e-06, "loss": 0.5222, "num_input_tokens_seen": 61528088, "step": 105975 }, { "epoch": 15.784927018170986, "grad_norm": 1.783894419670105, "learning_rate": 6.465887309945393e-06, "loss": 0.5264, "num_input_tokens_seen": 61530936, "step": 105980 }, { "epoch": 15.785671730711945, "grad_norm": 1.4039415121078491, "learning_rate": 6.4637067714015255e-06, "loss": 0.5288, "num_input_tokens_seen": 61533752, "step": 105985 }, { "epoch": 15.786416443252904, "grad_norm": 1.1147280931472778, "learning_rate": 6.4615265460095e-06, "loss": 0.438, "num_input_tokens_seen": 61536664, "step": 105990 }, { "epoch": 15.787161155793864, "grad_norm": 1.5754224061965942, "learning_rate": 6.459346633806132e-06, "loss": 0.5997, "num_input_tokens_seen": 61539736, "step": 105995 }, { "epoch": 15.787905868334823, "grad_norm": 1.554533839225769, "learning_rate": 6.457167034828268e-06, "loss": 0.5078, "num_input_tokens_seen": 61542584, "step": 106000 }, { "epoch": 15.788650580875782, "grad_norm": 0.8124816417694092, "learning_rate": 6.45498774911272e-06, "loss": 0.4036, "num_input_tokens_seen": 61545304, "step": 106005 }, { "epoch": 15.78939529341674, "grad_norm": 1.745360255241394, "learning_rate": 6.452808776696301e-06, "loss": 0.5093, "num_input_tokens_seen": 61548376, "step": 106010 }, { "epoch": 15.790140005957701, "grad_norm": 1.2075998783111572, "learning_rate": 6.450630117615833e-06, "loss": 0.5365, "num_input_tokens_seen": 61551256, "step": 106015 }, { "epoch": 15.79088471849866, "grad_norm": 0.9886025190353394, "learning_rate": 6.448451771908115e-06, "loss": 0.7205, "num_input_tokens_seen": 61554424, "step": 106020 }, { "epoch": 15.791629431039619, "grad_norm": 2.068464994430542, "learning_rate": 6.446273739609943e-06, "loss": 0.7922, "num_input_tokens_seen": 61557240, "step": 106025 }, { "epoch": 15.792374143580577, "grad_norm": 1.7581509351730347, "learning_rate": 6.444096020758125e-06, "loss": 0.5469, "num_input_tokens_seen": 61560248, "step": 106030 }, { "epoch": 15.793118856121538, "grad_norm": 1.3945084810256958, "learning_rate": 6.4419186153894475e-06, "loss": 0.5725, "num_input_tokens_seen": 61562968, "step": 106035 }, { "epoch": 15.793863568662497, "grad_norm": 1.9221364259719849, "learning_rate": 6.439741523540685e-06, "loss": 0.6406, "num_input_tokens_seen": 61565784, "step": 106040 }, { "epoch": 15.794608281203455, "grad_norm": 1.1253167390823364, "learning_rate": 6.437564745248634e-06, "loss": 0.6375, "num_input_tokens_seen": 61569016, "step": 106045 }, { "epoch": 15.795352993744414, "grad_norm": 3.2423007488250732, "learning_rate": 6.435388280550062e-06, "loss": 0.8739, "num_input_tokens_seen": 61571704, "step": 106050 }, { "epoch": 15.796097706285373, "grad_norm": 1.4924194812774658, "learning_rate": 6.4332121294817245e-06, "loss": 0.5744, "num_input_tokens_seen": 61574744, "step": 106055 }, { "epoch": 15.796842418826333, "grad_norm": 2.5723319053649902, "learning_rate": 6.431036292080409e-06, "loss": 0.8223, "num_input_tokens_seen": 61577720, "step": 106060 }, { "epoch": 15.797587131367292, "grad_norm": 1.4069550037384033, "learning_rate": 6.428860768382855e-06, "loss": 0.7493, "num_input_tokens_seen": 61581080, "step": 106065 }, { "epoch": 15.798331843908251, "grad_norm": 1.2704367637634277, "learning_rate": 6.42668555842583e-06, "loss": 0.4853, "num_input_tokens_seen": 61583768, "step": 106070 }, { "epoch": 15.799076556449211, "grad_norm": 1.6363182067871094, "learning_rate": 6.4245106622460665e-06, "loss": 0.4935, "num_input_tokens_seen": 61586296, "step": 106075 }, { "epoch": 15.79982126899017, "grad_norm": 1.8086127042770386, "learning_rate": 6.422336079880325e-06, "loss": 0.7613, "num_input_tokens_seen": 61589144, "step": 106080 }, { "epoch": 15.800565981531129, "grad_norm": 1.0848844051361084, "learning_rate": 6.420161811365336e-06, "loss": 0.5163, "num_input_tokens_seen": 61592088, "step": 106085 }, { "epoch": 15.801310694072088, "grad_norm": 2.3290672302246094, "learning_rate": 6.417987856737825e-06, "loss": 0.389, "num_input_tokens_seen": 61595032, "step": 106090 }, { "epoch": 15.802055406613047, "grad_norm": 1.1834962368011475, "learning_rate": 6.415814216034527e-06, "loss": 0.5294, "num_input_tokens_seen": 61597848, "step": 106095 }, { "epoch": 15.802800119154007, "grad_norm": 1.7005332708358765, "learning_rate": 6.41364088929215e-06, "loss": 0.6454, "num_input_tokens_seen": 61600824, "step": 106100 }, { "epoch": 15.803544831694966, "grad_norm": 1.1501410007476807, "learning_rate": 6.4114678765474275e-06, "loss": 0.5114, "num_input_tokens_seen": 61603576, "step": 106105 }, { "epoch": 15.804289544235925, "grad_norm": 1.4592033624649048, "learning_rate": 6.409295177837058e-06, "loss": 0.643, "num_input_tokens_seen": 61606616, "step": 106110 }, { "epoch": 15.805034256776883, "grad_norm": 1.3029627799987793, "learning_rate": 6.407122793197756e-06, "loss": 0.4639, "num_input_tokens_seen": 61609400, "step": 106115 }, { "epoch": 15.805778969317844, "grad_norm": 1.1151820421218872, "learning_rate": 6.404950722666211e-06, "loss": 0.3633, "num_input_tokens_seen": 61612184, "step": 106120 }, { "epoch": 15.806523681858803, "grad_norm": 1.96040678024292, "learning_rate": 6.402778966279133e-06, "loss": 0.4044, "num_input_tokens_seen": 61614872, "step": 106125 }, { "epoch": 15.807268394399761, "grad_norm": 3.1282176971435547, "learning_rate": 6.400607524073201e-06, "loss": 0.6396, "num_input_tokens_seen": 61617880, "step": 106130 }, { "epoch": 15.80801310694072, "grad_norm": 1.354691505432129, "learning_rate": 6.3984363960850945e-06, "loss": 0.5338, "num_input_tokens_seen": 61621176, "step": 106135 }, { "epoch": 15.80875781948168, "grad_norm": 4.674306869506836, "learning_rate": 6.396265582351508e-06, "loss": 0.5158, "num_input_tokens_seen": 61624120, "step": 106140 }, { "epoch": 15.80950253202264, "grad_norm": 1.3245328664779663, "learning_rate": 6.394095082909099e-06, "loss": 0.6632, "num_input_tokens_seen": 61627064, "step": 106145 }, { "epoch": 15.810247244563598, "grad_norm": 1.7397656440734863, "learning_rate": 6.391924897794549e-06, "loss": 0.585, "num_input_tokens_seen": 61629688, "step": 106150 }, { "epoch": 15.810991957104557, "grad_norm": 2.4858734607696533, "learning_rate": 6.3897550270445165e-06, "loss": 0.4413, "num_input_tokens_seen": 61632600, "step": 106155 }, { "epoch": 15.811736669645517, "grad_norm": 1.2469278573989868, "learning_rate": 6.387585470695659e-06, "loss": 0.5743, "num_input_tokens_seen": 61635576, "step": 106160 }, { "epoch": 15.812481382186476, "grad_norm": 1.6610504388809204, "learning_rate": 6.385416228784618e-06, "loss": 0.5454, "num_input_tokens_seen": 61638456, "step": 106165 }, { "epoch": 15.813226094727435, "grad_norm": 1.5626075267791748, "learning_rate": 6.383247301348061e-06, "loss": 0.4789, "num_input_tokens_seen": 61641304, "step": 106170 }, { "epoch": 15.813970807268394, "grad_norm": 1.816675066947937, "learning_rate": 6.381078688422617e-06, "loss": 0.6798, "num_input_tokens_seen": 61644120, "step": 106175 }, { "epoch": 15.814715519809354, "grad_norm": 2.3757755756378174, "learning_rate": 6.3789103900449205e-06, "loss": 0.6352, "num_input_tokens_seen": 61647128, "step": 106180 }, { "epoch": 15.815460232350313, "grad_norm": 2.075882911682129, "learning_rate": 6.3767424062516155e-06, "loss": 0.4455, "num_input_tokens_seen": 61650008, "step": 106185 }, { "epoch": 15.816204944891272, "grad_norm": 0.9479721188545227, "learning_rate": 6.374574737079309e-06, "loss": 0.5005, "num_input_tokens_seen": 61653080, "step": 106190 }, { "epoch": 15.81694965743223, "grad_norm": 0.8623307347297668, "learning_rate": 6.372407382564641e-06, "loss": 0.4329, "num_input_tokens_seen": 61655736, "step": 106195 }, { "epoch": 15.817694369973191, "grad_norm": 1.0594511032104492, "learning_rate": 6.37024034274421e-06, "loss": 0.5213, "num_input_tokens_seen": 61658552, "step": 106200 }, { "epoch": 15.81843908251415, "grad_norm": 1.7824198007583618, "learning_rate": 6.368073617654643e-06, "loss": 0.5513, "num_input_tokens_seen": 61661592, "step": 106205 }, { "epoch": 15.819183795055109, "grad_norm": 1.950500726699829, "learning_rate": 6.365907207332536e-06, "loss": 0.607, "num_input_tokens_seen": 61664440, "step": 106210 }, { "epoch": 15.819928507596067, "grad_norm": 1.7440506219863892, "learning_rate": 6.3637411118144776e-06, "loss": 0.649, "num_input_tokens_seen": 61667192, "step": 106215 }, { "epoch": 15.820673220137028, "grad_norm": 1.6956291198730469, "learning_rate": 6.361575331137082e-06, "loss": 0.4482, "num_input_tokens_seen": 61670008, "step": 106220 }, { "epoch": 15.821417932677987, "grad_norm": 1.7967369556427002, "learning_rate": 6.359409865336927e-06, "loss": 0.4878, "num_input_tokens_seen": 61672984, "step": 106225 }, { "epoch": 15.822162645218945, "grad_norm": 1.0753638744354248, "learning_rate": 6.357244714450597e-06, "loss": 0.5728, "num_input_tokens_seen": 61675704, "step": 106230 }, { "epoch": 15.822907357759904, "grad_norm": 2.7119767665863037, "learning_rate": 6.355079878514661e-06, "loss": 0.6079, "num_input_tokens_seen": 61678712, "step": 106235 }, { "epoch": 15.823652070300863, "grad_norm": 1.748315691947937, "learning_rate": 6.352915357565712e-06, "loss": 0.6901, "num_input_tokens_seen": 61681816, "step": 106240 }, { "epoch": 15.824396782841823, "grad_norm": 1.2231197357177734, "learning_rate": 6.350751151640294e-06, "loss": 0.6179, "num_input_tokens_seen": 61684600, "step": 106245 }, { "epoch": 15.825141495382782, "grad_norm": 2.891317367553711, "learning_rate": 6.348587260774991e-06, "loss": 0.4745, "num_input_tokens_seen": 61687288, "step": 106250 }, { "epoch": 15.825886207923741, "grad_norm": 2.174826145172119, "learning_rate": 6.346423685006348e-06, "loss": 0.456, "num_input_tokens_seen": 61690072, "step": 106255 }, { "epoch": 15.826630920464702, "grad_norm": 3.89497971534729, "learning_rate": 6.344260424370912e-06, "loss": 0.7714, "num_input_tokens_seen": 61692792, "step": 106260 }, { "epoch": 15.82737563300566, "grad_norm": 1.7277016639709473, "learning_rate": 6.342097478905243e-06, "loss": 0.5813, "num_input_tokens_seen": 61696056, "step": 106265 }, { "epoch": 15.828120345546619, "grad_norm": 2.4291927814483643, "learning_rate": 6.339934848645868e-06, "loss": 0.6233, "num_input_tokens_seen": 61699128, "step": 106270 }, { "epoch": 15.828865058087578, "grad_norm": 1.2757608890533447, "learning_rate": 6.337772533629333e-06, "loss": 0.5204, "num_input_tokens_seen": 61702008, "step": 106275 }, { "epoch": 15.829609770628537, "grad_norm": 1.1197468042373657, "learning_rate": 6.335610533892156e-06, "loss": 0.587, "num_input_tokens_seen": 61705048, "step": 106280 }, { "epoch": 15.830354483169497, "grad_norm": 2.2063143253326416, "learning_rate": 6.33344884947088e-06, "loss": 0.4666, "num_input_tokens_seen": 61708216, "step": 106285 }, { "epoch": 15.831099195710456, "grad_norm": 0.996676504611969, "learning_rate": 6.331287480402012e-06, "loss": 0.629, "num_input_tokens_seen": 61710904, "step": 106290 }, { "epoch": 15.831843908251415, "grad_norm": 1.5374565124511719, "learning_rate": 6.329126426722068e-06, "loss": 0.5534, "num_input_tokens_seen": 61713720, "step": 106295 }, { "epoch": 15.832588620792373, "grad_norm": 1.433841586112976, "learning_rate": 6.326965688467557e-06, "loss": 0.5915, "num_input_tokens_seen": 61716888, "step": 106300 }, { "epoch": 15.833333333333334, "grad_norm": 2.307806968688965, "learning_rate": 6.324805265674974e-06, "loss": 0.6374, "num_input_tokens_seen": 61720024, "step": 106305 }, { "epoch": 15.834078045874293, "grad_norm": 0.9474868774414062, "learning_rate": 6.322645158380833e-06, "loss": 0.4711, "num_input_tokens_seen": 61723288, "step": 106310 }, { "epoch": 15.834822758415251, "grad_norm": 1.27878999710083, "learning_rate": 6.320485366621612e-06, "loss": 0.6396, "num_input_tokens_seen": 61726168, "step": 106315 }, { "epoch": 15.83556747095621, "grad_norm": 1.4951895475387573, "learning_rate": 6.318325890433813e-06, "loss": 0.7146, "num_input_tokens_seen": 61728824, "step": 106320 }, { "epoch": 15.83631218349717, "grad_norm": 0.8696302771568298, "learning_rate": 6.316166729853906e-06, "loss": 0.5219, "num_input_tokens_seen": 61731544, "step": 106325 }, { "epoch": 15.83705689603813, "grad_norm": 1.58018159866333, "learning_rate": 6.314007884918377e-06, "loss": 0.5907, "num_input_tokens_seen": 61734840, "step": 106330 }, { "epoch": 15.837801608579088, "grad_norm": 2.4243221282958984, "learning_rate": 6.311849355663693e-06, "loss": 0.7105, "num_input_tokens_seen": 61737656, "step": 106335 }, { "epoch": 15.838546321120047, "grad_norm": 2.6163253784179688, "learning_rate": 6.309691142126315e-06, "loss": 0.522, "num_input_tokens_seen": 61740792, "step": 106340 }, { "epoch": 15.839291033661008, "grad_norm": 2.36449933052063, "learning_rate": 6.307533244342717e-06, "loss": 0.571, "num_input_tokens_seen": 61743896, "step": 106345 }, { "epoch": 15.840035746201966, "grad_norm": 1.7110648155212402, "learning_rate": 6.305375662349344e-06, "loss": 0.6951, "num_input_tokens_seen": 61746488, "step": 106350 }, { "epoch": 15.840780458742925, "grad_norm": 1.0259652137756348, "learning_rate": 6.303218396182644e-06, "loss": 0.7461, "num_input_tokens_seen": 61749304, "step": 106355 }, { "epoch": 15.841525171283884, "grad_norm": 2.0914957523345947, "learning_rate": 6.301061445879072e-06, "loss": 0.5917, "num_input_tokens_seen": 61752536, "step": 106360 }, { "epoch": 15.842269883824844, "grad_norm": 3.3425452709198, "learning_rate": 6.298904811475062e-06, "loss": 0.698, "num_input_tokens_seen": 61755448, "step": 106365 }, { "epoch": 15.843014596365803, "grad_norm": 1.296473741531372, "learning_rate": 6.296748493007051e-06, "loss": 0.6305, "num_input_tokens_seen": 61758200, "step": 106370 }, { "epoch": 15.843759308906762, "grad_norm": 1.3759489059448242, "learning_rate": 6.294592490511456e-06, "loss": 0.4318, "num_input_tokens_seen": 61760920, "step": 106375 }, { "epoch": 15.84450402144772, "grad_norm": 2.316741466522217, "learning_rate": 6.292436804024715e-06, "loss": 0.6107, "num_input_tokens_seen": 61763800, "step": 106380 }, { "epoch": 15.845248733988681, "grad_norm": 1.136475682258606, "learning_rate": 6.290281433583237e-06, "loss": 0.4474, "num_input_tokens_seen": 61766488, "step": 106385 }, { "epoch": 15.84599344652964, "grad_norm": 0.8642442226409912, "learning_rate": 6.288126379223444e-06, "loss": 0.5294, "num_input_tokens_seen": 61769240, "step": 106390 }, { "epoch": 15.846738159070599, "grad_norm": 1.8400368690490723, "learning_rate": 6.285971640981731e-06, "loss": 0.6459, "num_input_tokens_seen": 61772056, "step": 106395 }, { "epoch": 15.847482871611557, "grad_norm": 0.7467383742332458, "learning_rate": 6.283817218894514e-06, "loss": 0.527, "num_input_tokens_seen": 61775000, "step": 106400 }, { "epoch": 15.848227584152518, "grad_norm": 2.771895170211792, "learning_rate": 6.281663112998174e-06, "loss": 0.546, "num_input_tokens_seen": 61778008, "step": 106405 }, { "epoch": 15.848972296693477, "grad_norm": 1.5552952289581299, "learning_rate": 6.2795093233291195e-06, "loss": 0.5533, "num_input_tokens_seen": 61781080, "step": 106410 }, { "epoch": 15.849717009234435, "grad_norm": 1.0298765897750854, "learning_rate": 6.27735584992373e-06, "loss": 0.4987, "num_input_tokens_seen": 61784184, "step": 106415 }, { "epoch": 15.850461721775394, "grad_norm": 1.7127174139022827, "learning_rate": 6.275202692818383e-06, "loss": 0.6741, "num_input_tokens_seen": 61787128, "step": 106420 }, { "epoch": 15.851206434316353, "grad_norm": 2.7279789447784424, "learning_rate": 6.2730498520494565e-06, "loss": 0.6429, "num_input_tokens_seen": 61789848, "step": 106425 }, { "epoch": 15.851951146857314, "grad_norm": 0.4447152316570282, "learning_rate": 6.27089732765331e-06, "loss": 0.509, "num_input_tokens_seen": 61792728, "step": 106430 }, { "epoch": 15.852695859398272, "grad_norm": 1.4311445951461792, "learning_rate": 6.2687451196663275e-06, "loss": 0.5579, "num_input_tokens_seen": 61795736, "step": 106435 }, { "epoch": 15.853440571939231, "grad_norm": 1.2265478372573853, "learning_rate": 6.266593228124851e-06, "loss": 0.618, "num_input_tokens_seen": 61798584, "step": 106440 }, { "epoch": 15.85418528448019, "grad_norm": 1.0820987224578857, "learning_rate": 6.264441653065248e-06, "loss": 0.5476, "num_input_tokens_seen": 61801720, "step": 106445 }, { "epoch": 15.85492999702115, "grad_norm": 1.1491105556488037, "learning_rate": 6.262290394523862e-06, "loss": 0.5265, "num_input_tokens_seen": 61804696, "step": 106450 }, { "epoch": 15.85567470956211, "grad_norm": 1.2767277956008911, "learning_rate": 6.260139452537028e-06, "loss": 0.6275, "num_input_tokens_seen": 61807736, "step": 106455 }, { "epoch": 15.856419422103068, "grad_norm": 1.3843603134155273, "learning_rate": 6.257988827141101e-06, "loss": 0.6279, "num_input_tokens_seen": 61810776, "step": 106460 }, { "epoch": 15.857164134644027, "grad_norm": 1.4107571840286255, "learning_rate": 6.255838518372395e-06, "loss": 0.7084, "num_input_tokens_seen": 61813656, "step": 106465 }, { "epoch": 15.857908847184987, "grad_norm": 1.4008055925369263, "learning_rate": 6.253688526267254e-06, "loss": 0.4065, "num_input_tokens_seen": 61816920, "step": 106470 }, { "epoch": 15.858653559725946, "grad_norm": 1.5658248662948608, "learning_rate": 6.251538850861985e-06, "loss": 0.5332, "num_input_tokens_seen": 61819608, "step": 106475 }, { "epoch": 15.859398272266905, "grad_norm": 2.3719966411590576, "learning_rate": 6.24938949219292e-06, "loss": 0.5428, "num_input_tokens_seen": 61822552, "step": 106480 }, { "epoch": 15.860142984807863, "grad_norm": 1.2745641469955444, "learning_rate": 6.2472404502963625e-06, "loss": 0.5688, "num_input_tokens_seen": 61825752, "step": 106485 }, { "epoch": 15.860887697348824, "grad_norm": 1.7291407585144043, "learning_rate": 6.245091725208616e-06, "loss": 0.6808, "num_input_tokens_seen": 61828504, "step": 106490 }, { "epoch": 15.861632409889783, "grad_norm": 1.3958288431167603, "learning_rate": 6.242943316965985e-06, "loss": 0.5332, "num_input_tokens_seen": 61831480, "step": 106495 }, { "epoch": 15.862377122430741, "grad_norm": 1.3136252164840698, "learning_rate": 6.2407952256047565e-06, "loss": 0.4616, "num_input_tokens_seen": 61834456, "step": 106500 }, { "epoch": 15.8631218349717, "grad_norm": 1.004774808883667, "learning_rate": 6.238647451161231e-06, "loss": 0.5723, "num_input_tokens_seen": 61837208, "step": 106505 }, { "epoch": 15.86386654751266, "grad_norm": 2.1559665203094482, "learning_rate": 6.2364999936716825e-06, "loss": 0.7049, "num_input_tokens_seen": 61840024, "step": 106510 }, { "epoch": 15.86461126005362, "grad_norm": 1.838285207748413, "learning_rate": 6.234352853172404e-06, "loss": 0.5124, "num_input_tokens_seen": 61842616, "step": 106515 }, { "epoch": 15.865355972594578, "grad_norm": 0.47023630142211914, "learning_rate": 6.232206029699655e-06, "loss": 0.4167, "num_input_tokens_seen": 61845464, "step": 106520 }, { "epoch": 15.866100685135537, "grad_norm": 0.8993905186653137, "learning_rate": 6.230059523289716e-06, "loss": 0.5469, "num_input_tokens_seen": 61848248, "step": 106525 }, { "epoch": 15.866845397676498, "grad_norm": 1.5497753620147705, "learning_rate": 6.227913333978847e-06, "loss": 0.5721, "num_input_tokens_seen": 61850968, "step": 106530 }, { "epoch": 15.867590110217456, "grad_norm": 1.3768163919448853, "learning_rate": 6.225767461803295e-06, "loss": 0.6617, "num_input_tokens_seen": 61853752, "step": 106535 }, { "epoch": 15.868334822758415, "grad_norm": 2.440610885620117, "learning_rate": 6.223621906799326e-06, "loss": 0.7567, "num_input_tokens_seen": 61856952, "step": 106540 }, { "epoch": 15.869079535299374, "grad_norm": 1.0990830659866333, "learning_rate": 6.221476669003176e-06, "loss": 0.5281, "num_input_tokens_seen": 61859992, "step": 106545 }, { "epoch": 15.869824247840334, "grad_norm": 2.553600549697876, "learning_rate": 6.219331748451096e-06, "loss": 0.5419, "num_input_tokens_seen": 61862936, "step": 106550 }, { "epoch": 15.870568960381293, "grad_norm": 2.4750893115997314, "learning_rate": 6.217187145179321e-06, "loss": 0.54, "num_input_tokens_seen": 61865912, "step": 106555 }, { "epoch": 15.871313672922252, "grad_norm": 1.3693434000015259, "learning_rate": 6.2150428592240795e-06, "loss": 0.5912, "num_input_tokens_seen": 61868440, "step": 106560 }, { "epoch": 15.87205838546321, "grad_norm": 1.550066351890564, "learning_rate": 6.212898890621588e-06, "loss": 0.6317, "num_input_tokens_seen": 61871448, "step": 106565 }, { "epoch": 15.872803098004171, "grad_norm": 1.0892524719238281, "learning_rate": 6.210755239408083e-06, "loss": 0.4686, "num_input_tokens_seen": 61874232, "step": 106570 }, { "epoch": 15.87354781054513, "grad_norm": 1.7964062690734863, "learning_rate": 6.208611905619774e-06, "loss": 0.4845, "num_input_tokens_seen": 61877144, "step": 106575 }, { "epoch": 15.874292523086089, "grad_norm": 1.215056300163269, "learning_rate": 6.20646888929286e-06, "loss": 0.6231, "num_input_tokens_seen": 61880056, "step": 106580 }, { "epoch": 15.875037235627047, "grad_norm": 1.8593456745147705, "learning_rate": 6.204326190463558e-06, "loss": 0.6042, "num_input_tokens_seen": 61882840, "step": 106585 }, { "epoch": 15.875781948168008, "grad_norm": 2.4844014644622803, "learning_rate": 6.202183809168058e-06, "loss": 0.5645, "num_input_tokens_seen": 61885880, "step": 106590 }, { "epoch": 15.876526660708967, "grad_norm": 1.310867428779602, "learning_rate": 6.200041745442561e-06, "loss": 0.5758, "num_input_tokens_seen": 61888664, "step": 106595 }, { "epoch": 15.877271373249926, "grad_norm": 1.3877897262573242, "learning_rate": 6.197899999323245e-06, "loss": 0.5023, "num_input_tokens_seen": 61891320, "step": 106600 }, { "epoch": 15.878016085790884, "grad_norm": 1.2780059576034546, "learning_rate": 6.195758570846308e-06, "loss": 0.6054, "num_input_tokens_seen": 61894296, "step": 106605 }, { "epoch": 15.878760798331843, "grad_norm": 1.0246195793151855, "learning_rate": 6.193617460047918e-06, "loss": 0.4999, "num_input_tokens_seen": 61897208, "step": 106610 }, { "epoch": 15.879505510872804, "grad_norm": 1.864925742149353, "learning_rate": 6.191476666964238e-06, "loss": 0.6393, "num_input_tokens_seen": 61900184, "step": 106615 }, { "epoch": 15.880250223413762, "grad_norm": 1.272074580192566, "learning_rate": 6.189336191631451e-06, "loss": 0.4844, "num_input_tokens_seen": 61902840, "step": 106620 }, { "epoch": 15.880994935954721, "grad_norm": 1.618100881576538, "learning_rate": 6.1871960340857126e-06, "loss": 0.6057, "num_input_tokens_seen": 61906104, "step": 106625 }, { "epoch": 15.88173964849568, "grad_norm": 1.54947829246521, "learning_rate": 6.185056194363176e-06, "loss": 0.6173, "num_input_tokens_seen": 61908888, "step": 106630 }, { "epoch": 15.88248436103664, "grad_norm": 1.9250930547714233, "learning_rate": 6.182916672499983e-06, "loss": 0.5222, "num_input_tokens_seen": 61911832, "step": 106635 }, { "epoch": 15.8832290735776, "grad_norm": 1.5621801614761353, "learning_rate": 6.180777468532298e-06, "loss": 0.4935, "num_input_tokens_seen": 61914776, "step": 106640 }, { "epoch": 15.883973786118558, "grad_norm": 1.9450069665908813, "learning_rate": 6.1786385824962436e-06, "loss": 0.5647, "num_input_tokens_seen": 61917432, "step": 106645 }, { "epoch": 15.884718498659517, "grad_norm": 1.8888185024261475, "learning_rate": 6.176500014427966e-06, "loss": 0.5778, "num_input_tokens_seen": 61920344, "step": 106650 }, { "epoch": 15.885463211200477, "grad_norm": 1.0117833614349365, "learning_rate": 6.17436176436359e-06, "loss": 0.6464, "num_input_tokens_seen": 61922904, "step": 106655 }, { "epoch": 15.886207923741436, "grad_norm": 1.601912498474121, "learning_rate": 6.1722238323392325e-06, "loss": 0.4417, "num_input_tokens_seen": 61926008, "step": 106660 }, { "epoch": 15.886952636282395, "grad_norm": 2.7525131702423096, "learning_rate": 6.1700862183910245e-06, "loss": 0.7408, "num_input_tokens_seen": 61928792, "step": 106665 }, { "epoch": 15.887697348823353, "grad_norm": 1.651214599609375, "learning_rate": 6.167948922555064e-06, "loss": 0.5707, "num_input_tokens_seen": 61931864, "step": 106670 }, { "epoch": 15.888442061364314, "grad_norm": 1.4377585649490356, "learning_rate": 6.165811944867475e-06, "loss": 0.6722, "num_input_tokens_seen": 61934648, "step": 106675 }, { "epoch": 15.889186773905273, "grad_norm": 1.308894395828247, "learning_rate": 6.163675285364348e-06, "loss": 0.7712, "num_input_tokens_seen": 61937528, "step": 106680 }, { "epoch": 15.889931486446232, "grad_norm": 1.272934079170227, "learning_rate": 6.161538944081779e-06, "loss": 0.6247, "num_input_tokens_seen": 61940760, "step": 106685 }, { "epoch": 15.89067619898719, "grad_norm": 1.8261737823486328, "learning_rate": 6.1594029210558675e-06, "loss": 0.3834, "num_input_tokens_seen": 61943800, "step": 106690 }, { "epoch": 15.89142091152815, "grad_norm": 1.5781267881393433, "learning_rate": 6.157267216322696e-06, "loss": 0.5543, "num_input_tokens_seen": 61946680, "step": 106695 }, { "epoch": 15.89216562406911, "grad_norm": 0.7491827011108398, "learning_rate": 6.155131829918345e-06, "loss": 0.422, "num_input_tokens_seen": 61949784, "step": 106700 }, { "epoch": 15.892910336610068, "grad_norm": 1.2295459508895874, "learning_rate": 6.1529967618788795e-06, "loss": 0.4646, "num_input_tokens_seen": 61953048, "step": 106705 }, { "epoch": 15.893655049151027, "grad_norm": 2.821054458618164, "learning_rate": 6.1508620122403885e-06, "loss": 0.6966, "num_input_tokens_seen": 61955736, "step": 106710 }, { "epoch": 15.894399761691988, "grad_norm": 2.071986675262451, "learning_rate": 6.148727581038915e-06, "loss": 0.5231, "num_input_tokens_seen": 61958904, "step": 106715 }, { "epoch": 15.895144474232946, "grad_norm": 1.6555696725845337, "learning_rate": 6.146593468310541e-06, "loss": 0.9054, "num_input_tokens_seen": 61962072, "step": 106720 }, { "epoch": 15.895889186773905, "grad_norm": 1.7747244834899902, "learning_rate": 6.144459674091299e-06, "loss": 0.775, "num_input_tokens_seen": 61965144, "step": 106725 }, { "epoch": 15.896633899314864, "grad_norm": 1.9604257345199585, "learning_rate": 6.1423261984172535e-06, "loss": 0.6205, "num_input_tokens_seen": 61968248, "step": 106730 }, { "epoch": 15.897378611855824, "grad_norm": 1.100189208984375, "learning_rate": 6.140193041324444e-06, "loss": 0.5499, "num_input_tokens_seen": 61971160, "step": 106735 }, { "epoch": 15.898123324396783, "grad_norm": 1.5498408079147339, "learning_rate": 6.138060202848894e-06, "loss": 0.6811, "num_input_tokens_seen": 61974008, "step": 106740 }, { "epoch": 15.898868036937742, "grad_norm": 1.0640016794204712, "learning_rate": 6.135927683026654e-06, "loss": 0.6429, "num_input_tokens_seen": 61976824, "step": 106745 }, { "epoch": 15.8996127494787, "grad_norm": 2.008004903793335, "learning_rate": 6.133795481893745e-06, "loss": 0.5239, "num_input_tokens_seen": 61979608, "step": 106750 }, { "epoch": 15.90035746201966, "grad_norm": 2.01533579826355, "learning_rate": 6.1316635994861875e-06, "loss": 0.8521, "num_input_tokens_seen": 61982488, "step": 106755 }, { "epoch": 15.90110217456062, "grad_norm": 1.4398632049560547, "learning_rate": 6.129532035839985e-06, "loss": 0.6128, "num_input_tokens_seen": 61985400, "step": 106760 }, { "epoch": 15.901846887101579, "grad_norm": 1.731142520904541, "learning_rate": 6.12740079099117e-06, "loss": 0.5957, "num_input_tokens_seen": 61988280, "step": 106765 }, { "epoch": 15.902591599642538, "grad_norm": 1.1211475133895874, "learning_rate": 6.12526986497573e-06, "loss": 0.4844, "num_input_tokens_seen": 61991096, "step": 106770 }, { "epoch": 15.903336312183498, "grad_norm": 1.586608648300171, "learning_rate": 6.12313925782968e-06, "loss": 0.6825, "num_input_tokens_seen": 61994008, "step": 106775 }, { "epoch": 15.904081024724457, "grad_norm": 0.6279308795928955, "learning_rate": 6.1210089695890065e-06, "loss": 0.5376, "num_input_tokens_seen": 61997240, "step": 106780 }, { "epoch": 15.904825737265416, "grad_norm": 1.867490530014038, "learning_rate": 6.11887900028969e-06, "loss": 0.5623, "num_input_tokens_seen": 62000184, "step": 106785 }, { "epoch": 15.905570449806374, "grad_norm": 1.3406002521514893, "learning_rate": 6.116749349967732e-06, "loss": 0.6501, "num_input_tokens_seen": 62003096, "step": 106790 }, { "epoch": 15.906315162347333, "grad_norm": 1.2351107597351074, "learning_rate": 6.114620018659093e-06, "loss": 0.5432, "num_input_tokens_seen": 62006296, "step": 106795 }, { "epoch": 15.907059874888294, "grad_norm": 3.216216564178467, "learning_rate": 6.112491006399762e-06, "loss": 0.6631, "num_input_tokens_seen": 62009336, "step": 106800 }, { "epoch": 15.907804587429252, "grad_norm": 1.5101542472839355, "learning_rate": 6.110362313225693e-06, "loss": 0.5198, "num_input_tokens_seen": 62012152, "step": 106805 }, { "epoch": 15.908549299970211, "grad_norm": 1.9397653341293335, "learning_rate": 6.108233939172858e-06, "loss": 0.6506, "num_input_tokens_seen": 62014776, "step": 106810 }, { "epoch": 15.90929401251117, "grad_norm": 1.6617732048034668, "learning_rate": 6.106105884277213e-06, "loss": 0.5446, "num_input_tokens_seen": 62017752, "step": 106815 }, { "epoch": 15.91003872505213, "grad_norm": 1.146293044090271, "learning_rate": 6.1039781485747045e-06, "loss": 0.6087, "num_input_tokens_seen": 62020600, "step": 106820 }, { "epoch": 15.91078343759309, "grad_norm": 1.680190086364746, "learning_rate": 6.101850732101283e-06, "loss": 0.6989, "num_input_tokens_seen": 62023384, "step": 106825 }, { "epoch": 15.911528150134048, "grad_norm": 1.3418676853179932, "learning_rate": 6.099723634892876e-06, "loss": 0.6124, "num_input_tokens_seen": 62026488, "step": 106830 }, { "epoch": 15.912272862675007, "grad_norm": 1.0661193132400513, "learning_rate": 6.097596856985435e-06, "loss": 0.6444, "num_input_tokens_seen": 62029080, "step": 106835 }, { "epoch": 15.913017575215967, "grad_norm": 1.6319386959075928, "learning_rate": 6.095470398414879e-06, "loss": 0.6914, "num_input_tokens_seen": 62032344, "step": 106840 }, { "epoch": 15.913762287756926, "grad_norm": 1.6694393157958984, "learning_rate": 6.093344259217143e-06, "loss": 0.6236, "num_input_tokens_seen": 62035224, "step": 106845 }, { "epoch": 15.914507000297885, "grad_norm": 0.5816614627838135, "learning_rate": 6.091218439428134e-06, "loss": 0.327, "num_input_tokens_seen": 62037976, "step": 106850 }, { "epoch": 15.915251712838844, "grad_norm": 1.1279149055480957, "learning_rate": 6.089092939083779e-06, "loss": 0.356, "num_input_tokens_seen": 62040888, "step": 106855 }, { "epoch": 15.915996425379804, "grad_norm": 2.0454602241516113, "learning_rate": 6.0869677582199775e-06, "loss": 0.7343, "num_input_tokens_seen": 62043576, "step": 106860 }, { "epoch": 15.916741137920763, "grad_norm": 1.967368483543396, "learning_rate": 6.084842896872625e-06, "loss": 0.5556, "num_input_tokens_seen": 62046776, "step": 106865 }, { "epoch": 15.917485850461722, "grad_norm": 1.3915860652923584, "learning_rate": 6.082718355077635e-06, "loss": 0.6981, "num_input_tokens_seen": 62049720, "step": 106870 }, { "epoch": 15.91823056300268, "grad_norm": 3.492180585861206, "learning_rate": 6.080594132870885e-06, "loss": 0.5387, "num_input_tokens_seen": 62052760, "step": 106875 }, { "epoch": 15.918975275543641, "grad_norm": 1.3678299188613892, "learning_rate": 6.078470230288274e-06, "loss": 0.6205, "num_input_tokens_seen": 62055768, "step": 106880 }, { "epoch": 15.9197199880846, "grad_norm": 1.3628475666046143, "learning_rate": 6.07634664736568e-06, "loss": 0.5552, "num_input_tokens_seen": 62058424, "step": 106885 }, { "epoch": 15.920464700625558, "grad_norm": 2.1918342113494873, "learning_rate": 6.074223384138977e-06, "loss": 0.5546, "num_input_tokens_seen": 62061272, "step": 106890 }, { "epoch": 15.921209413166517, "grad_norm": 1.909230351448059, "learning_rate": 6.072100440644033e-06, "loss": 0.7071, "num_input_tokens_seen": 62063960, "step": 106895 }, { "epoch": 15.921954125707478, "grad_norm": 3.132537603378296, "learning_rate": 6.069977816916705e-06, "loss": 0.5362, "num_input_tokens_seen": 62067128, "step": 106900 }, { "epoch": 15.922698838248436, "grad_norm": 1.5336021184921265, "learning_rate": 6.067855512992873e-06, "loss": 0.7576, "num_input_tokens_seen": 62069912, "step": 106905 }, { "epoch": 15.923443550789395, "grad_norm": 1.9851012229919434, "learning_rate": 6.065733528908371e-06, "loss": 0.6436, "num_input_tokens_seen": 62072760, "step": 106910 }, { "epoch": 15.924188263330354, "grad_norm": 1.5352271795272827, "learning_rate": 6.0636118646990644e-06, "loss": 0.5203, "num_input_tokens_seen": 62075608, "step": 106915 }, { "epoch": 15.924932975871315, "grad_norm": 2.912662982940674, "learning_rate": 6.061490520400784e-06, "loss": 0.7641, "num_input_tokens_seen": 62079032, "step": 106920 }, { "epoch": 15.925677688412273, "grad_norm": 2.1720428466796875, "learning_rate": 6.059369496049377e-06, "loss": 0.6603, "num_input_tokens_seen": 62081848, "step": 106925 }, { "epoch": 15.926422400953232, "grad_norm": 2.5336718559265137, "learning_rate": 6.05724879168067e-06, "loss": 0.5909, "num_input_tokens_seen": 62084568, "step": 106930 }, { "epoch": 15.92716711349419, "grad_norm": 1.8251726627349854, "learning_rate": 6.055128407330493e-06, "loss": 0.6487, "num_input_tokens_seen": 62087544, "step": 106935 }, { "epoch": 15.92791182603515, "grad_norm": 1.8736188411712646, "learning_rate": 6.053008343034671e-06, "loss": 0.6342, "num_input_tokens_seen": 62090488, "step": 106940 }, { "epoch": 15.92865653857611, "grad_norm": 1.274552583694458, "learning_rate": 6.0508885988290075e-06, "loss": 0.5313, "num_input_tokens_seen": 62093336, "step": 106945 }, { "epoch": 15.929401251117069, "grad_norm": 1.757529616355896, "learning_rate": 6.04876917474933e-06, "loss": 0.4994, "num_input_tokens_seen": 62096088, "step": 106950 }, { "epoch": 15.930145963658028, "grad_norm": 1.2619653940200806, "learning_rate": 6.046650070831436e-06, "loss": 0.5138, "num_input_tokens_seen": 62098968, "step": 106955 }, { "epoch": 15.930890676198988, "grad_norm": 1.7977231740951538, "learning_rate": 6.044531287111124e-06, "loss": 0.7431, "num_input_tokens_seen": 62101880, "step": 106960 }, { "epoch": 15.931635388739947, "grad_norm": 1.6682442426681519, "learning_rate": 6.042412823624186e-06, "loss": 0.6343, "num_input_tokens_seen": 62104824, "step": 106965 }, { "epoch": 15.932380101280906, "grad_norm": 1.3216403722763062, "learning_rate": 6.040294680406422e-06, "loss": 0.5244, "num_input_tokens_seen": 62107768, "step": 106970 }, { "epoch": 15.933124813821864, "grad_norm": 1.6096110343933105, "learning_rate": 6.0381768574936104e-06, "loss": 0.6192, "num_input_tokens_seen": 62110520, "step": 106975 }, { "epoch": 15.933869526362823, "grad_norm": 2.138341188430786, "learning_rate": 6.03605935492152e-06, "loss": 0.4669, "num_input_tokens_seen": 62113272, "step": 106980 }, { "epoch": 15.934614238903784, "grad_norm": 1.5621328353881836, "learning_rate": 6.0339421727259395e-06, "loss": 0.5521, "num_input_tokens_seen": 62116152, "step": 106985 }, { "epoch": 15.935358951444742, "grad_norm": 1.6238315105438232, "learning_rate": 6.031825310942624e-06, "loss": 0.5399, "num_input_tokens_seen": 62119192, "step": 106990 }, { "epoch": 15.936103663985701, "grad_norm": 1.023168921470642, "learning_rate": 6.029708769607348e-06, "loss": 0.546, "num_input_tokens_seen": 62122232, "step": 106995 }, { "epoch": 15.93684837652666, "grad_norm": 1.5473564863204956, "learning_rate": 6.027592548755853e-06, "loss": 0.5684, "num_input_tokens_seen": 62124952, "step": 107000 }, { "epoch": 15.93759308906762, "grad_norm": 5.787594795227051, "learning_rate": 6.025476648423908e-06, "loss": 0.6387, "num_input_tokens_seen": 62127832, "step": 107005 }, { "epoch": 15.93833780160858, "grad_norm": 0.9275864362716675, "learning_rate": 6.023361068647251e-06, "loss": 0.4536, "num_input_tokens_seen": 62131000, "step": 107010 }, { "epoch": 15.939082514149538, "grad_norm": 2.0957753658294678, "learning_rate": 6.021245809461615e-06, "loss": 0.4573, "num_input_tokens_seen": 62133816, "step": 107015 }, { "epoch": 15.939827226690497, "grad_norm": 1.4687044620513916, "learning_rate": 6.0191308709027475e-06, "loss": 0.6472, "num_input_tokens_seen": 62136792, "step": 107020 }, { "epoch": 15.940571939231457, "grad_norm": 1.6300421953201294, "learning_rate": 6.017016253006372e-06, "loss": 0.7512, "num_input_tokens_seen": 62139576, "step": 107025 }, { "epoch": 15.941316651772416, "grad_norm": 1.2532762289047241, "learning_rate": 6.014901955808216e-06, "loss": 0.6282, "num_input_tokens_seen": 62142424, "step": 107030 }, { "epoch": 15.942061364313375, "grad_norm": 1.2378438711166382, "learning_rate": 6.012787979343987e-06, "loss": 0.5524, "num_input_tokens_seen": 62145272, "step": 107035 }, { "epoch": 15.942806076854334, "grad_norm": 1.3144257068634033, "learning_rate": 6.010674323649415e-06, "loss": 0.6016, "num_input_tokens_seen": 62148248, "step": 107040 }, { "epoch": 15.943550789395294, "grad_norm": 1.0875135660171509, "learning_rate": 6.0085609887601925e-06, "loss": 0.7356, "num_input_tokens_seen": 62150968, "step": 107045 }, { "epoch": 15.944295501936253, "grad_norm": 1.4092024564743042, "learning_rate": 6.0064479747120375e-06, "loss": 0.6309, "num_input_tokens_seen": 62153912, "step": 107050 }, { "epoch": 15.945040214477212, "grad_norm": 1.9523428678512573, "learning_rate": 6.004335281540641e-06, "loss": 0.724, "num_input_tokens_seen": 62157272, "step": 107055 }, { "epoch": 15.94578492701817, "grad_norm": 0.8090953230857849, "learning_rate": 6.002222909281685e-06, "loss": 0.4384, "num_input_tokens_seen": 62160024, "step": 107060 }, { "epoch": 15.946529639559131, "grad_norm": 1.7161651849746704, "learning_rate": 6.000110857970873e-06, "loss": 0.5445, "num_input_tokens_seen": 62162936, "step": 107065 }, { "epoch": 15.94727435210009, "grad_norm": 1.9080519676208496, "learning_rate": 5.9979991276438695e-06, "loss": 0.6123, "num_input_tokens_seen": 62165880, "step": 107070 }, { "epoch": 15.948019064641048, "grad_norm": 1.915419340133667, "learning_rate": 5.995887718336363e-06, "loss": 0.6439, "num_input_tokens_seen": 62168632, "step": 107075 }, { "epoch": 15.948763777182007, "grad_norm": 1.8990371227264404, "learning_rate": 5.993776630084022e-06, "loss": 0.6579, "num_input_tokens_seen": 62171576, "step": 107080 }, { "epoch": 15.949508489722968, "grad_norm": 2.551436424255371, "learning_rate": 5.991665862922505e-06, "loss": 0.6843, "num_input_tokens_seen": 62174488, "step": 107085 }, { "epoch": 15.950253202263927, "grad_norm": 4.169955253601074, "learning_rate": 5.989555416887469e-06, "loss": 0.4307, "num_input_tokens_seen": 62177112, "step": 107090 }, { "epoch": 15.950997914804885, "grad_norm": 2.662062883377075, "learning_rate": 5.987445292014579e-06, "loss": 0.4298, "num_input_tokens_seen": 62179928, "step": 107095 }, { "epoch": 15.951742627345844, "grad_norm": 0.8304665088653564, "learning_rate": 5.985335488339477e-06, "loss": 0.555, "num_input_tokens_seen": 62182872, "step": 107100 }, { "epoch": 15.952487339886805, "grad_norm": 2.8599610328674316, "learning_rate": 5.983226005897799e-06, "loss": 0.6743, "num_input_tokens_seen": 62185848, "step": 107105 }, { "epoch": 15.953232052427763, "grad_norm": 1.9197546243667603, "learning_rate": 5.981116844725199e-06, "loss": 0.6335, "num_input_tokens_seen": 62188632, "step": 107110 }, { "epoch": 15.953976764968722, "grad_norm": 1.1648132801055908, "learning_rate": 5.979008004857292e-06, "loss": 0.4048, "num_input_tokens_seen": 62191480, "step": 107115 }, { "epoch": 15.95472147750968, "grad_norm": 1.4989038705825806, "learning_rate": 5.976899486329718e-06, "loss": 0.4314, "num_input_tokens_seen": 62194488, "step": 107120 }, { "epoch": 15.95546619005064, "grad_norm": 1.9305375814437866, "learning_rate": 5.974791289178089e-06, "loss": 0.6406, "num_input_tokens_seen": 62197304, "step": 107125 }, { "epoch": 15.9562109025916, "grad_norm": 2.2003865242004395, "learning_rate": 5.972683413438029e-06, "loss": 0.6049, "num_input_tokens_seen": 62200344, "step": 107130 }, { "epoch": 15.956955615132559, "grad_norm": 1.0316189527511597, "learning_rate": 5.970575859145144e-06, "loss": 0.4599, "num_input_tokens_seen": 62203224, "step": 107135 }, { "epoch": 15.957700327673518, "grad_norm": 1.4847136735916138, "learning_rate": 5.968468626335033e-06, "loss": 0.384, "num_input_tokens_seen": 62206072, "step": 107140 }, { "epoch": 15.958445040214476, "grad_norm": 1.3856264352798462, "learning_rate": 5.966361715043312e-06, "loss": 0.4597, "num_input_tokens_seen": 62209080, "step": 107145 }, { "epoch": 15.959189752755437, "grad_norm": 1.4506422281265259, "learning_rate": 5.96425512530556e-06, "loss": 0.5902, "num_input_tokens_seen": 62211832, "step": 107150 }, { "epoch": 15.959934465296396, "grad_norm": 1.0102977752685547, "learning_rate": 5.962148857157373e-06, "loss": 0.8332, "num_input_tokens_seen": 62214680, "step": 107155 }, { "epoch": 15.960679177837354, "grad_norm": 1.0165622234344482, "learning_rate": 5.960042910634325e-06, "loss": 0.5394, "num_input_tokens_seen": 62217432, "step": 107160 }, { "epoch": 15.961423890378313, "grad_norm": 2.3527090549468994, "learning_rate": 5.9579372857720085e-06, "loss": 0.6092, "num_input_tokens_seen": 62220280, "step": 107165 }, { "epoch": 15.962168602919274, "grad_norm": 2.2687106132507324, "learning_rate": 5.9558319826059775e-06, "loss": 0.5052, "num_input_tokens_seen": 62223160, "step": 107170 }, { "epoch": 15.962913315460233, "grad_norm": 1.0066466331481934, "learning_rate": 5.953727001171819e-06, "loss": 0.3621, "num_input_tokens_seen": 62226104, "step": 107175 }, { "epoch": 15.963658028001191, "grad_norm": 2.1422438621520996, "learning_rate": 5.951622341505086e-06, "loss": 0.5018, "num_input_tokens_seen": 62229048, "step": 107180 }, { "epoch": 15.96440274054215, "grad_norm": 2.1532137393951416, "learning_rate": 5.949518003641325e-06, "loss": 0.4462, "num_input_tokens_seen": 62231800, "step": 107185 }, { "epoch": 15.96514745308311, "grad_norm": 2.5200555324554443, "learning_rate": 5.947413987616105e-06, "loss": 0.8651, "num_input_tokens_seen": 62234584, "step": 107190 }, { "epoch": 15.96589216562407, "grad_norm": 1.85788094997406, "learning_rate": 5.94531029346495e-06, "loss": 0.7026, "num_input_tokens_seen": 62237432, "step": 107195 }, { "epoch": 15.966636878165028, "grad_norm": 2.218726873397827, "learning_rate": 5.943206921223421e-06, "loss": 0.5847, "num_input_tokens_seen": 62240344, "step": 107200 }, { "epoch": 15.967381590705987, "grad_norm": 2.0317821502685547, "learning_rate": 5.9411038709270365e-06, "loss": 0.6233, "num_input_tokens_seen": 62243320, "step": 107205 }, { "epoch": 15.968126303246947, "grad_norm": 1.5241726636886597, "learning_rate": 5.939001142611336e-06, "loss": 0.5968, "num_input_tokens_seen": 62246392, "step": 107210 }, { "epoch": 15.968871015787906, "grad_norm": 1.7204256057739258, "learning_rate": 5.9368987363118415e-06, "loss": 0.6679, "num_input_tokens_seen": 62249336, "step": 107215 }, { "epoch": 15.969615728328865, "grad_norm": 4.363892555236816, "learning_rate": 5.934796652064065e-06, "loss": 0.6696, "num_input_tokens_seen": 62252152, "step": 107220 }, { "epoch": 15.970360440869824, "grad_norm": 2.2445919513702393, "learning_rate": 5.932694889903523e-06, "loss": 0.6249, "num_input_tokens_seen": 62254840, "step": 107225 }, { "epoch": 15.971105153410784, "grad_norm": 2.204798460006714, "learning_rate": 5.930593449865715e-06, "loss": 0.5587, "num_input_tokens_seen": 62257816, "step": 107230 }, { "epoch": 15.971849865951743, "grad_norm": 2.1233770847320557, "learning_rate": 5.928492331986155e-06, "loss": 0.7055, "num_input_tokens_seen": 62260728, "step": 107235 }, { "epoch": 15.972594578492702, "grad_norm": 1.3735262155532837, "learning_rate": 5.9263915363003294e-06, "loss": 0.5541, "num_input_tokens_seen": 62263416, "step": 107240 }, { "epoch": 15.97333929103366, "grad_norm": 1.8253358602523804, "learning_rate": 5.924291062843737e-06, "loss": 0.5761, "num_input_tokens_seen": 62266552, "step": 107245 }, { "epoch": 15.974084003574621, "grad_norm": 2.3405656814575195, "learning_rate": 5.922190911651857e-06, "loss": 0.822, "num_input_tokens_seen": 62269464, "step": 107250 }, { "epoch": 15.97482871611558, "grad_norm": 2.2284047603607178, "learning_rate": 5.920091082760174e-06, "loss": 0.5639, "num_input_tokens_seen": 62272408, "step": 107255 }, { "epoch": 15.975573428656539, "grad_norm": 1.3235396146774292, "learning_rate": 5.917991576204163e-06, "loss": 0.671, "num_input_tokens_seen": 62275256, "step": 107260 }, { "epoch": 15.976318141197497, "grad_norm": 2.880000114440918, "learning_rate": 5.915892392019282e-06, "loss": 0.5736, "num_input_tokens_seen": 62278264, "step": 107265 }, { "epoch": 15.977062853738456, "grad_norm": 1.82926344871521, "learning_rate": 5.913793530241011e-06, "loss": 0.7268, "num_input_tokens_seen": 62281016, "step": 107270 }, { "epoch": 15.977807566279417, "grad_norm": 1.284600853919983, "learning_rate": 5.91169499090479e-06, "loss": 0.7312, "num_input_tokens_seen": 62283608, "step": 107275 }, { "epoch": 15.978552278820375, "grad_norm": 1.40542471408844, "learning_rate": 5.909596774046092e-06, "loss": 0.5236, "num_input_tokens_seen": 62286584, "step": 107280 }, { "epoch": 15.979296991361334, "grad_norm": 1.6956545114517212, "learning_rate": 5.907498879700352e-06, "loss": 0.7075, "num_input_tokens_seen": 62289496, "step": 107285 }, { "epoch": 15.980041703902295, "grad_norm": 1.6156331300735474, "learning_rate": 5.905401307903013e-06, "loss": 0.6181, "num_input_tokens_seen": 62292536, "step": 107290 }, { "epoch": 15.980786416443253, "grad_norm": 2.045881509780884, "learning_rate": 5.903304058689507e-06, "loss": 0.6457, "num_input_tokens_seen": 62295608, "step": 107295 }, { "epoch": 15.981531128984212, "grad_norm": 1.6637805700302124, "learning_rate": 5.901207132095276e-06, "loss": 0.6487, "num_input_tokens_seen": 62299256, "step": 107300 }, { "epoch": 15.982275841525171, "grad_norm": 1.013018250465393, "learning_rate": 5.899110528155741e-06, "loss": 0.5075, "num_input_tokens_seen": 62302328, "step": 107305 }, { "epoch": 15.98302055406613, "grad_norm": 1.9269204139709473, "learning_rate": 5.897014246906312e-06, "loss": 0.4987, "num_input_tokens_seen": 62305464, "step": 107310 }, { "epoch": 15.98376526660709, "grad_norm": 1.4821641445159912, "learning_rate": 5.894918288382417e-06, "loss": 0.5765, "num_input_tokens_seen": 62308376, "step": 107315 }, { "epoch": 15.984509979148049, "grad_norm": 2.837967872619629, "learning_rate": 5.8928226526194565e-06, "loss": 0.6988, "num_input_tokens_seen": 62311096, "step": 107320 }, { "epoch": 15.985254691689008, "grad_norm": 2.0527396202087402, "learning_rate": 5.890727339652843e-06, "loss": 0.4973, "num_input_tokens_seen": 62314008, "step": 107325 }, { "epoch": 15.985999404229966, "grad_norm": 2.8861331939697266, "learning_rate": 5.888632349517962e-06, "loss": 0.5609, "num_input_tokens_seen": 62317112, "step": 107330 }, { "epoch": 15.986744116770927, "grad_norm": 2.1828269958496094, "learning_rate": 5.886537682250221e-06, "loss": 0.6462, "num_input_tokens_seen": 62319928, "step": 107335 }, { "epoch": 15.987488829311886, "grad_norm": 0.9549733400344849, "learning_rate": 5.8844433378849986e-06, "loss": 0.5722, "num_input_tokens_seen": 62322872, "step": 107340 }, { "epoch": 15.988233541852845, "grad_norm": 1.6678224802017212, "learning_rate": 5.882349316457672e-06, "loss": 0.5365, "num_input_tokens_seen": 62325912, "step": 107345 }, { "epoch": 15.988978254393803, "grad_norm": 1.180429220199585, "learning_rate": 5.88025561800363e-06, "loss": 0.7296, "num_input_tokens_seen": 62328952, "step": 107350 }, { "epoch": 15.989722966934764, "grad_norm": 1.4771833419799805, "learning_rate": 5.878162242558239e-06, "loss": 0.6006, "num_input_tokens_seen": 62331960, "step": 107355 }, { "epoch": 15.990467679475723, "grad_norm": 0.7813927531242371, "learning_rate": 5.87606919015686e-06, "loss": 0.5819, "num_input_tokens_seen": 62334648, "step": 107360 }, { "epoch": 15.991212392016681, "grad_norm": 1.4501334428787231, "learning_rate": 5.873976460834848e-06, "loss": 0.6429, "num_input_tokens_seen": 62337464, "step": 107365 }, { "epoch": 15.99195710455764, "grad_norm": 1.5580424070358276, "learning_rate": 5.871884054627571e-06, "loss": 0.5289, "num_input_tokens_seen": 62340408, "step": 107370 }, { "epoch": 15.9927018170986, "grad_norm": 1.3757692575454712, "learning_rate": 5.869791971570368e-06, "loss": 0.8257, "num_input_tokens_seen": 62343288, "step": 107375 }, { "epoch": 15.99344652963956, "grad_norm": 1.6860060691833496, "learning_rate": 5.867700211698593e-06, "loss": 0.6358, "num_input_tokens_seen": 62346360, "step": 107380 }, { "epoch": 15.994191242180518, "grad_norm": 2.69301438331604, "learning_rate": 5.8656087750475765e-06, "loss": 0.7532, "num_input_tokens_seen": 62349432, "step": 107385 }, { "epoch": 15.994935954721477, "grad_norm": 1.3305946588516235, "learning_rate": 5.863517661652645e-06, "loss": 0.5832, "num_input_tokens_seen": 62352504, "step": 107390 }, { "epoch": 15.995680667262437, "grad_norm": 1.6966564655303955, "learning_rate": 5.861426871549142e-06, "loss": 0.6146, "num_input_tokens_seen": 62355448, "step": 107395 }, { "epoch": 15.996425379803396, "grad_norm": 1.2946540117263794, "learning_rate": 5.859336404772372e-06, "loss": 0.5361, "num_input_tokens_seen": 62358392, "step": 107400 }, { "epoch": 15.997170092344355, "grad_norm": 2.1838419437408447, "learning_rate": 5.857246261357666e-06, "loss": 0.5847, "num_input_tokens_seen": 62361144, "step": 107405 }, { "epoch": 15.997914804885314, "grad_norm": 3.5014026165008545, "learning_rate": 5.855156441340331e-06, "loss": 0.7401, "num_input_tokens_seen": 62363768, "step": 107410 }, { "epoch": 15.998659517426274, "grad_norm": 1.7978547811508179, "learning_rate": 5.853066944755667e-06, "loss": 0.5314, "num_input_tokens_seen": 62366360, "step": 107415 }, { "epoch": 15.999404229967233, "grad_norm": 1.4032663106918335, "learning_rate": 5.8509777716389715e-06, "loss": 0.373, "num_input_tokens_seen": 62369464, "step": 107420 }, { "epoch": 16.0, "eval_loss": 0.657461941242218, "eval_runtime": 47.0015, "eval_samples_per_second": 63.487, "eval_steps_per_second": 15.872, "num_input_tokens_seen": 62371472, "step": 107424 }, { "epoch": 16.000148942508194, "grad_norm": 0.9630569815635681, "learning_rate": 5.848888922025553e-06, "loss": 0.4822, "num_input_tokens_seen": 62372016, "step": 107425 }, { "epoch": 16.00089365504915, "grad_norm": 0.8765626549720764, "learning_rate": 5.8468003959506915e-06, "loss": 0.4956, "num_input_tokens_seen": 62374608, "step": 107430 }, { "epoch": 16.00163836759011, "grad_norm": 1.7221115827560425, "learning_rate": 5.844712193449662e-06, "loss": 0.5425, "num_input_tokens_seen": 62377584, "step": 107435 }, { "epoch": 16.002383080131068, "grad_norm": 0.6308751702308655, "learning_rate": 5.842624314557757e-06, "loss": 0.5494, "num_input_tokens_seen": 62380464, "step": 107440 }, { "epoch": 16.00312779267203, "grad_norm": 2.3536875247955322, "learning_rate": 5.840536759310239e-06, "loss": 0.6239, "num_input_tokens_seen": 62383312, "step": 107445 }, { "epoch": 16.00387250521299, "grad_norm": 0.8917315006256104, "learning_rate": 5.838449527742388e-06, "loss": 0.5754, "num_input_tokens_seen": 62386320, "step": 107450 }, { "epoch": 16.004617217753946, "grad_norm": 1.8197687864303589, "learning_rate": 5.836362619889446e-06, "loss": 0.6611, "num_input_tokens_seen": 62389168, "step": 107455 }, { "epoch": 16.005361930294907, "grad_norm": 1.2287745475769043, "learning_rate": 5.83427603578669e-06, "loss": 0.3817, "num_input_tokens_seen": 62392144, "step": 107460 }, { "epoch": 16.006106642835864, "grad_norm": 1.8615862131118774, "learning_rate": 5.832189775469363e-06, "loss": 0.5402, "num_input_tokens_seen": 62395568, "step": 107465 }, { "epoch": 16.006851355376824, "grad_norm": 2.009418249130249, "learning_rate": 5.8301038389727005e-06, "loss": 0.5781, "num_input_tokens_seen": 62398448, "step": 107470 }, { "epoch": 16.007596067917785, "grad_norm": 2.0282719135284424, "learning_rate": 5.8280182263319545e-06, "loss": 0.5962, "num_input_tokens_seen": 62401232, "step": 107475 }, { "epoch": 16.00834078045874, "grad_norm": 2.7536468505859375, "learning_rate": 5.825932937582357e-06, "loss": 0.7592, "num_input_tokens_seen": 62404016, "step": 107480 }, { "epoch": 16.009085492999702, "grad_norm": 1.4207872152328491, "learning_rate": 5.823847972759136e-06, "loss": 0.5548, "num_input_tokens_seen": 62406800, "step": 107485 }, { "epoch": 16.009830205540663, "grad_norm": 1.494655966758728, "learning_rate": 5.821763331897503e-06, "loss": 0.5515, "num_input_tokens_seen": 62409648, "step": 107490 }, { "epoch": 16.01057491808162, "grad_norm": 0.9809096455574036, "learning_rate": 5.819679015032697e-06, "loss": 0.4895, "num_input_tokens_seen": 62412624, "step": 107495 }, { "epoch": 16.01131963062258, "grad_norm": 2.41715407371521, "learning_rate": 5.81759502219992e-06, "loss": 0.4853, "num_input_tokens_seen": 62415536, "step": 107500 }, { "epoch": 16.012064343163537, "grad_norm": 2.4236364364624023, "learning_rate": 5.815511353434372e-06, "loss": 0.7357, "num_input_tokens_seen": 62418448, "step": 107505 }, { "epoch": 16.012809055704498, "grad_norm": 2.2391605377197266, "learning_rate": 5.813428008771266e-06, "loss": 0.6657, "num_input_tokens_seen": 62421136, "step": 107510 }, { "epoch": 16.01355376824546, "grad_norm": 1.6435160636901855, "learning_rate": 5.811344988245787e-06, "loss": 0.7721, "num_input_tokens_seen": 62424400, "step": 107515 }, { "epoch": 16.014298480786415, "grad_norm": 1.6567133665084839, "learning_rate": 5.809262291893141e-06, "loss": 0.6331, "num_input_tokens_seen": 62427152, "step": 107520 }, { "epoch": 16.015043193327376, "grad_norm": 1.6300040483474731, "learning_rate": 5.807179919748496e-06, "loss": 0.5872, "num_input_tokens_seen": 62429936, "step": 107525 }, { "epoch": 16.015787905868336, "grad_norm": 1.810653805732727, "learning_rate": 5.805097871847046e-06, "loss": 0.7922, "num_input_tokens_seen": 62432976, "step": 107530 }, { "epoch": 16.016532618409293, "grad_norm": 0.9549045562744141, "learning_rate": 5.803016148223953e-06, "loss": 0.5075, "num_input_tokens_seen": 62435568, "step": 107535 }, { "epoch": 16.017277330950254, "grad_norm": 2.1561055183410645, "learning_rate": 5.800934748914397e-06, "loss": 0.6081, "num_input_tokens_seen": 62438288, "step": 107540 }, { "epoch": 16.01802204349121, "grad_norm": 1.381911039352417, "learning_rate": 5.798853673953536e-06, "loss": 0.5443, "num_input_tokens_seen": 62441264, "step": 107545 }, { "epoch": 16.01876675603217, "grad_norm": 1.4083280563354492, "learning_rate": 5.796772923376526e-06, "loss": 0.5134, "num_input_tokens_seen": 62444432, "step": 107550 }, { "epoch": 16.019511468573132, "grad_norm": 1.3271749019622803, "learning_rate": 5.794692497218521e-06, "loss": 0.6759, "num_input_tokens_seen": 62447440, "step": 107555 }, { "epoch": 16.02025618111409, "grad_norm": 1.2623413801193237, "learning_rate": 5.79261239551466e-06, "loss": 0.9915, "num_input_tokens_seen": 62450224, "step": 107560 }, { "epoch": 16.02100089365505, "grad_norm": 2.4788150787353516, "learning_rate": 5.790532618300099e-06, "loss": 0.8858, "num_input_tokens_seen": 62452912, "step": 107565 }, { "epoch": 16.02174560619601, "grad_norm": 3.870495319366455, "learning_rate": 5.788453165609955e-06, "loss": 0.7386, "num_input_tokens_seen": 62456208, "step": 107570 }, { "epoch": 16.022490318736967, "grad_norm": 1.2350797653198242, "learning_rate": 5.786374037479381e-06, "loss": 0.5175, "num_input_tokens_seen": 62459344, "step": 107575 }, { "epoch": 16.023235031277927, "grad_norm": 2.1536264419555664, "learning_rate": 5.784295233943488e-06, "loss": 0.6399, "num_input_tokens_seen": 62462448, "step": 107580 }, { "epoch": 16.023979743818884, "grad_norm": 1.4682869911193848, "learning_rate": 5.7822167550373865e-06, "loss": 0.6031, "num_input_tokens_seen": 62465264, "step": 107585 }, { "epoch": 16.024724456359845, "grad_norm": 1.3090533018112183, "learning_rate": 5.780138600796212e-06, "loss": 0.651, "num_input_tokens_seen": 62468496, "step": 107590 }, { "epoch": 16.025469168900806, "grad_norm": 1.0157830715179443, "learning_rate": 5.778060771255053e-06, "loss": 0.6497, "num_input_tokens_seen": 62471696, "step": 107595 }, { "epoch": 16.026213881441763, "grad_norm": 4.182753086090088, "learning_rate": 5.775983266449029e-06, "loss": 0.5998, "num_input_tokens_seen": 62474512, "step": 107600 }, { "epoch": 16.026958593982723, "grad_norm": 0.8931442499160767, "learning_rate": 5.773906086413222e-06, "loss": 0.5123, "num_input_tokens_seen": 62477424, "step": 107605 }, { "epoch": 16.027703306523684, "grad_norm": 1.928075909614563, "learning_rate": 5.771829231182737e-06, "loss": 0.5055, "num_input_tokens_seen": 62480240, "step": 107610 }, { "epoch": 16.02844801906464, "grad_norm": 1.3153187036514282, "learning_rate": 5.769752700792655e-06, "loss": 0.7857, "num_input_tokens_seen": 62483056, "step": 107615 }, { "epoch": 16.0291927316056, "grad_norm": 2.9734854698181152, "learning_rate": 5.767676495278057e-06, "loss": 0.5498, "num_input_tokens_seen": 62486032, "step": 107620 }, { "epoch": 16.029937444146558, "grad_norm": 2.2535927295684814, "learning_rate": 5.765600614674019e-06, "loss": 0.4797, "num_input_tokens_seen": 62488880, "step": 107625 }, { "epoch": 16.03068215668752, "grad_norm": 2.916853189468384, "learning_rate": 5.763525059015601e-06, "loss": 0.629, "num_input_tokens_seen": 62491952, "step": 107630 }, { "epoch": 16.03142686922848, "grad_norm": 2.187856912612915, "learning_rate": 5.761449828337881e-06, "loss": 0.5607, "num_input_tokens_seen": 62494800, "step": 107635 }, { "epoch": 16.032171581769436, "grad_norm": 2.2424278259277344, "learning_rate": 5.759374922675908e-06, "loss": 0.608, "num_input_tokens_seen": 62497584, "step": 107640 }, { "epoch": 16.032916294310397, "grad_norm": 0.9102573990821838, "learning_rate": 5.757300342064748e-06, "loss": 0.5789, "num_input_tokens_seen": 62501136, "step": 107645 }, { "epoch": 16.033661006851354, "grad_norm": 1.3768446445465088, "learning_rate": 5.755226086539433e-06, "loss": 0.6887, "num_input_tokens_seen": 62504112, "step": 107650 }, { "epoch": 16.034405719392314, "grad_norm": 1.32047700881958, "learning_rate": 5.753152156135022e-06, "loss": 0.7141, "num_input_tokens_seen": 62506640, "step": 107655 }, { "epoch": 16.035150431933275, "grad_norm": 1.0138620138168335, "learning_rate": 5.751078550886543e-06, "loss": 0.6546, "num_input_tokens_seen": 62509392, "step": 107660 }, { "epoch": 16.03589514447423, "grad_norm": 1.9190406799316406, "learning_rate": 5.749005270829022e-06, "loss": 0.6907, "num_input_tokens_seen": 62512464, "step": 107665 }, { "epoch": 16.036639857015192, "grad_norm": 1.0594605207443237, "learning_rate": 5.746932315997497e-06, "loss": 0.5758, "num_input_tokens_seen": 62515312, "step": 107670 }, { "epoch": 16.037384569556153, "grad_norm": 1.3526649475097656, "learning_rate": 5.744859686426976e-06, "loss": 0.5686, "num_input_tokens_seen": 62518192, "step": 107675 }, { "epoch": 16.03812928209711, "grad_norm": 1.887967824935913, "learning_rate": 5.742787382152489e-06, "loss": 0.5968, "num_input_tokens_seen": 62521232, "step": 107680 }, { "epoch": 16.03887399463807, "grad_norm": 1.3184335231781006, "learning_rate": 5.740715403209035e-06, "loss": 0.6066, "num_input_tokens_seen": 62524176, "step": 107685 }, { "epoch": 16.039618707179027, "grad_norm": 2.987861156463623, "learning_rate": 5.738643749631623e-06, "loss": 0.7068, "num_input_tokens_seen": 62527568, "step": 107690 }, { "epoch": 16.040363419719988, "grad_norm": 1.8321051597595215, "learning_rate": 5.736572421455239e-06, "loss": 0.504, "num_input_tokens_seen": 62530288, "step": 107695 }, { "epoch": 16.04110813226095, "grad_norm": 2.337669610977173, "learning_rate": 5.734501418714891e-06, "loss": 0.5999, "num_input_tokens_seen": 62533040, "step": 107700 }, { "epoch": 16.041852844801905, "grad_norm": 1.7130135297775269, "learning_rate": 5.732430741445563e-06, "loss": 0.507, "num_input_tokens_seen": 62536272, "step": 107705 }, { "epoch": 16.042597557342866, "grad_norm": 1.7610092163085938, "learning_rate": 5.730360389682227e-06, "loss": 0.6683, "num_input_tokens_seen": 62539056, "step": 107710 }, { "epoch": 16.043342269883826, "grad_norm": 1.2814050912857056, "learning_rate": 5.728290363459876e-06, "loss": 0.6814, "num_input_tokens_seen": 62541776, "step": 107715 }, { "epoch": 16.044086982424783, "grad_norm": 2.804857015609741, "learning_rate": 5.726220662813464e-06, "loss": 0.7237, "num_input_tokens_seen": 62545008, "step": 107720 }, { "epoch": 16.044831694965744, "grad_norm": 1.8649128675460815, "learning_rate": 5.72415128777797e-06, "loss": 0.3623, "num_input_tokens_seen": 62548080, "step": 107725 }, { "epoch": 16.0455764075067, "grad_norm": 2.582038164138794, "learning_rate": 5.722082238388346e-06, "loss": 0.4644, "num_input_tokens_seen": 62550704, "step": 107730 }, { "epoch": 16.04632112004766, "grad_norm": 0.9347460865974426, "learning_rate": 5.720013514679553e-06, "loss": 0.4948, "num_input_tokens_seen": 62553456, "step": 107735 }, { "epoch": 16.047065832588622, "grad_norm": 1.8040844202041626, "learning_rate": 5.71794511668654e-06, "loss": 0.6093, "num_input_tokens_seen": 62556464, "step": 107740 }, { "epoch": 16.04781054512958, "grad_norm": 1.784324049949646, "learning_rate": 5.7158770444442425e-06, "loss": 0.628, "num_input_tokens_seen": 62559408, "step": 107745 }, { "epoch": 16.04855525767054, "grad_norm": 1.3808273077011108, "learning_rate": 5.713809297987599e-06, "loss": 0.3739, "num_input_tokens_seen": 62562128, "step": 107750 }, { "epoch": 16.0492999702115, "grad_norm": 1.4151157140731812, "learning_rate": 5.7117418773515535e-06, "loss": 0.7132, "num_input_tokens_seen": 62565008, "step": 107755 }, { "epoch": 16.050044682752457, "grad_norm": 2.0623066425323486, "learning_rate": 5.709674782571023e-06, "loss": 0.4153, "num_input_tokens_seen": 62567792, "step": 107760 }, { "epoch": 16.050789395293418, "grad_norm": 2.2699737548828125, "learning_rate": 5.707608013680923e-06, "loss": 0.5967, "num_input_tokens_seen": 62570576, "step": 107765 }, { "epoch": 16.051534107834375, "grad_norm": 1.5665174722671509, "learning_rate": 5.705541570716189e-06, "loss": 0.5331, "num_input_tokens_seen": 62573200, "step": 107770 }, { "epoch": 16.052278820375335, "grad_norm": 1.9153234958648682, "learning_rate": 5.70347545371171e-06, "loss": 0.4123, "num_input_tokens_seen": 62576720, "step": 107775 }, { "epoch": 16.053023532916296, "grad_norm": 2.4162797927856445, "learning_rate": 5.701409662702409e-06, "loss": 0.7115, "num_input_tokens_seen": 62579440, "step": 107780 }, { "epoch": 16.053768245457253, "grad_norm": 1.3307631015777588, "learning_rate": 5.699344197723178e-06, "loss": 0.8084, "num_input_tokens_seen": 62582384, "step": 107785 }, { "epoch": 16.054512957998213, "grad_norm": 1.834615707397461, "learning_rate": 5.697279058808902e-06, "loss": 0.6275, "num_input_tokens_seen": 62585328, "step": 107790 }, { "epoch": 16.055257670539174, "grad_norm": 1.485021948814392, "learning_rate": 5.6952142459944845e-06, "loss": 0.6831, "num_input_tokens_seen": 62588176, "step": 107795 }, { "epoch": 16.05600238308013, "grad_norm": 1.8640079498291016, "learning_rate": 5.693149759314798e-06, "loss": 0.4856, "num_input_tokens_seen": 62591120, "step": 107800 }, { "epoch": 16.05674709562109, "grad_norm": 1.9046729803085327, "learning_rate": 5.691085598804727e-06, "loss": 0.75, "num_input_tokens_seen": 62594160, "step": 107805 }, { "epoch": 16.057491808162048, "grad_norm": 1.463721513748169, "learning_rate": 5.689021764499142e-06, "loss": 0.5695, "num_input_tokens_seen": 62596784, "step": 107810 }, { "epoch": 16.05823652070301, "grad_norm": 0.9247623085975647, "learning_rate": 5.6869582564329085e-06, "loss": 0.5984, "num_input_tokens_seen": 62599664, "step": 107815 }, { "epoch": 16.05898123324397, "grad_norm": 2.5811874866485596, "learning_rate": 5.684895074640884e-06, "loss": 0.6219, "num_input_tokens_seen": 62602512, "step": 107820 }, { "epoch": 16.059725945784926, "grad_norm": 1.5307196378707886, "learning_rate": 5.682832219157922e-06, "loss": 0.5544, "num_input_tokens_seen": 62605488, "step": 107825 }, { "epoch": 16.060470658325887, "grad_norm": 1.6660627126693726, "learning_rate": 5.68076969001888e-06, "loss": 0.4909, "num_input_tokens_seen": 62608080, "step": 107830 }, { "epoch": 16.061215370866844, "grad_norm": 2.005314350128174, "learning_rate": 5.678707487258594e-06, "loss": 0.6596, "num_input_tokens_seen": 62610768, "step": 107835 }, { "epoch": 16.061960083407804, "grad_norm": 1.234118103981018, "learning_rate": 5.676645610911916e-06, "loss": 0.6707, "num_input_tokens_seen": 62613936, "step": 107840 }, { "epoch": 16.062704795948765, "grad_norm": 1.3705183267593384, "learning_rate": 5.674584061013663e-06, "loss": 0.5831, "num_input_tokens_seen": 62616976, "step": 107845 }, { "epoch": 16.06344950848972, "grad_norm": 0.9663455486297607, "learning_rate": 5.672522837598676e-06, "loss": 0.6449, "num_input_tokens_seen": 62620080, "step": 107850 }, { "epoch": 16.064194221030682, "grad_norm": 1.4585732221603394, "learning_rate": 5.670461940701768e-06, "loss": 0.4479, "num_input_tokens_seen": 62623024, "step": 107855 }, { "epoch": 16.064938933571643, "grad_norm": 0.9776303172111511, "learning_rate": 5.668401370357765e-06, "loss": 0.575, "num_input_tokens_seen": 62626064, "step": 107860 }, { "epoch": 16.0656836461126, "grad_norm": 1.2511311769485474, "learning_rate": 5.666341126601474e-06, "loss": 0.5588, "num_input_tokens_seen": 62629104, "step": 107865 }, { "epoch": 16.06642835865356, "grad_norm": 2.4757251739501953, "learning_rate": 5.664281209467692e-06, "loss": 0.6436, "num_input_tokens_seen": 62631920, "step": 107870 }, { "epoch": 16.067173071194517, "grad_norm": 2.016496419906616, "learning_rate": 5.662221618991234e-06, "loss": 0.7133, "num_input_tokens_seen": 62634864, "step": 107875 }, { "epoch": 16.067917783735478, "grad_norm": 1.416105031967163, "learning_rate": 5.660162355206888e-06, "loss": 0.5795, "num_input_tokens_seen": 62637552, "step": 107880 }, { "epoch": 16.06866249627644, "grad_norm": 1.8832988739013672, "learning_rate": 5.658103418149443e-06, "loss": 0.637, "num_input_tokens_seen": 62640560, "step": 107885 }, { "epoch": 16.069407208817395, "grad_norm": 2.718151092529297, "learning_rate": 5.656044807853675e-06, "loss": 0.5865, "num_input_tokens_seen": 62643504, "step": 107890 }, { "epoch": 16.070151921358356, "grad_norm": 2.0972506999969482, "learning_rate": 5.653986524354377e-06, "loss": 0.5894, "num_input_tokens_seen": 62646640, "step": 107895 }, { "epoch": 16.070896633899316, "grad_norm": 1.9198088645935059, "learning_rate": 5.651928567686307e-06, "loss": 0.7177, "num_input_tokens_seen": 62649456, "step": 107900 }, { "epoch": 16.071641346440273, "grad_norm": 1.2405331134796143, "learning_rate": 5.649870937884247e-06, "loss": 0.6449, "num_input_tokens_seen": 62652560, "step": 107905 }, { "epoch": 16.072386058981234, "grad_norm": 1.1553012132644653, "learning_rate": 5.647813634982952e-06, "loss": 0.5089, "num_input_tokens_seen": 62655440, "step": 107910 }, { "epoch": 16.07313077152219, "grad_norm": 1.0415033102035522, "learning_rate": 5.6457566590171675e-06, "loss": 0.5533, "num_input_tokens_seen": 62658192, "step": 107915 }, { "epoch": 16.07387548406315, "grad_norm": 1.2851449251174927, "learning_rate": 5.64370001002166e-06, "loss": 0.5153, "num_input_tokens_seen": 62660880, "step": 107920 }, { "epoch": 16.074620196604112, "grad_norm": 1.296623945236206, "learning_rate": 5.641643688031162e-06, "loss": 0.5875, "num_input_tokens_seen": 62663664, "step": 107925 }, { "epoch": 16.07536490914507, "grad_norm": 1.133073329925537, "learning_rate": 5.639587693080428e-06, "loss": 0.6195, "num_input_tokens_seen": 62666672, "step": 107930 }, { "epoch": 16.07610962168603, "grad_norm": 1.1873618364334106, "learning_rate": 5.637532025204173e-06, "loss": 0.5715, "num_input_tokens_seen": 62669456, "step": 107935 }, { "epoch": 16.07685433422699, "grad_norm": 2.3514649868011475, "learning_rate": 5.635476684437144e-06, "loss": 0.809, "num_input_tokens_seen": 62672336, "step": 107940 }, { "epoch": 16.077599046767947, "grad_norm": 1.6696724891662598, "learning_rate": 5.633421670814054e-06, "loss": 0.578, "num_input_tokens_seen": 62675248, "step": 107945 }, { "epoch": 16.078343759308908, "grad_norm": 1.135697603225708, "learning_rate": 5.631366984369624e-06, "loss": 0.6172, "num_input_tokens_seen": 62678064, "step": 107950 }, { "epoch": 16.079088471849865, "grad_norm": 3.2121031284332275, "learning_rate": 5.629312625138561e-06, "loss": 0.7554, "num_input_tokens_seen": 62681168, "step": 107955 }, { "epoch": 16.079833184390825, "grad_norm": 1.384037971496582, "learning_rate": 5.627258593155568e-06, "loss": 0.4406, "num_input_tokens_seen": 62683664, "step": 107960 }, { "epoch": 16.080577896931786, "grad_norm": 1.5007541179656982, "learning_rate": 5.625204888455357e-06, "loss": 0.6716, "num_input_tokens_seen": 62686672, "step": 107965 }, { "epoch": 16.081322609472743, "grad_norm": 2.089362621307373, "learning_rate": 5.623151511072613e-06, "loss": 0.4823, "num_input_tokens_seen": 62689680, "step": 107970 }, { "epoch": 16.082067322013703, "grad_norm": 2.4884226322174072, "learning_rate": 5.6210984610420345e-06, "loss": 0.5608, "num_input_tokens_seen": 62692368, "step": 107975 }, { "epoch": 16.082812034554664, "grad_norm": 1.2546495199203491, "learning_rate": 5.619045738398299e-06, "loss": 0.7011, "num_input_tokens_seen": 62695472, "step": 107980 }, { "epoch": 16.08355674709562, "grad_norm": 3.8661348819732666, "learning_rate": 5.616993343176091e-06, "loss": 0.7345, "num_input_tokens_seen": 62698288, "step": 107985 }, { "epoch": 16.08430145963658, "grad_norm": 1.2844997644424438, "learning_rate": 5.614941275410082e-06, "loss": 0.5428, "num_input_tokens_seen": 62701040, "step": 107990 }, { "epoch": 16.085046172177538, "grad_norm": 2.0773420333862305, "learning_rate": 5.61288953513493e-06, "loss": 0.6922, "num_input_tokens_seen": 62703984, "step": 107995 }, { "epoch": 16.0857908847185, "grad_norm": 1.4750510454177856, "learning_rate": 5.610838122385312e-06, "loss": 0.5132, "num_input_tokens_seen": 62706736, "step": 108000 }, { "epoch": 16.08653559725946, "grad_norm": 1.2998898029327393, "learning_rate": 5.608787037195873e-06, "loss": 0.6629, "num_input_tokens_seen": 62709296, "step": 108005 }, { "epoch": 16.087280309800416, "grad_norm": 1.8151780366897583, "learning_rate": 5.606736279601274e-06, "loss": 0.623, "num_input_tokens_seen": 62712272, "step": 108010 }, { "epoch": 16.088025022341377, "grad_norm": 1.6042715311050415, "learning_rate": 5.6046858496361545e-06, "loss": 0.5513, "num_input_tokens_seen": 62715184, "step": 108015 }, { "epoch": 16.088769734882334, "grad_norm": 1.5171183347702026, "learning_rate": 5.602635747335155e-06, "loss": 0.4633, "num_input_tokens_seen": 62718288, "step": 108020 }, { "epoch": 16.089514447423294, "grad_norm": 0.5588974356651306, "learning_rate": 5.600585972732911e-06, "loss": 0.5718, "num_input_tokens_seen": 62721168, "step": 108025 }, { "epoch": 16.090259159964255, "grad_norm": 1.734605073928833, "learning_rate": 5.598536525864042e-06, "loss": 0.6266, "num_input_tokens_seen": 62723728, "step": 108030 }, { "epoch": 16.091003872505212, "grad_norm": 3.4371678829193115, "learning_rate": 5.59648740676319e-06, "loss": 0.7441, "num_input_tokens_seen": 62726768, "step": 108035 }, { "epoch": 16.091748585046172, "grad_norm": 1.9430776834487915, "learning_rate": 5.594438615464953e-06, "loss": 0.5799, "num_input_tokens_seen": 62729616, "step": 108040 }, { "epoch": 16.092493297587133, "grad_norm": 2.330061674118042, "learning_rate": 5.59239015200396e-06, "loss": 0.659, "num_input_tokens_seen": 62732432, "step": 108045 }, { "epoch": 16.09323801012809, "grad_norm": 0.8798039555549622, "learning_rate": 5.590342016414801e-06, "loss": 0.4705, "num_input_tokens_seen": 62735440, "step": 108050 }, { "epoch": 16.09398272266905, "grad_norm": 0.978800892829895, "learning_rate": 5.588294208732098e-06, "loss": 0.4928, "num_input_tokens_seen": 62738288, "step": 108055 }, { "epoch": 16.094727435210007, "grad_norm": 0.9683980941772461, "learning_rate": 5.586246728990424e-06, "loss": 0.6435, "num_input_tokens_seen": 62741008, "step": 108060 }, { "epoch": 16.095472147750968, "grad_norm": 2.2110238075256348, "learning_rate": 5.584199577224389e-06, "loss": 0.621, "num_input_tokens_seen": 62744048, "step": 108065 }, { "epoch": 16.09621686029193, "grad_norm": 2.141413927078247, "learning_rate": 5.58215275346857e-06, "loss": 0.4525, "num_input_tokens_seen": 62747376, "step": 108070 }, { "epoch": 16.096961572832885, "grad_norm": 1.2819533348083496, "learning_rate": 5.580106257757542e-06, "loss": 0.7728, "num_input_tokens_seen": 62750160, "step": 108075 }, { "epoch": 16.097706285373846, "grad_norm": 2.1525652408599854, "learning_rate": 5.5780600901258774e-06, "loss": 0.7179, "num_input_tokens_seen": 62753104, "step": 108080 }, { "epoch": 16.098450997914806, "grad_norm": 1.6961543560028076, "learning_rate": 5.576014250608152e-06, "loss": 0.5882, "num_input_tokens_seen": 62755984, "step": 108085 }, { "epoch": 16.099195710455763, "grad_norm": 1.7091492414474487, "learning_rate": 5.573968739238927e-06, "loss": 0.6535, "num_input_tokens_seen": 62759024, "step": 108090 }, { "epoch": 16.099940422996724, "grad_norm": 2.644718647003174, "learning_rate": 5.571923556052749e-06, "loss": 0.619, "num_input_tokens_seen": 62762000, "step": 108095 }, { "epoch": 16.10068513553768, "grad_norm": 1.5261118412017822, "learning_rate": 5.569878701084183e-06, "loss": 0.5213, "num_input_tokens_seen": 62764688, "step": 108100 }, { "epoch": 16.10142984807864, "grad_norm": 1.2846765518188477, "learning_rate": 5.567834174367767e-06, "loss": 0.4073, "num_input_tokens_seen": 62767440, "step": 108105 }, { "epoch": 16.102174560619602, "grad_norm": 2.2040464878082275, "learning_rate": 5.565789975938038e-06, "loss": 0.6906, "num_input_tokens_seen": 62770224, "step": 108110 }, { "epoch": 16.10291927316056, "grad_norm": 1.4284647703170776, "learning_rate": 5.56374610582954e-06, "loss": 0.7318, "num_input_tokens_seen": 62773168, "step": 108115 }, { "epoch": 16.10366398570152, "grad_norm": 1.2477270364761353, "learning_rate": 5.561702564076793e-06, "loss": 0.6682, "num_input_tokens_seen": 62775888, "step": 108120 }, { "epoch": 16.10440869824248, "grad_norm": 2.4934685230255127, "learning_rate": 5.5596593507143304e-06, "loss": 0.6568, "num_input_tokens_seen": 62778672, "step": 108125 }, { "epoch": 16.105153410783437, "grad_norm": 5.113245964050293, "learning_rate": 5.557616465776658e-06, "loss": 0.6943, "num_input_tokens_seen": 62781552, "step": 108130 }, { "epoch": 16.105898123324398, "grad_norm": 1.4048395156860352, "learning_rate": 5.5555739092983e-06, "loss": 0.6268, "num_input_tokens_seen": 62784368, "step": 108135 }, { "epoch": 16.106642835865355, "grad_norm": 1.4636106491088867, "learning_rate": 5.553531681313762e-06, "loss": 0.4064, "num_input_tokens_seen": 62787120, "step": 108140 }, { "epoch": 16.107387548406315, "grad_norm": 4.230257511138916, "learning_rate": 5.5514897818575415e-06, "loss": 0.6923, "num_input_tokens_seen": 62789712, "step": 108145 }, { "epoch": 16.108132260947276, "grad_norm": 1.029123306274414, "learning_rate": 5.549448210964131e-06, "loss": 0.5378, "num_input_tokens_seen": 62792624, "step": 108150 }, { "epoch": 16.108876973488233, "grad_norm": 1.3123000860214233, "learning_rate": 5.5474069686680205e-06, "loss": 0.6283, "num_input_tokens_seen": 62795440, "step": 108155 }, { "epoch": 16.109621686029193, "grad_norm": 1.136393666267395, "learning_rate": 5.545366055003706e-06, "loss": 0.534, "num_input_tokens_seen": 62798256, "step": 108160 }, { "epoch": 16.11036639857015, "grad_norm": 1.5626368522644043, "learning_rate": 5.54332547000565e-06, "loss": 0.5977, "num_input_tokens_seen": 62801232, "step": 108165 }, { "epoch": 16.11111111111111, "grad_norm": 1.0061061382293701, "learning_rate": 5.541285213708342e-06, "loss": 0.5351, "num_input_tokens_seen": 62804112, "step": 108170 }, { "epoch": 16.11185582365207, "grad_norm": 3.4537458419799805, "learning_rate": 5.539245286146238e-06, "loss": 0.627, "num_input_tokens_seen": 62806928, "step": 108175 }, { "epoch": 16.11260053619303, "grad_norm": 1.8515310287475586, "learning_rate": 5.537205687353813e-06, "loss": 0.6402, "num_input_tokens_seen": 62810032, "step": 108180 }, { "epoch": 16.11334524873399, "grad_norm": 1.276427984237671, "learning_rate": 5.535166417365517e-06, "loss": 0.5595, "num_input_tokens_seen": 62813136, "step": 108185 }, { "epoch": 16.11408996127495, "grad_norm": 2.126659631729126, "learning_rate": 5.533127476215791e-06, "loss": 0.5111, "num_input_tokens_seen": 62816176, "step": 108190 }, { "epoch": 16.114834673815906, "grad_norm": 2.6775991916656494, "learning_rate": 5.531088863939101e-06, "loss": 0.5358, "num_input_tokens_seen": 62818960, "step": 108195 }, { "epoch": 16.115579386356867, "grad_norm": 3.155369520187378, "learning_rate": 5.529050580569869e-06, "loss": 0.762, "num_input_tokens_seen": 62821712, "step": 108200 }, { "epoch": 16.116324098897824, "grad_norm": 0.9382445216178894, "learning_rate": 5.527012626142547e-06, "loss": 0.6535, "num_input_tokens_seen": 62824496, "step": 108205 }, { "epoch": 16.117068811438784, "grad_norm": 1.4051469564437866, "learning_rate": 5.524975000691554e-06, "loss": 0.5547, "num_input_tokens_seen": 62827408, "step": 108210 }, { "epoch": 16.117813523979745, "grad_norm": 0.9758440852165222, "learning_rate": 5.522937704251316e-06, "loss": 0.7601, "num_input_tokens_seen": 62830000, "step": 108215 }, { "epoch": 16.118558236520702, "grad_norm": 1.4911799430847168, "learning_rate": 5.520900736856241e-06, "loss": 0.496, "num_input_tokens_seen": 62833040, "step": 108220 }, { "epoch": 16.119302949061662, "grad_norm": 1.4345108270645142, "learning_rate": 5.5188640985407575e-06, "loss": 0.6287, "num_input_tokens_seen": 62836240, "step": 108225 }, { "epoch": 16.120047661602623, "grad_norm": 1.266210913658142, "learning_rate": 5.516827789339266e-06, "loss": 0.5385, "num_input_tokens_seen": 62838896, "step": 108230 }, { "epoch": 16.12079237414358, "grad_norm": 1.3462090492248535, "learning_rate": 5.51479180928616e-06, "loss": 0.6745, "num_input_tokens_seen": 62842000, "step": 108235 }, { "epoch": 16.12153708668454, "grad_norm": 2.0025172233581543, "learning_rate": 5.5127561584158495e-06, "loss": 0.6717, "num_input_tokens_seen": 62845072, "step": 108240 }, { "epoch": 16.122281799225497, "grad_norm": 2.3701913356781006, "learning_rate": 5.510720836762712e-06, "loss": 0.6355, "num_input_tokens_seen": 62847920, "step": 108245 }, { "epoch": 16.123026511766458, "grad_norm": 1.198432207107544, "learning_rate": 5.508685844361142e-06, "loss": 0.4787, "num_input_tokens_seen": 62850768, "step": 108250 }, { "epoch": 16.12377122430742, "grad_norm": 1.507022500038147, "learning_rate": 5.506651181245509e-06, "loss": 0.6893, "num_input_tokens_seen": 62853456, "step": 108255 }, { "epoch": 16.124515936848375, "grad_norm": 2.588601589202881, "learning_rate": 5.5046168474502e-06, "loss": 0.6134, "num_input_tokens_seen": 62856208, "step": 108260 }, { "epoch": 16.125260649389336, "grad_norm": 1.846816897392273, "learning_rate": 5.502582843009577e-06, "loss": 0.4298, "num_input_tokens_seen": 62858928, "step": 108265 }, { "epoch": 16.126005361930297, "grad_norm": 1.121145248413086, "learning_rate": 5.500549167957989e-06, "loss": 0.5219, "num_input_tokens_seen": 62861744, "step": 108270 }, { "epoch": 16.126750074471254, "grad_norm": 2.8636770248413086, "learning_rate": 5.498515822329814e-06, "loss": 0.5983, "num_input_tokens_seen": 62864784, "step": 108275 }, { "epoch": 16.127494787012214, "grad_norm": 1.2436696290969849, "learning_rate": 5.496482806159395e-06, "loss": 0.7482, "num_input_tokens_seen": 62867408, "step": 108280 }, { "epoch": 16.12823949955317, "grad_norm": 1.032515525817871, "learning_rate": 5.494450119481073e-06, "loss": 0.5322, "num_input_tokens_seen": 62870256, "step": 108285 }, { "epoch": 16.12898421209413, "grad_norm": 2.387420892715454, "learning_rate": 5.492417762329188e-06, "loss": 0.5938, "num_input_tokens_seen": 62873424, "step": 108290 }, { "epoch": 16.129728924635092, "grad_norm": 1.7205193042755127, "learning_rate": 5.490385734738082e-06, "loss": 0.5969, "num_input_tokens_seen": 62876368, "step": 108295 }, { "epoch": 16.13047363717605, "grad_norm": 1.6040592193603516, "learning_rate": 5.4883540367420775e-06, "loss": 0.5623, "num_input_tokens_seen": 62879312, "step": 108300 }, { "epoch": 16.13121834971701, "grad_norm": 1.736500859260559, "learning_rate": 5.486322668375504e-06, "loss": 0.6269, "num_input_tokens_seen": 62882000, "step": 108305 }, { "epoch": 16.13196306225797, "grad_norm": 1.456369161605835, "learning_rate": 5.484291629672677e-06, "loss": 0.5863, "num_input_tokens_seen": 62884848, "step": 108310 }, { "epoch": 16.132707774798927, "grad_norm": 1.1634093523025513, "learning_rate": 5.482260920667903e-06, "loss": 0.6141, "num_input_tokens_seen": 62887664, "step": 108315 }, { "epoch": 16.133452487339888, "grad_norm": 1.3446780443191528, "learning_rate": 5.480230541395501e-06, "loss": 0.5833, "num_input_tokens_seen": 62890704, "step": 108320 }, { "epoch": 16.134197199880845, "grad_norm": 2.3500351905822754, "learning_rate": 5.478200491889754e-06, "loss": 0.4957, "num_input_tokens_seen": 62893584, "step": 108325 }, { "epoch": 16.134941912421805, "grad_norm": 1.5932508707046509, "learning_rate": 5.47617077218498e-06, "loss": 0.4583, "num_input_tokens_seen": 62896432, "step": 108330 }, { "epoch": 16.135686624962766, "grad_norm": 1.2021749019622803, "learning_rate": 5.474141382315448e-06, "loss": 0.5784, "num_input_tokens_seen": 62899824, "step": 108335 }, { "epoch": 16.136431337503723, "grad_norm": 2.052682399749756, "learning_rate": 5.472112322315459e-06, "loss": 0.8058, "num_input_tokens_seen": 62902512, "step": 108340 }, { "epoch": 16.137176050044683, "grad_norm": 2.407139778137207, "learning_rate": 5.4700835922192885e-06, "loss": 0.5618, "num_input_tokens_seen": 62905776, "step": 108345 }, { "epoch": 16.13792076258564, "grad_norm": 2.2559964656829834, "learning_rate": 5.468055192061203e-06, "loss": 0.5251, "num_input_tokens_seen": 62908560, "step": 108350 }, { "epoch": 16.1386654751266, "grad_norm": 2.0636277198791504, "learning_rate": 5.466027121875475e-06, "loss": 0.7193, "num_input_tokens_seen": 62911504, "step": 108355 }, { "epoch": 16.13941018766756, "grad_norm": 2.3341593742370605, "learning_rate": 5.463999381696358e-06, "loss": 0.6346, "num_input_tokens_seen": 62914320, "step": 108360 }, { "epoch": 16.14015490020852, "grad_norm": 1.3244999647140503, "learning_rate": 5.4619719715581215e-06, "loss": 0.7257, "num_input_tokens_seen": 62917168, "step": 108365 }, { "epoch": 16.14089961274948, "grad_norm": 1.1172538995742798, "learning_rate": 5.4599448914950055e-06, "loss": 0.5055, "num_input_tokens_seen": 62919952, "step": 108370 }, { "epoch": 16.14164432529044, "grad_norm": 3.238619804382324, "learning_rate": 5.457918141541268e-06, "loss": 0.7716, "num_input_tokens_seen": 62922736, "step": 108375 }, { "epoch": 16.142389037831396, "grad_norm": 3.7606799602508545, "learning_rate": 5.455891721731135e-06, "loss": 0.6449, "num_input_tokens_seen": 62925552, "step": 108380 }, { "epoch": 16.143133750372357, "grad_norm": 1.2239041328430176, "learning_rate": 5.453865632098853e-06, "loss": 0.6018, "num_input_tokens_seen": 62928208, "step": 108385 }, { "epoch": 16.143878462913314, "grad_norm": 1.9572232961654663, "learning_rate": 5.451839872678646e-06, "loss": 0.457, "num_input_tokens_seen": 62931184, "step": 108390 }, { "epoch": 16.144623175454274, "grad_norm": 1.8253463506698608, "learning_rate": 5.449814443504731e-06, "loss": 0.3857, "num_input_tokens_seen": 62933936, "step": 108395 }, { "epoch": 16.145367887995235, "grad_norm": 1.3450430631637573, "learning_rate": 5.447789344611337e-06, "loss": 0.6635, "num_input_tokens_seen": 62937264, "step": 108400 }, { "epoch": 16.146112600536192, "grad_norm": 1.8411833047866821, "learning_rate": 5.445764576032672e-06, "loss": 0.6043, "num_input_tokens_seen": 62940240, "step": 108405 }, { "epoch": 16.146857313077152, "grad_norm": 1.9911314249038696, "learning_rate": 5.44374013780293e-06, "loss": 0.5049, "num_input_tokens_seen": 62943088, "step": 108410 }, { "epoch": 16.147602025618113, "grad_norm": 1.5745619535446167, "learning_rate": 5.441716029956331e-06, "loss": 0.6295, "num_input_tokens_seen": 62945968, "step": 108415 }, { "epoch": 16.14834673815907, "grad_norm": 2.137782096862793, "learning_rate": 5.439692252527062e-06, "loss": 0.5472, "num_input_tokens_seen": 62948624, "step": 108420 }, { "epoch": 16.14909145070003, "grad_norm": 1.7729319334030151, "learning_rate": 5.437668805549312e-06, "loss": 0.5551, "num_input_tokens_seen": 62951728, "step": 108425 }, { "epoch": 16.149836163240987, "grad_norm": 2.282923698425293, "learning_rate": 5.435645689057256e-06, "loss": 0.6428, "num_input_tokens_seen": 62954704, "step": 108430 }, { "epoch": 16.150580875781948, "grad_norm": 1.6969919204711914, "learning_rate": 5.433622903085092e-06, "loss": 0.5338, "num_input_tokens_seen": 62957392, "step": 108435 }, { "epoch": 16.15132558832291, "grad_norm": 1.4262173175811768, "learning_rate": 5.4316004476669735e-06, "loss": 0.5226, "num_input_tokens_seen": 62960144, "step": 108440 }, { "epoch": 16.152070300863866, "grad_norm": 1.0311305522918701, "learning_rate": 5.429578322837084e-06, "loss": 0.7269, "num_input_tokens_seen": 62963120, "step": 108445 }, { "epoch": 16.152815013404826, "grad_norm": 1.4110686779022217, "learning_rate": 5.4275565286295735e-06, "loss": 0.5209, "num_input_tokens_seen": 62966064, "step": 108450 }, { "epoch": 16.153559725945787, "grad_norm": 2.6338677406311035, "learning_rate": 5.425535065078608e-06, "loss": 0.7636, "num_input_tokens_seen": 62969168, "step": 108455 }, { "epoch": 16.154304438486744, "grad_norm": 1.280056118965149, "learning_rate": 5.423513932218327e-06, "loss": 0.4616, "num_input_tokens_seen": 62971952, "step": 108460 }, { "epoch": 16.155049151027704, "grad_norm": 1.2868187427520752, "learning_rate": 5.421493130082889e-06, "loss": 0.6705, "num_input_tokens_seen": 62974704, "step": 108465 }, { "epoch": 16.15579386356866, "grad_norm": 2.837822198867798, "learning_rate": 5.419472658706423e-06, "loss": 0.4463, "num_input_tokens_seen": 62977680, "step": 108470 }, { "epoch": 16.15653857610962, "grad_norm": 1.3127378225326538, "learning_rate": 5.417452518123067e-06, "loss": 0.55, "num_input_tokens_seen": 62980368, "step": 108475 }, { "epoch": 16.157283288650582, "grad_norm": 2.451101541519165, "learning_rate": 5.415432708366949e-06, "loss": 0.8349, "num_input_tokens_seen": 62983696, "step": 108480 }, { "epoch": 16.15802800119154, "grad_norm": 1.3474853038787842, "learning_rate": 5.413413229472184e-06, "loss": 0.6233, "num_input_tokens_seen": 62986544, "step": 108485 }, { "epoch": 16.1587727137325, "grad_norm": 1.2027570009231567, "learning_rate": 5.411394081472901e-06, "loss": 0.5219, "num_input_tokens_seen": 62989328, "step": 108490 }, { "epoch": 16.15951742627346, "grad_norm": 2.781390905380249, "learning_rate": 5.409375264403199e-06, "loss": 0.7034, "num_input_tokens_seen": 62992464, "step": 108495 }, { "epoch": 16.160262138814417, "grad_norm": 1.2998199462890625, "learning_rate": 5.407356778297198e-06, "loss": 0.7068, "num_input_tokens_seen": 62995632, "step": 108500 }, { "epoch": 16.161006851355378, "grad_norm": 1.758104920387268, "learning_rate": 5.4053386231889855e-06, "loss": 0.6508, "num_input_tokens_seen": 62998544, "step": 108505 }, { "epoch": 16.161751563896335, "grad_norm": 1.4333981275558472, "learning_rate": 5.403320799112666e-06, "loss": 0.7771, "num_input_tokens_seen": 63001584, "step": 108510 }, { "epoch": 16.162496276437295, "grad_norm": 2.131924867630005, "learning_rate": 5.401303306102326e-06, "loss": 0.6402, "num_input_tokens_seen": 63004208, "step": 108515 }, { "epoch": 16.163240988978256, "grad_norm": 1.0413556098937988, "learning_rate": 5.3992861441920425e-06, "loss": 0.5556, "num_input_tokens_seen": 63007056, "step": 108520 }, { "epoch": 16.163985701519213, "grad_norm": 0.6630505323410034, "learning_rate": 5.397269313415903e-06, "loss": 0.6992, "num_input_tokens_seen": 63010192, "step": 108525 }, { "epoch": 16.164730414060173, "grad_norm": 2.1522679328918457, "learning_rate": 5.395252813807969e-06, "loss": 0.5672, "num_input_tokens_seen": 63013008, "step": 108530 }, { "epoch": 16.16547512660113, "grad_norm": 1.1076087951660156, "learning_rate": 5.39323664540232e-06, "loss": 0.6112, "num_input_tokens_seen": 63016112, "step": 108535 }, { "epoch": 16.16621983914209, "grad_norm": 1.5877842903137207, "learning_rate": 5.391220808233008e-06, "loss": 0.541, "num_input_tokens_seen": 63019088, "step": 108540 }, { "epoch": 16.16696455168305, "grad_norm": 1.5541472434997559, "learning_rate": 5.3892053023340935e-06, "loss": 0.5964, "num_input_tokens_seen": 63022160, "step": 108545 }, { "epoch": 16.16770926422401, "grad_norm": 0.8557461500167847, "learning_rate": 5.387190127739625e-06, "loss": 0.5834, "num_input_tokens_seen": 63024976, "step": 108550 }, { "epoch": 16.16845397676497, "grad_norm": 1.0952906608581543, "learning_rate": 5.3851752844836374e-06, "loss": 0.3247, "num_input_tokens_seen": 63027696, "step": 108555 }, { "epoch": 16.16919868930593, "grad_norm": 1.2504740953445435, "learning_rate": 5.383160772600185e-06, "loss": 0.5656, "num_input_tokens_seen": 63030480, "step": 108560 }, { "epoch": 16.169943401846886, "grad_norm": 1.5112690925598145, "learning_rate": 5.381146592123287e-06, "loss": 0.457, "num_input_tokens_seen": 63033392, "step": 108565 }, { "epoch": 16.170688114387847, "grad_norm": 1.8307996988296509, "learning_rate": 5.379132743086984e-06, "loss": 0.5384, "num_input_tokens_seen": 63036496, "step": 108570 }, { "epoch": 16.171432826928804, "grad_norm": 2.883518934249878, "learning_rate": 5.377119225525284e-06, "loss": 0.5445, "num_input_tokens_seen": 63039472, "step": 108575 }, { "epoch": 16.172177539469764, "grad_norm": 1.9157981872558594, "learning_rate": 5.375106039472219e-06, "loss": 0.4524, "num_input_tokens_seen": 63042544, "step": 108580 }, { "epoch": 16.172922252010725, "grad_norm": 1.5499573945999146, "learning_rate": 5.373093184961783e-06, "loss": 0.5237, "num_input_tokens_seen": 63045520, "step": 108585 }, { "epoch": 16.173666964551682, "grad_norm": 3.0445027351379395, "learning_rate": 5.371080662028e-06, "loss": 0.6984, "num_input_tokens_seen": 63048464, "step": 108590 }, { "epoch": 16.174411677092642, "grad_norm": 4.289202690124512, "learning_rate": 5.369068470704855e-06, "loss": 0.69, "num_input_tokens_seen": 63051440, "step": 108595 }, { "epoch": 16.175156389633603, "grad_norm": 2.6655023097991943, "learning_rate": 5.367056611026341e-06, "loss": 0.5567, "num_input_tokens_seen": 63054192, "step": 108600 }, { "epoch": 16.17590110217456, "grad_norm": 0.742692768573761, "learning_rate": 5.36504508302646e-06, "loss": 0.5358, "num_input_tokens_seen": 63057296, "step": 108605 }, { "epoch": 16.17664581471552, "grad_norm": 2.08699369430542, "learning_rate": 5.363033886739186e-06, "loss": 0.667, "num_input_tokens_seen": 63059728, "step": 108610 }, { "epoch": 16.177390527256478, "grad_norm": 1.2700300216674805, "learning_rate": 5.361023022198494e-06, "loss": 0.5887, "num_input_tokens_seen": 63062640, "step": 108615 }, { "epoch": 16.178135239797438, "grad_norm": 1.6743501424789429, "learning_rate": 5.359012489438353e-06, "loss": 0.5103, "num_input_tokens_seen": 63065392, "step": 108620 }, { "epoch": 16.1788799523384, "grad_norm": 2.0110747814178467, "learning_rate": 5.357002288492741e-06, "loss": 0.6323, "num_input_tokens_seen": 63068272, "step": 108625 }, { "epoch": 16.179624664879356, "grad_norm": 1.9779366254806519, "learning_rate": 5.35499241939561e-06, "loss": 0.4345, "num_input_tokens_seen": 63070928, "step": 108630 }, { "epoch": 16.180369377420316, "grad_norm": 1.4581106901168823, "learning_rate": 5.3529828821809065e-06, "loss": 0.5935, "num_input_tokens_seen": 63073904, "step": 108635 }, { "epoch": 16.181114089961277, "grad_norm": 1.7658040523529053, "learning_rate": 5.350973676882601e-06, "loss": 0.6303, "num_input_tokens_seen": 63077136, "step": 108640 }, { "epoch": 16.181858802502234, "grad_norm": 1.9661921262741089, "learning_rate": 5.3489648035346144e-06, "loss": 0.6568, "num_input_tokens_seen": 63079952, "step": 108645 }, { "epoch": 16.182603515043194, "grad_norm": 1.667477011680603, "learning_rate": 5.346956262170902e-06, "loss": 0.7629, "num_input_tokens_seen": 63082768, "step": 108650 }, { "epoch": 16.18334822758415, "grad_norm": 1.1948750019073486, "learning_rate": 5.3449480528253825e-06, "loss": 0.6264, "num_input_tokens_seen": 63085776, "step": 108655 }, { "epoch": 16.18409294012511, "grad_norm": 1.801052212715149, "learning_rate": 5.342940175531999e-06, "loss": 0.7339, "num_input_tokens_seen": 63088432, "step": 108660 }, { "epoch": 16.184837652666072, "grad_norm": 0.921299397945404, "learning_rate": 5.3409326303246524e-06, "loss": 0.6937, "num_input_tokens_seen": 63091440, "step": 108665 }, { "epoch": 16.18558236520703, "grad_norm": 1.4090851545333862, "learning_rate": 5.338925417237275e-06, "loss": 0.4452, "num_input_tokens_seen": 63094480, "step": 108670 }, { "epoch": 16.18632707774799, "grad_norm": 2.2922134399414062, "learning_rate": 5.336918536303773e-06, "loss": 0.7088, "num_input_tokens_seen": 63097488, "step": 108675 }, { "epoch": 16.187071790288947, "grad_norm": 1.0064423084259033, "learning_rate": 5.334911987558045e-06, "loss": 0.7336, "num_input_tokens_seen": 63100528, "step": 108680 }, { "epoch": 16.187816502829907, "grad_norm": 1.2300032377243042, "learning_rate": 5.332905771033994e-06, "loss": 0.5926, "num_input_tokens_seen": 63103760, "step": 108685 }, { "epoch": 16.188561215370868, "grad_norm": 1.8909794092178345, "learning_rate": 5.330899886765503e-06, "loss": 0.6602, "num_input_tokens_seen": 63106544, "step": 108690 }, { "epoch": 16.189305927911825, "grad_norm": 1.1439924240112305, "learning_rate": 5.328894334786474e-06, "loss": 0.5827, "num_input_tokens_seen": 63109296, "step": 108695 }, { "epoch": 16.190050640452785, "grad_norm": 2.613318681716919, "learning_rate": 5.326889115130779e-06, "loss": 0.5907, "num_input_tokens_seen": 63112016, "step": 108700 }, { "epoch": 16.190795352993746, "grad_norm": 0.9496563076972961, "learning_rate": 5.324884227832302e-06, "loss": 0.5396, "num_input_tokens_seen": 63114768, "step": 108705 }, { "epoch": 16.191540065534703, "grad_norm": 1.8750368356704712, "learning_rate": 5.322879672924908e-06, "loss": 0.6166, "num_input_tokens_seen": 63117424, "step": 108710 }, { "epoch": 16.192284778075663, "grad_norm": 1.4082642793655396, "learning_rate": 5.3208754504424585e-06, "loss": 0.6585, "num_input_tokens_seen": 63120432, "step": 108715 }, { "epoch": 16.19302949061662, "grad_norm": 2.88972806930542, "learning_rate": 5.318871560418822e-06, "loss": 0.5843, "num_input_tokens_seen": 63123248, "step": 108720 }, { "epoch": 16.19377420315758, "grad_norm": 1.0324978828430176, "learning_rate": 5.316868002887843e-06, "loss": 0.6182, "num_input_tokens_seen": 63126192, "step": 108725 }, { "epoch": 16.19451891569854, "grad_norm": 1.8215142488479614, "learning_rate": 5.314864777883377e-06, "loss": 0.7002, "num_input_tokens_seen": 63128912, "step": 108730 }, { "epoch": 16.1952636282395, "grad_norm": 1.6675962209701538, "learning_rate": 5.3128618854392655e-06, "loss": 0.6072, "num_input_tokens_seen": 63131696, "step": 108735 }, { "epoch": 16.19600834078046, "grad_norm": 1.4772785902023315, "learning_rate": 5.3108593255893376e-06, "loss": 0.6063, "num_input_tokens_seen": 63134704, "step": 108740 }, { "epoch": 16.19675305332142, "grad_norm": 1.677864670753479, "learning_rate": 5.308857098367437e-06, "loss": 0.5681, "num_input_tokens_seen": 63137648, "step": 108745 }, { "epoch": 16.197497765862376, "grad_norm": 1.1043916940689087, "learning_rate": 5.306855203807382e-06, "loss": 0.7371, "num_input_tokens_seen": 63140880, "step": 108750 }, { "epoch": 16.198242478403337, "grad_norm": 1.7480612993240356, "learning_rate": 5.304853641942995e-06, "loss": 0.5762, "num_input_tokens_seen": 63143568, "step": 108755 }, { "epoch": 16.198987190944294, "grad_norm": 1.221727728843689, "learning_rate": 5.302852412808079e-06, "loss": 0.6836, "num_input_tokens_seen": 63146544, "step": 108760 }, { "epoch": 16.199731903485254, "grad_norm": 1.1446701288223267, "learning_rate": 5.3008515164364585e-06, "loss": 0.6058, "num_input_tokens_seen": 63149648, "step": 108765 }, { "epoch": 16.200476616026215, "grad_norm": 1.9109793901443481, "learning_rate": 5.298850952861925e-06, "loss": 0.5528, "num_input_tokens_seen": 63152560, "step": 108770 }, { "epoch": 16.201221328567172, "grad_norm": 1.2775965929031372, "learning_rate": 5.296850722118288e-06, "loss": 0.5002, "num_input_tokens_seen": 63155344, "step": 108775 }, { "epoch": 16.201966041108133, "grad_norm": 2.087172508239746, "learning_rate": 5.294850824239325e-06, "loss": 0.5978, "num_input_tokens_seen": 63158352, "step": 108780 }, { "epoch": 16.202710753649093, "grad_norm": 0.8738886117935181, "learning_rate": 5.292851259258838e-06, "loss": 0.5528, "num_input_tokens_seen": 63161008, "step": 108785 }, { "epoch": 16.20345546619005, "grad_norm": 2.002171516418457, "learning_rate": 5.2908520272106e-06, "loss": 0.4019, "num_input_tokens_seen": 63163888, "step": 108790 }, { "epoch": 16.20420017873101, "grad_norm": 1.3109371662139893, "learning_rate": 5.288853128128377e-06, "loss": 0.6482, "num_input_tokens_seen": 63166704, "step": 108795 }, { "epoch": 16.204944891271968, "grad_norm": 2.2475533485412598, "learning_rate": 5.2868545620459535e-06, "loss": 0.5748, "num_input_tokens_seen": 63169680, "step": 108800 }, { "epoch": 16.205689603812928, "grad_norm": 2.1865901947021484, "learning_rate": 5.284856328997087e-06, "loss": 0.5733, "num_input_tokens_seen": 63172592, "step": 108805 }, { "epoch": 16.20643431635389, "grad_norm": 2.3812568187713623, "learning_rate": 5.282858429015536e-06, "loss": 0.5407, "num_input_tokens_seen": 63175344, "step": 108810 }, { "epoch": 16.207179028894846, "grad_norm": 1.5610133409500122, "learning_rate": 5.280860862135045e-06, "loss": 0.4384, "num_input_tokens_seen": 63178320, "step": 108815 }, { "epoch": 16.207923741435806, "grad_norm": 2.1964027881622314, "learning_rate": 5.278863628389377e-06, "loss": 0.4586, "num_input_tokens_seen": 63181200, "step": 108820 }, { "epoch": 16.208668453976767, "grad_norm": 2.9400150775909424, "learning_rate": 5.276866727812255e-06, "loss": 0.6669, "num_input_tokens_seen": 63184176, "step": 108825 }, { "epoch": 16.209413166517724, "grad_norm": 2.7424747943878174, "learning_rate": 5.274870160437431e-06, "loss": 0.6545, "num_input_tokens_seen": 63187248, "step": 108830 }, { "epoch": 16.210157879058684, "grad_norm": 1.0841381549835205, "learning_rate": 5.272873926298627e-06, "loss": 0.4741, "num_input_tokens_seen": 63190256, "step": 108835 }, { "epoch": 16.21090259159964, "grad_norm": 1.649868130683899, "learning_rate": 5.270878025429565e-06, "loss": 0.6069, "num_input_tokens_seen": 63192944, "step": 108840 }, { "epoch": 16.2116473041406, "grad_norm": 1.9592616558074951, "learning_rate": 5.268882457863972e-06, "loss": 0.6418, "num_input_tokens_seen": 63195664, "step": 108845 }, { "epoch": 16.212392016681562, "grad_norm": 1.307073712348938, "learning_rate": 5.266887223635547e-06, "loss": 0.4153, "num_input_tokens_seen": 63198608, "step": 108850 }, { "epoch": 16.21313672922252, "grad_norm": 1.7863386869430542, "learning_rate": 5.264892322778014e-06, "loss": 0.5517, "num_input_tokens_seen": 63201520, "step": 108855 }, { "epoch": 16.21388144176348, "grad_norm": 1.4173531532287598, "learning_rate": 5.262897755325064e-06, "loss": 0.7224, "num_input_tokens_seen": 63204464, "step": 108860 }, { "epoch": 16.214626154304437, "grad_norm": 1.276124119758606, "learning_rate": 5.260903521310401e-06, "loss": 0.5879, "num_input_tokens_seen": 63207696, "step": 108865 }, { "epoch": 16.215370866845397, "grad_norm": 0.610651969909668, "learning_rate": 5.25890962076771e-06, "loss": 0.3265, "num_input_tokens_seen": 63210768, "step": 108870 }, { "epoch": 16.216115579386358, "grad_norm": 1.3035610914230347, "learning_rate": 5.256916053730679e-06, "loss": 0.6043, "num_input_tokens_seen": 63213968, "step": 108875 }, { "epoch": 16.216860291927315, "grad_norm": 1.2266727685928345, "learning_rate": 5.254922820232983e-06, "loss": 0.5544, "num_input_tokens_seen": 63216656, "step": 108880 }, { "epoch": 16.217605004468275, "grad_norm": 1.2229617834091187, "learning_rate": 5.2529299203082914e-06, "loss": 0.5785, "num_input_tokens_seen": 63219344, "step": 108885 }, { "epoch": 16.218349717009236, "grad_norm": 1.3436479568481445, "learning_rate": 5.250937353990288e-06, "loss": 0.5471, "num_input_tokens_seen": 63222448, "step": 108890 }, { "epoch": 16.219094429550193, "grad_norm": 1.019787073135376, "learning_rate": 5.248945121312618e-06, "loss": 0.4847, "num_input_tokens_seen": 63225584, "step": 108895 }, { "epoch": 16.219839142091153, "grad_norm": 2.4822089672088623, "learning_rate": 5.246953222308953e-06, "loss": 0.852, "num_input_tokens_seen": 63228656, "step": 108900 }, { "epoch": 16.22058385463211, "grad_norm": 1.4250752925872803, "learning_rate": 5.244961657012928e-06, "loss": 0.6683, "num_input_tokens_seen": 63231472, "step": 108905 }, { "epoch": 16.22132856717307, "grad_norm": 1.653584361076355, "learning_rate": 5.242970425458208e-06, "loss": 0.6001, "num_input_tokens_seen": 63234416, "step": 108910 }, { "epoch": 16.22207327971403, "grad_norm": 2.7566137313842773, "learning_rate": 5.240979527678422e-06, "loss": 0.5443, "num_input_tokens_seen": 63237168, "step": 108915 }, { "epoch": 16.22281799225499, "grad_norm": 1.7589253187179565, "learning_rate": 5.238988963707195e-06, "loss": 0.6631, "num_input_tokens_seen": 63239920, "step": 108920 }, { "epoch": 16.22356270479595, "grad_norm": 0.7312381863594055, "learning_rate": 5.236998733578175e-06, "loss": 0.5699, "num_input_tokens_seen": 63242800, "step": 108925 }, { "epoch": 16.22430741733691, "grad_norm": 1.1973559856414795, "learning_rate": 5.235008837324967e-06, "loss": 0.5893, "num_input_tokens_seen": 63246288, "step": 108930 }, { "epoch": 16.225052129877866, "grad_norm": 2.6788034439086914, "learning_rate": 5.233019274981205e-06, "loss": 0.7733, "num_input_tokens_seen": 63249200, "step": 108935 }, { "epoch": 16.225796842418827, "grad_norm": 2.7270147800445557, "learning_rate": 5.23103004658049e-06, "loss": 0.4859, "num_input_tokens_seen": 63251984, "step": 108940 }, { "epoch": 16.226541554959784, "grad_norm": 1.6276476383209229, "learning_rate": 5.2290411521564305e-06, "loss": 0.6834, "num_input_tokens_seen": 63254864, "step": 108945 }, { "epoch": 16.227286267500745, "grad_norm": 1.092781901359558, "learning_rate": 5.227052591742626e-06, "loss": 0.6153, "num_input_tokens_seen": 63257616, "step": 108950 }, { "epoch": 16.228030980041705, "grad_norm": 1.7357399463653564, "learning_rate": 5.225064365372667e-06, "loss": 0.498, "num_input_tokens_seen": 63260336, "step": 108955 }, { "epoch": 16.228775692582662, "grad_norm": 2.9509193897247314, "learning_rate": 5.223076473080152e-06, "loss": 0.5771, "num_input_tokens_seen": 63263120, "step": 108960 }, { "epoch": 16.229520405123623, "grad_norm": 1.2810590267181396, "learning_rate": 5.221088914898653e-06, "loss": 0.5295, "num_input_tokens_seen": 63265840, "step": 108965 }, { "epoch": 16.230265117664583, "grad_norm": 2.0103447437286377, "learning_rate": 5.219101690861763e-06, "loss": 0.4958, "num_input_tokens_seen": 63268400, "step": 108970 }, { "epoch": 16.23100983020554, "grad_norm": 1.2539693117141724, "learning_rate": 5.217114801003037e-06, "loss": 0.3635, "num_input_tokens_seen": 63271280, "step": 108975 }, { "epoch": 16.2317545427465, "grad_norm": 1.2519055604934692, "learning_rate": 5.215128245356057e-06, "loss": 0.5368, "num_input_tokens_seen": 63273936, "step": 108980 }, { "epoch": 16.232499255287458, "grad_norm": 1.9873932600021362, "learning_rate": 5.2131420239543704e-06, "loss": 0.5448, "num_input_tokens_seen": 63276656, "step": 108985 }, { "epoch": 16.233243967828418, "grad_norm": 2.2016453742980957, "learning_rate": 5.211156136831546e-06, "loss": 0.4602, "num_input_tokens_seen": 63279536, "step": 108990 }, { "epoch": 16.23398868036938, "grad_norm": 1.555476427078247, "learning_rate": 5.209170584021125e-06, "loss": 0.6878, "num_input_tokens_seen": 63282320, "step": 108995 }, { "epoch": 16.234733392910336, "grad_norm": 2.5290684700012207, "learning_rate": 5.207185365556646e-06, "loss": 0.543, "num_input_tokens_seen": 63285264, "step": 109000 }, { "epoch": 16.235478105451296, "grad_norm": 3.0038208961486816, "learning_rate": 5.205200481471662e-06, "loss": 0.5949, "num_input_tokens_seen": 63287856, "step": 109005 }, { "epoch": 16.236222817992257, "grad_norm": 1.8389720916748047, "learning_rate": 5.2032159317996955e-06, "loss": 0.4875, "num_input_tokens_seen": 63290320, "step": 109010 }, { "epoch": 16.236967530533214, "grad_norm": 0.6737361550331116, "learning_rate": 5.201231716574276e-06, "loss": 0.6466, "num_input_tokens_seen": 63293136, "step": 109015 }, { "epoch": 16.237712243074174, "grad_norm": 1.842013955116272, "learning_rate": 5.199247835828916e-06, "loss": 0.6089, "num_input_tokens_seen": 63296304, "step": 109020 }, { "epoch": 16.23845695561513, "grad_norm": 2.199702262878418, "learning_rate": 5.197264289597148e-06, "loss": 0.5712, "num_input_tokens_seen": 63299344, "step": 109025 }, { "epoch": 16.239201668156092, "grad_norm": 1.0710750818252563, "learning_rate": 5.195281077912473e-06, "loss": 0.4686, "num_input_tokens_seen": 63302384, "step": 109030 }, { "epoch": 16.239946380697052, "grad_norm": 1.6953202486038208, "learning_rate": 5.193298200808389e-06, "loss": 0.5112, "num_input_tokens_seen": 63305264, "step": 109035 }, { "epoch": 16.24069109323801, "grad_norm": 1.1051796674728394, "learning_rate": 5.191315658318408e-06, "loss": 0.3648, "num_input_tokens_seen": 63308240, "step": 109040 }, { "epoch": 16.24143580577897, "grad_norm": 2.1183950901031494, "learning_rate": 5.189333450476008e-06, "loss": 0.6215, "num_input_tokens_seen": 63311280, "step": 109045 }, { "epoch": 16.242180518319927, "grad_norm": 1.8576278686523438, "learning_rate": 5.187351577314692e-06, "loss": 0.7793, "num_input_tokens_seen": 63314128, "step": 109050 }, { "epoch": 16.242925230860887, "grad_norm": 2.6834325790405273, "learning_rate": 5.185370038867929e-06, "loss": 0.58, "num_input_tokens_seen": 63316848, "step": 109055 }, { "epoch": 16.243669943401848, "grad_norm": 1.8554044961929321, "learning_rate": 5.183388835169206e-06, "loss": 0.545, "num_input_tokens_seen": 63319696, "step": 109060 }, { "epoch": 16.244414655942805, "grad_norm": 3.1083483695983887, "learning_rate": 5.181407966251986e-06, "loss": 0.5859, "num_input_tokens_seen": 63322512, "step": 109065 }, { "epoch": 16.245159368483765, "grad_norm": 1.6809592247009277, "learning_rate": 5.179427432149733e-06, "loss": 0.5952, "num_input_tokens_seen": 63325328, "step": 109070 }, { "epoch": 16.245904081024726, "grad_norm": 1.9609549045562744, "learning_rate": 5.177447232895913e-06, "loss": 0.5588, "num_input_tokens_seen": 63328080, "step": 109075 }, { "epoch": 16.246648793565683, "grad_norm": 1.5895662307739258, "learning_rate": 5.1754673685239755e-06, "loss": 0.5296, "num_input_tokens_seen": 63331184, "step": 109080 }, { "epoch": 16.247393506106643, "grad_norm": 1.2758235931396484, "learning_rate": 5.173487839067371e-06, "loss": 0.5852, "num_input_tokens_seen": 63333936, "step": 109085 }, { "epoch": 16.2481382186476, "grad_norm": 1.6981048583984375, "learning_rate": 5.171508644559528e-06, "loss": 0.4808, "num_input_tokens_seen": 63336848, "step": 109090 }, { "epoch": 16.24888293118856, "grad_norm": 1.631622076034546, "learning_rate": 5.169529785033903e-06, "loss": 0.6535, "num_input_tokens_seen": 63339888, "step": 109095 }, { "epoch": 16.24962764372952, "grad_norm": 1.9388725757598877, "learning_rate": 5.167551260523909e-06, "loss": 0.5633, "num_input_tokens_seen": 63342960, "step": 109100 }, { "epoch": 16.25037235627048, "grad_norm": 1.0077273845672607, "learning_rate": 5.165573071062985e-06, "loss": 0.5089, "num_input_tokens_seen": 63346224, "step": 109105 }, { "epoch": 16.25111706881144, "grad_norm": 1.0743200778961182, "learning_rate": 5.163595216684541e-06, "loss": 0.5643, "num_input_tokens_seen": 63349136, "step": 109110 }, { "epoch": 16.2518617813524, "grad_norm": 0.984397828578949, "learning_rate": 5.161617697422003e-06, "loss": 0.6131, "num_input_tokens_seen": 63352240, "step": 109115 }, { "epoch": 16.252606493893357, "grad_norm": 1.5427671670913696, "learning_rate": 5.159640513308767e-06, "loss": 0.6358, "num_input_tokens_seen": 63354928, "step": 109120 }, { "epoch": 16.253351206434317, "grad_norm": 2.23907470703125, "learning_rate": 5.1576636643782376e-06, "loss": 0.8179, "num_input_tokens_seen": 63357744, "step": 109125 }, { "epoch": 16.254095918975274, "grad_norm": 1.4513375759124756, "learning_rate": 5.155687150663815e-06, "loss": 0.7766, "num_input_tokens_seen": 63360464, "step": 109130 }, { "epoch": 16.254840631516235, "grad_norm": 1.2352561950683594, "learning_rate": 5.153710972198894e-06, "loss": 0.6712, "num_input_tokens_seen": 63363600, "step": 109135 }, { "epoch": 16.255585344057195, "grad_norm": 3.2607321739196777, "learning_rate": 5.151735129016855e-06, "loss": 0.6733, "num_input_tokens_seen": 63366512, "step": 109140 }, { "epoch": 16.256330056598152, "grad_norm": 0.9037352204322815, "learning_rate": 5.149759621151068e-06, "loss": 0.6824, "num_input_tokens_seen": 63369488, "step": 109145 }, { "epoch": 16.257074769139113, "grad_norm": 2.2487244606018066, "learning_rate": 5.147784448634926e-06, "loss": 0.6519, "num_input_tokens_seen": 63372368, "step": 109150 }, { "epoch": 16.257819481680073, "grad_norm": 1.465678334236145, "learning_rate": 5.145809611501789e-06, "loss": 0.5768, "num_input_tokens_seen": 63375216, "step": 109155 }, { "epoch": 16.25856419422103, "grad_norm": 2.7588205337524414, "learning_rate": 5.143835109785014e-06, "loss": 0.6776, "num_input_tokens_seen": 63378064, "step": 109160 }, { "epoch": 16.25930890676199, "grad_norm": 1.7090791463851929, "learning_rate": 5.1418609435179676e-06, "loss": 0.8386, "num_input_tokens_seen": 63380848, "step": 109165 }, { "epoch": 16.260053619302948, "grad_norm": 1.7670339345932007, "learning_rate": 5.139887112733993e-06, "loss": 0.7041, "num_input_tokens_seen": 63383696, "step": 109170 }, { "epoch": 16.260798331843908, "grad_norm": 1.546797275543213, "learning_rate": 5.137913617466447e-06, "loss": 0.6229, "num_input_tokens_seen": 63386544, "step": 109175 }, { "epoch": 16.26154304438487, "grad_norm": 1.4478683471679688, "learning_rate": 5.1359404577486585e-06, "loss": 0.505, "num_input_tokens_seen": 63389520, "step": 109180 }, { "epoch": 16.262287756925826, "grad_norm": 1.152958631515503, "learning_rate": 5.13396763361397e-06, "loss": 0.4642, "num_input_tokens_seen": 63392240, "step": 109185 }, { "epoch": 16.263032469466786, "grad_norm": 2.962047576904297, "learning_rate": 5.131995145095705e-06, "loss": 0.8283, "num_input_tokens_seen": 63395280, "step": 109190 }, { "epoch": 16.263777182007743, "grad_norm": 3.5813539028167725, "learning_rate": 5.130022992227193e-06, "loss": 0.7156, "num_input_tokens_seen": 63398288, "step": 109195 }, { "epoch": 16.264521894548704, "grad_norm": 2.1229279041290283, "learning_rate": 5.128051175041748e-06, "loss": 0.5686, "num_input_tokens_seen": 63401040, "step": 109200 }, { "epoch": 16.265266607089664, "grad_norm": 1.405525803565979, "learning_rate": 5.126079693572683e-06, "loss": 0.6736, "num_input_tokens_seen": 63403696, "step": 109205 }, { "epoch": 16.26601131963062, "grad_norm": 1.650923728942871, "learning_rate": 5.124108547853301e-06, "loss": 0.5645, "num_input_tokens_seen": 63406640, "step": 109210 }, { "epoch": 16.266756032171582, "grad_norm": 3.4544639587402344, "learning_rate": 5.122137737916896e-06, "loss": 0.7867, "num_input_tokens_seen": 63409680, "step": 109215 }, { "epoch": 16.267500744712542, "grad_norm": 1.3181090354919434, "learning_rate": 5.120167263796779e-06, "loss": 0.4731, "num_input_tokens_seen": 63412496, "step": 109220 }, { "epoch": 16.2682454572535, "grad_norm": 1.8659974336624146, "learning_rate": 5.118197125526228e-06, "loss": 0.7786, "num_input_tokens_seen": 63415632, "step": 109225 }, { "epoch": 16.26899016979446, "grad_norm": 1.1390821933746338, "learning_rate": 5.116227323138531e-06, "loss": 0.4811, "num_input_tokens_seen": 63418512, "step": 109230 }, { "epoch": 16.269734882335417, "grad_norm": 1.5467867851257324, "learning_rate": 5.114257856666968e-06, "loss": 0.5541, "num_input_tokens_seen": 63421328, "step": 109235 }, { "epoch": 16.270479594876377, "grad_norm": 1.6729282140731812, "learning_rate": 5.112288726144798e-06, "loss": 0.6066, "num_input_tokens_seen": 63424336, "step": 109240 }, { "epoch": 16.271224307417338, "grad_norm": 1.2119070291519165, "learning_rate": 5.110319931605306e-06, "loss": 0.4886, "num_input_tokens_seen": 63427312, "step": 109245 }, { "epoch": 16.271969019958295, "grad_norm": 2.249300479888916, "learning_rate": 5.1083514730817375e-06, "loss": 0.5239, "num_input_tokens_seen": 63430288, "step": 109250 }, { "epoch": 16.272713732499255, "grad_norm": 2.5134363174438477, "learning_rate": 5.106383350607358e-06, "loss": 0.7637, "num_input_tokens_seen": 63433136, "step": 109255 }, { "epoch": 16.273458445040216, "grad_norm": 0.9303743839263916, "learning_rate": 5.104415564215409e-06, "loss": 0.7245, "num_input_tokens_seen": 63435696, "step": 109260 }, { "epoch": 16.274203157581173, "grad_norm": 1.7439240217208862, "learning_rate": 5.102448113939143e-06, "loss": 0.6077, "num_input_tokens_seen": 63439088, "step": 109265 }, { "epoch": 16.274947870122134, "grad_norm": 1.3254958391189575, "learning_rate": 5.100480999811794e-06, "loss": 0.6111, "num_input_tokens_seen": 63441968, "step": 109270 }, { "epoch": 16.27569258266309, "grad_norm": 3.216848134994507, "learning_rate": 5.09851422186659e-06, "loss": 0.5431, "num_input_tokens_seen": 63444816, "step": 109275 }, { "epoch": 16.27643729520405, "grad_norm": 1.8343257904052734, "learning_rate": 5.096547780136765e-06, "loss": 0.6611, "num_input_tokens_seen": 63447504, "step": 109280 }, { "epoch": 16.27718200774501, "grad_norm": 1.3488826751708984, "learning_rate": 5.0945816746555295e-06, "loss": 0.7407, "num_input_tokens_seen": 63450480, "step": 109285 }, { "epoch": 16.27792672028597, "grad_norm": 2.103484630584717, "learning_rate": 5.092615905456111e-06, "loss": 0.669, "num_input_tokens_seen": 63453680, "step": 109290 }, { "epoch": 16.27867143282693, "grad_norm": 1.6709356307983398, "learning_rate": 5.090650472571709e-06, "loss": 0.6498, "num_input_tokens_seen": 63456624, "step": 109295 }, { "epoch": 16.27941614536789, "grad_norm": 1.8978484869003296, "learning_rate": 5.088685376035538e-06, "loss": 0.5563, "num_input_tokens_seen": 63459440, "step": 109300 }, { "epoch": 16.280160857908847, "grad_norm": 1.5386244058609009, "learning_rate": 5.086720615880783e-06, "loss": 0.4587, "num_input_tokens_seen": 63462128, "step": 109305 }, { "epoch": 16.280905570449807, "grad_norm": 1.2536669969558716, "learning_rate": 5.084756192140652e-06, "loss": 0.4629, "num_input_tokens_seen": 63464784, "step": 109310 }, { "epoch": 16.281650282990764, "grad_norm": 1.0707558393478394, "learning_rate": 5.082792104848325e-06, "loss": 0.4218, "num_input_tokens_seen": 63467472, "step": 109315 }, { "epoch": 16.282394995531725, "grad_norm": 1.2736330032348633, "learning_rate": 5.080828354036974e-06, "loss": 0.5708, "num_input_tokens_seen": 63470288, "step": 109320 }, { "epoch": 16.283139708072685, "grad_norm": 1.3651158809661865, "learning_rate": 5.078864939739789e-06, "loss": 0.5672, "num_input_tokens_seen": 63472848, "step": 109325 }, { "epoch": 16.283884420613642, "grad_norm": 1.3534369468688965, "learning_rate": 5.076901861989927e-06, "loss": 0.624, "num_input_tokens_seen": 63475984, "step": 109330 }, { "epoch": 16.284629133154603, "grad_norm": 1.5033549070358276, "learning_rate": 5.074939120820568e-06, "loss": 0.6407, "num_input_tokens_seen": 63478672, "step": 109335 }, { "epoch": 16.285373845695563, "grad_norm": 0.8359810709953308, "learning_rate": 5.072976716264863e-06, "loss": 0.4086, "num_input_tokens_seen": 63481424, "step": 109340 }, { "epoch": 16.28611855823652, "grad_norm": 1.6714036464691162, "learning_rate": 5.07101464835596e-06, "loss": 0.6226, "num_input_tokens_seen": 63484048, "step": 109345 }, { "epoch": 16.28686327077748, "grad_norm": 1.1553677320480347, "learning_rate": 5.069052917127004e-06, "loss": 0.6556, "num_input_tokens_seen": 63487152, "step": 109350 }, { "epoch": 16.287607983318438, "grad_norm": 2.008744955062866, "learning_rate": 5.06709152261115e-06, "loss": 0.6921, "num_input_tokens_seen": 63490064, "step": 109355 }, { "epoch": 16.2883526958594, "grad_norm": 2.110318660736084, "learning_rate": 5.065130464841525e-06, "loss": 0.4272, "num_input_tokens_seen": 63493008, "step": 109360 }, { "epoch": 16.28909740840036, "grad_norm": 2.4045753479003906, "learning_rate": 5.063169743851251e-06, "loss": 0.6821, "num_input_tokens_seen": 63495824, "step": 109365 }, { "epoch": 16.289842120941316, "grad_norm": 1.169793963432312, "learning_rate": 5.061209359673471e-06, "loss": 0.4989, "num_input_tokens_seen": 63498384, "step": 109370 }, { "epoch": 16.290586833482276, "grad_norm": 1.2510344982147217, "learning_rate": 5.059249312341286e-06, "loss": 0.5522, "num_input_tokens_seen": 63501264, "step": 109375 }, { "epoch": 16.291331546023233, "grad_norm": 1.425291895866394, "learning_rate": 5.057289601887824e-06, "loss": 0.5257, "num_input_tokens_seen": 63504016, "step": 109380 }, { "epoch": 16.292076258564194, "grad_norm": 1.6388449668884277, "learning_rate": 5.055330228346178e-06, "loss": 0.6382, "num_input_tokens_seen": 63506800, "step": 109385 }, { "epoch": 16.292820971105154, "grad_norm": 1.09323251247406, "learning_rate": 5.053371191749465e-06, "loss": 0.7064, "num_input_tokens_seen": 63509872, "step": 109390 }, { "epoch": 16.29356568364611, "grad_norm": 1.301065444946289, "learning_rate": 5.051412492130772e-06, "loss": 0.5356, "num_input_tokens_seen": 63512880, "step": 109395 }, { "epoch": 16.294310396187072, "grad_norm": 2.943125009536743, "learning_rate": 5.049454129523185e-06, "loss": 0.6218, "num_input_tokens_seen": 63515824, "step": 109400 }, { "epoch": 16.295055108728032, "grad_norm": 1.9511815309524536, "learning_rate": 5.047496103959798e-06, "loss": 0.8109, "num_input_tokens_seen": 63518864, "step": 109405 }, { "epoch": 16.29579982126899, "grad_norm": 0.9021894931793213, "learning_rate": 5.045538415473686e-06, "loss": 0.4296, "num_input_tokens_seen": 63521872, "step": 109410 }, { "epoch": 16.29654453380995, "grad_norm": 1.9810794591903687, "learning_rate": 5.0435810640979215e-06, "loss": 0.6258, "num_input_tokens_seen": 63525072, "step": 109415 }, { "epoch": 16.297289246350907, "grad_norm": 1.9965780973434448, "learning_rate": 5.041624049865567e-06, "loss": 0.5416, "num_input_tokens_seen": 63528048, "step": 109420 }, { "epoch": 16.298033958891867, "grad_norm": 3.999032497406006, "learning_rate": 5.039667372809695e-06, "loss": 0.7803, "num_input_tokens_seen": 63530960, "step": 109425 }, { "epoch": 16.298778671432828, "grad_norm": 1.5050898790359497, "learning_rate": 5.0377110329633495e-06, "loss": 0.6584, "num_input_tokens_seen": 63533968, "step": 109430 }, { "epoch": 16.299523383973785, "grad_norm": 0.7264499664306641, "learning_rate": 5.035755030359593e-06, "loss": 0.4348, "num_input_tokens_seen": 63536464, "step": 109435 }, { "epoch": 16.300268096514746, "grad_norm": 2.422818183898926, "learning_rate": 5.0337993650314665e-06, "loss": 0.5894, "num_input_tokens_seen": 63539440, "step": 109440 }, { "epoch": 16.301012809055706, "grad_norm": 0.9695323705673218, "learning_rate": 5.0318440370119985e-06, "loss": 0.3792, "num_input_tokens_seen": 63542128, "step": 109445 }, { "epoch": 16.301757521596663, "grad_norm": 1.734777808189392, "learning_rate": 5.029889046334238e-06, "loss": 0.7628, "num_input_tokens_seen": 63545040, "step": 109450 }, { "epoch": 16.302502234137624, "grad_norm": 1.2166117429733276, "learning_rate": 5.0279343930312e-06, "loss": 0.5638, "num_input_tokens_seen": 63547824, "step": 109455 }, { "epoch": 16.30324694667858, "grad_norm": 1.0419702529907227, "learning_rate": 5.025980077135917e-06, "loss": 0.5274, "num_input_tokens_seen": 63550928, "step": 109460 }, { "epoch": 16.30399165921954, "grad_norm": 0.9759003520011902, "learning_rate": 5.0240260986814e-06, "loss": 0.5162, "num_input_tokens_seen": 63554224, "step": 109465 }, { "epoch": 16.3047363717605, "grad_norm": 1.1718699932098389, "learning_rate": 5.022072457700658e-06, "loss": 0.5546, "num_input_tokens_seen": 63557520, "step": 109470 }, { "epoch": 16.30548108430146, "grad_norm": 0.5814505219459534, "learning_rate": 5.020119154226699e-06, "loss": 0.447, "num_input_tokens_seen": 63560208, "step": 109475 }, { "epoch": 16.30622579684242, "grad_norm": 1.3255244493484497, "learning_rate": 5.018166188292514e-06, "loss": 0.5587, "num_input_tokens_seen": 63563120, "step": 109480 }, { "epoch": 16.30697050938338, "grad_norm": 2.048121690750122, "learning_rate": 5.016213559931107e-06, "loss": 0.6607, "num_input_tokens_seen": 63565968, "step": 109485 }, { "epoch": 16.307715221924337, "grad_norm": 1.3784217834472656, "learning_rate": 5.014261269175457e-06, "loss": 0.5324, "num_input_tokens_seen": 63568752, "step": 109490 }, { "epoch": 16.308459934465297, "grad_norm": 1.3010962009429932, "learning_rate": 5.012309316058555e-06, "loss": 0.539, "num_input_tokens_seen": 63571664, "step": 109495 }, { "epoch": 16.309204647006254, "grad_norm": 1.5055861473083496, "learning_rate": 5.0103577006133685e-06, "loss": 0.4478, "num_input_tokens_seen": 63574416, "step": 109500 }, { "epoch": 16.309949359547215, "grad_norm": 1.8821696043014526, "learning_rate": 5.008406422872878e-06, "loss": 0.6266, "num_input_tokens_seen": 63577520, "step": 109505 }, { "epoch": 16.310694072088175, "grad_norm": 1.2740867137908936, "learning_rate": 5.0064554828700345e-06, "loss": 0.4909, "num_input_tokens_seen": 63580400, "step": 109510 }, { "epoch": 16.311438784629132, "grad_norm": 1.236771821975708, "learning_rate": 5.004504880637812e-06, "loss": 0.5869, "num_input_tokens_seen": 63583056, "step": 109515 }, { "epoch": 16.312183497170093, "grad_norm": 2.795255661010742, "learning_rate": 5.002554616209157e-06, "loss": 0.7395, "num_input_tokens_seen": 63586544, "step": 109520 }, { "epoch": 16.312928209711053, "grad_norm": 1.587534785270691, "learning_rate": 5.000604689617011e-06, "loss": 0.3337, "num_input_tokens_seen": 63589232, "step": 109525 }, { "epoch": 16.31367292225201, "grad_norm": 2.6314117908477783, "learning_rate": 4.998655100894328e-06, "loss": 0.7227, "num_input_tokens_seen": 63592048, "step": 109530 }, { "epoch": 16.31441763479297, "grad_norm": 2.224074602127075, "learning_rate": 4.996705850074041e-06, "loss": 0.7567, "num_input_tokens_seen": 63595184, "step": 109535 }, { "epoch": 16.315162347333928, "grad_norm": 1.7233237028121948, "learning_rate": 4.994756937189076e-06, "loss": 0.7861, "num_input_tokens_seen": 63598160, "step": 109540 }, { "epoch": 16.31590705987489, "grad_norm": 1.4860929250717163, "learning_rate": 4.992808362272353e-06, "loss": 0.664, "num_input_tokens_seen": 63601168, "step": 109545 }, { "epoch": 16.31665177241585, "grad_norm": 1.103665828704834, "learning_rate": 4.990860125356806e-06, "loss": 0.6754, "num_input_tokens_seen": 63604176, "step": 109550 }, { "epoch": 16.317396484956806, "grad_norm": 2.2591464519500732, "learning_rate": 4.988912226475342e-06, "loss": 0.7843, "num_input_tokens_seen": 63606672, "step": 109555 }, { "epoch": 16.318141197497766, "grad_norm": 2.0723073482513428, "learning_rate": 4.986964665660859e-06, "loss": 0.6793, "num_input_tokens_seen": 63609904, "step": 109560 }, { "epoch": 16.318885910038723, "grad_norm": 2.023268222808838, "learning_rate": 4.985017442946274e-06, "loss": 0.6101, "num_input_tokens_seen": 63612560, "step": 109565 }, { "epoch": 16.319630622579684, "grad_norm": 2.4570021629333496, "learning_rate": 4.983070558364472e-06, "loss": 0.7143, "num_input_tokens_seen": 63615664, "step": 109570 }, { "epoch": 16.320375335120644, "grad_norm": 2.490910053253174, "learning_rate": 4.981124011948355e-06, "loss": 0.7131, "num_input_tokens_seen": 63618928, "step": 109575 }, { "epoch": 16.3211200476616, "grad_norm": 1.0704419612884521, "learning_rate": 4.979177803730794e-06, "loss": 0.4222, "num_input_tokens_seen": 63621936, "step": 109580 }, { "epoch": 16.321864760202562, "grad_norm": 1.2101479768753052, "learning_rate": 4.9772319337446835e-06, "loss": 0.6567, "num_input_tokens_seen": 63624592, "step": 109585 }, { "epoch": 16.322609472743522, "grad_norm": 1.9564414024353027, "learning_rate": 4.975286402022883e-06, "loss": 0.758, "num_input_tokens_seen": 63627536, "step": 109590 }, { "epoch": 16.32335418528448, "grad_norm": 1.67612624168396, "learning_rate": 4.973341208598273e-06, "loss": 0.6733, "num_input_tokens_seen": 63630544, "step": 109595 }, { "epoch": 16.32409889782544, "grad_norm": 2.899301767349243, "learning_rate": 4.971396353503707e-06, "loss": 0.4725, "num_input_tokens_seen": 63633328, "step": 109600 }, { "epoch": 16.324843610366397, "grad_norm": 1.22465980052948, "learning_rate": 4.969451836772046e-06, "loss": 0.4652, "num_input_tokens_seen": 63636144, "step": 109605 }, { "epoch": 16.325588322907358, "grad_norm": 1.844915509223938, "learning_rate": 4.9675076584361355e-06, "loss": 0.6249, "num_input_tokens_seen": 63638704, "step": 109610 }, { "epoch": 16.326333035448318, "grad_norm": 2.8076186180114746, "learning_rate": 4.965563818528818e-06, "loss": 0.5049, "num_input_tokens_seen": 63641744, "step": 109615 }, { "epoch": 16.327077747989275, "grad_norm": 1.3679174184799194, "learning_rate": 4.9636203170829424e-06, "loss": 0.6917, "num_input_tokens_seen": 63645008, "step": 109620 }, { "epoch": 16.327822460530236, "grad_norm": 1.169945478439331, "learning_rate": 4.9616771541313335e-06, "loss": 0.5637, "num_input_tokens_seen": 63647696, "step": 109625 }, { "epoch": 16.328567173071196, "grad_norm": 1.5486114025115967, "learning_rate": 4.9597343297068274e-06, "loss": 0.6861, "num_input_tokens_seen": 63650416, "step": 109630 }, { "epoch": 16.329311885612153, "grad_norm": 2.0144689083099365, "learning_rate": 4.957791843842244e-06, "loss": 0.5649, "num_input_tokens_seen": 63653296, "step": 109635 }, { "epoch": 16.330056598153114, "grad_norm": 2.486116886138916, "learning_rate": 4.955849696570392e-06, "loss": 0.856, "num_input_tokens_seen": 63656144, "step": 109640 }, { "epoch": 16.33080131069407, "grad_norm": 1.3115348815917969, "learning_rate": 4.953907887924089e-06, "loss": 0.6026, "num_input_tokens_seen": 63659248, "step": 109645 }, { "epoch": 16.33154602323503, "grad_norm": 2.236438751220703, "learning_rate": 4.9519664179361355e-06, "loss": 0.7493, "num_input_tokens_seen": 63662032, "step": 109650 }, { "epoch": 16.33229073577599, "grad_norm": 3.8702356815338135, "learning_rate": 4.95002528663934e-06, "loss": 0.666, "num_input_tokens_seen": 63664944, "step": 109655 }, { "epoch": 16.33303544831695, "grad_norm": 1.5583477020263672, "learning_rate": 4.948084494066482e-06, "loss": 0.5368, "num_input_tokens_seen": 63668016, "step": 109660 }, { "epoch": 16.33378016085791, "grad_norm": 2.2698633670806885, "learning_rate": 4.946144040250361e-06, "loss": 0.5632, "num_input_tokens_seen": 63671152, "step": 109665 }, { "epoch": 16.33452487339887, "grad_norm": 1.641981601715088, "learning_rate": 4.944203925223759e-06, "loss": 0.4653, "num_input_tokens_seen": 63673872, "step": 109670 }, { "epoch": 16.335269585939827, "grad_norm": 1.0174195766448975, "learning_rate": 4.942264149019446e-06, "loss": 0.5102, "num_input_tokens_seen": 63676656, "step": 109675 }, { "epoch": 16.336014298480787, "grad_norm": 1.6096829175949097, "learning_rate": 4.940324711670194e-06, "loss": 0.5396, "num_input_tokens_seen": 63679792, "step": 109680 }, { "epoch": 16.336759011021744, "grad_norm": 2.3336806297302246, "learning_rate": 4.93838561320876e-06, "loss": 0.5718, "num_input_tokens_seen": 63682736, "step": 109685 }, { "epoch": 16.337503723562705, "grad_norm": 3.182565212249756, "learning_rate": 4.93644685366792e-06, "loss": 0.7183, "num_input_tokens_seen": 63685776, "step": 109690 }, { "epoch": 16.338248436103665, "grad_norm": 1.0636024475097656, "learning_rate": 4.934508433080412e-06, "loss": 0.4981, "num_input_tokens_seen": 63688912, "step": 109695 }, { "epoch": 16.338993148644622, "grad_norm": 1.567776083946228, "learning_rate": 4.932570351478996e-06, "loss": 0.5234, "num_input_tokens_seen": 63692048, "step": 109700 }, { "epoch": 16.339737861185583, "grad_norm": 1.3321888446807861, "learning_rate": 4.930632608896402e-06, "loss": 0.5872, "num_input_tokens_seen": 63694864, "step": 109705 }, { "epoch": 16.34048257372654, "grad_norm": 2.8169491291046143, "learning_rate": 4.92869520536538e-06, "loss": 0.4761, "num_input_tokens_seen": 63697904, "step": 109710 }, { "epoch": 16.3412272862675, "grad_norm": 1.0163872241973877, "learning_rate": 4.926758140918647e-06, "loss": 0.4262, "num_input_tokens_seen": 63700720, "step": 109715 }, { "epoch": 16.34197199880846, "grad_norm": 1.6226069927215576, "learning_rate": 4.924821415588937e-06, "loss": 0.6474, "num_input_tokens_seen": 63703504, "step": 109720 }, { "epoch": 16.342716711349418, "grad_norm": 1.5748878717422485, "learning_rate": 4.922885029408969e-06, "loss": 0.6277, "num_input_tokens_seen": 63706192, "step": 109725 }, { "epoch": 16.34346142389038, "grad_norm": 1.9127304553985596, "learning_rate": 4.920948982411444e-06, "loss": 0.5287, "num_input_tokens_seen": 63708944, "step": 109730 }, { "epoch": 16.34420613643134, "grad_norm": 1.1935830116271973, "learning_rate": 4.919013274629087e-06, "loss": 0.6531, "num_input_tokens_seen": 63711664, "step": 109735 }, { "epoch": 16.344950848972296, "grad_norm": 1.6987764835357666, "learning_rate": 4.9170779060945916e-06, "loss": 0.5942, "num_input_tokens_seen": 63714608, "step": 109740 }, { "epoch": 16.345695561513256, "grad_norm": 1.5008138418197632, "learning_rate": 4.915142876840653e-06, "loss": 0.425, "num_input_tokens_seen": 63717648, "step": 109745 }, { "epoch": 16.346440274054213, "grad_norm": 1.6794577836990356, "learning_rate": 4.9132081868999535e-06, "loss": 0.5758, "num_input_tokens_seen": 63720528, "step": 109750 }, { "epoch": 16.347184986595174, "grad_norm": 2.0588841438293457, "learning_rate": 4.911273836305194e-06, "loss": 0.5225, "num_input_tokens_seen": 63723472, "step": 109755 }, { "epoch": 16.347929699136134, "grad_norm": 1.611947774887085, "learning_rate": 4.909339825089049e-06, "loss": 0.6129, "num_input_tokens_seen": 63726160, "step": 109760 }, { "epoch": 16.34867441167709, "grad_norm": 1.7912647724151611, "learning_rate": 4.9074061532841774e-06, "loss": 0.5728, "num_input_tokens_seen": 63729200, "step": 109765 }, { "epoch": 16.349419124218052, "grad_norm": 1.6175296306610107, "learning_rate": 4.905472820923265e-06, "loss": 0.5314, "num_input_tokens_seen": 63731888, "step": 109770 }, { "epoch": 16.350163836759013, "grad_norm": 1.898159146308899, "learning_rate": 4.903539828038961e-06, "loss": 0.4578, "num_input_tokens_seen": 63734608, "step": 109775 }, { "epoch": 16.35090854929997, "grad_norm": 0.9978735446929932, "learning_rate": 4.901607174663933e-06, "loss": 0.6871, "num_input_tokens_seen": 63737616, "step": 109780 }, { "epoch": 16.35165326184093, "grad_norm": 1.4914648532867432, "learning_rate": 4.899674860830819e-06, "loss": 0.5591, "num_input_tokens_seen": 63740336, "step": 109785 }, { "epoch": 16.352397974381887, "grad_norm": 2.8430449962615967, "learning_rate": 4.897742886572274e-06, "loss": 0.7634, "num_input_tokens_seen": 63742960, "step": 109790 }, { "epoch": 16.353142686922848, "grad_norm": 1.3547015190124512, "learning_rate": 4.8958112519209315e-06, "loss": 0.5805, "num_input_tokens_seen": 63745776, "step": 109795 }, { "epoch": 16.353887399463808, "grad_norm": 2.160719633102417, "learning_rate": 4.8938799569094275e-06, "loss": 0.5918, "num_input_tokens_seen": 63748528, "step": 109800 }, { "epoch": 16.354632112004765, "grad_norm": 1.4938126802444458, "learning_rate": 4.891949001570384e-06, "loss": 0.7683, "num_input_tokens_seen": 63751696, "step": 109805 }, { "epoch": 16.355376824545726, "grad_norm": 3.4042913913726807, "learning_rate": 4.890018385936421e-06, "loss": 0.5321, "num_input_tokens_seen": 63754576, "step": 109810 }, { "epoch": 16.356121537086686, "grad_norm": 2.157335042953491, "learning_rate": 4.888088110040162e-06, "loss": 0.5203, "num_input_tokens_seen": 63757456, "step": 109815 }, { "epoch": 16.356866249627643, "grad_norm": 1.5628280639648438, "learning_rate": 4.88615817391421e-06, "loss": 0.6503, "num_input_tokens_seen": 63760400, "step": 109820 }, { "epoch": 16.357610962168604, "grad_norm": 0.6613379120826721, "learning_rate": 4.884228577591177e-06, "loss": 0.4306, "num_input_tokens_seen": 63763248, "step": 109825 }, { "epoch": 16.35835567470956, "grad_norm": 1.670120120048523, "learning_rate": 4.882299321103653e-06, "loss": 0.6597, "num_input_tokens_seen": 63766256, "step": 109830 }, { "epoch": 16.35910038725052, "grad_norm": 2.986729621887207, "learning_rate": 4.880370404484242e-06, "loss": 0.708, "num_input_tokens_seen": 63768944, "step": 109835 }, { "epoch": 16.35984509979148, "grad_norm": 1.3475322723388672, "learning_rate": 4.87844182776552e-06, "loss": 0.5894, "num_input_tokens_seen": 63771664, "step": 109840 }, { "epoch": 16.36058981233244, "grad_norm": 1.0903853178024292, "learning_rate": 4.87651359098007e-06, "loss": 0.5974, "num_input_tokens_seen": 63774448, "step": 109845 }, { "epoch": 16.3613345248734, "grad_norm": 1.725496530532837, "learning_rate": 4.874585694160477e-06, "loss": 0.5897, "num_input_tokens_seen": 63777264, "step": 109850 }, { "epoch": 16.36207923741436, "grad_norm": 1.4307067394256592, "learning_rate": 4.872658137339295e-06, "loss": 0.6285, "num_input_tokens_seen": 63780112, "step": 109855 }, { "epoch": 16.362823949955317, "grad_norm": 1.4245543479919434, "learning_rate": 4.870730920549108e-06, "loss": 0.5302, "num_input_tokens_seen": 63782832, "step": 109860 }, { "epoch": 16.363568662496277, "grad_norm": 1.325350046157837, "learning_rate": 4.868804043822458e-06, "loss": 0.528, "num_input_tokens_seen": 63785936, "step": 109865 }, { "epoch": 16.364313375037234, "grad_norm": 1.283847451210022, "learning_rate": 4.866877507191908e-06, "loss": 0.5256, "num_input_tokens_seen": 63788848, "step": 109870 }, { "epoch": 16.365058087578195, "grad_norm": 1.8115291595458984, "learning_rate": 4.864951310689991e-06, "loss": 0.6451, "num_input_tokens_seen": 63791856, "step": 109875 }, { "epoch": 16.365802800119155, "grad_norm": 1.2800920009613037, "learning_rate": 4.863025454349266e-06, "loss": 0.5253, "num_input_tokens_seen": 63794672, "step": 109880 }, { "epoch": 16.366547512660112, "grad_norm": 1.1835272312164307, "learning_rate": 4.861099938202257e-06, "loss": 0.5996, "num_input_tokens_seen": 63797648, "step": 109885 }, { "epoch": 16.367292225201073, "grad_norm": 2.468289375305176, "learning_rate": 4.859174762281493e-06, "loss": 0.5785, "num_input_tokens_seen": 63800848, "step": 109890 }, { "epoch": 16.36803693774203, "grad_norm": 1.4123127460479736, "learning_rate": 4.857249926619506e-06, "loss": 0.5837, "num_input_tokens_seen": 63803952, "step": 109895 }, { "epoch": 16.36878165028299, "grad_norm": 1.176440954208374, "learning_rate": 4.855325431248803e-06, "loss": 0.4737, "num_input_tokens_seen": 63806672, "step": 109900 }, { "epoch": 16.36952636282395, "grad_norm": 3.5043911933898926, "learning_rate": 4.853401276201908e-06, "loss": 0.379, "num_input_tokens_seen": 63809616, "step": 109905 }, { "epoch": 16.370271075364908, "grad_norm": 2.1824734210968018, "learning_rate": 4.851477461511317e-06, "loss": 0.5782, "num_input_tokens_seen": 63812336, "step": 109910 }, { "epoch": 16.37101578790587, "grad_norm": 0.9430261254310608, "learning_rate": 4.84955398720954e-06, "loss": 0.6274, "num_input_tokens_seen": 63815568, "step": 109915 }, { "epoch": 16.37176050044683, "grad_norm": 1.7579830884933472, "learning_rate": 4.8476308533290714e-06, "loss": 0.6792, "num_input_tokens_seen": 63818224, "step": 109920 }, { "epoch": 16.372505212987786, "grad_norm": 1.5339007377624512, "learning_rate": 4.8457080599023905e-06, "loss": 0.5626, "num_input_tokens_seen": 63821072, "step": 109925 }, { "epoch": 16.373249925528746, "grad_norm": 4.970703601837158, "learning_rate": 4.843785606961995e-06, "loss": 0.6321, "num_input_tokens_seen": 63824688, "step": 109930 }, { "epoch": 16.373994638069703, "grad_norm": 1.0787674188613892, "learning_rate": 4.8418634945403555e-06, "loss": 0.4655, "num_input_tokens_seen": 63827952, "step": 109935 }, { "epoch": 16.374739350610664, "grad_norm": 1.4135463237762451, "learning_rate": 4.839941722669944e-06, "loss": 0.5063, "num_input_tokens_seen": 63830832, "step": 109940 }, { "epoch": 16.375484063151625, "grad_norm": 2.0254693031311035, "learning_rate": 4.8380202913832215e-06, "loss": 0.6034, "num_input_tokens_seen": 63833776, "step": 109945 }, { "epoch": 16.37622877569258, "grad_norm": 4.787895202636719, "learning_rate": 4.83609920071266e-06, "loss": 0.5884, "num_input_tokens_seen": 63836592, "step": 109950 }, { "epoch": 16.376973488233542, "grad_norm": 2.0862529277801514, "learning_rate": 4.834178450690704e-06, "loss": 0.5137, "num_input_tokens_seen": 63839504, "step": 109955 }, { "epoch": 16.377718200774503, "grad_norm": 1.6968942880630493, "learning_rate": 4.832258041349813e-06, "loss": 0.443, "num_input_tokens_seen": 63842320, "step": 109960 }, { "epoch": 16.37846291331546, "grad_norm": 1.4041639566421509, "learning_rate": 4.830337972722424e-06, "loss": 0.7772, "num_input_tokens_seen": 63845040, "step": 109965 }, { "epoch": 16.37920762585642, "grad_norm": 1.6332122087478638, "learning_rate": 4.828418244840968e-06, "loss": 0.7127, "num_input_tokens_seen": 63847920, "step": 109970 }, { "epoch": 16.379952338397377, "grad_norm": 2.0201406478881836, "learning_rate": 4.8264988577378934e-06, "loss": 0.691, "num_input_tokens_seen": 63850864, "step": 109975 }, { "epoch": 16.380697050938338, "grad_norm": 1.41786789894104, "learning_rate": 4.824579811445609e-06, "loss": 0.5823, "num_input_tokens_seen": 63854128, "step": 109980 }, { "epoch": 16.381441763479298, "grad_norm": 1.2849088907241821, "learning_rate": 4.822661105996551e-06, "loss": 0.566, "num_input_tokens_seen": 63857232, "step": 109985 }, { "epoch": 16.382186476020255, "grad_norm": 1.2512857913970947, "learning_rate": 4.82074274142312e-06, "loss": 0.5938, "num_input_tokens_seen": 63859888, "step": 109990 }, { "epoch": 16.382931188561216, "grad_norm": 1.943846583366394, "learning_rate": 4.818824717757736e-06, "loss": 0.788, "num_input_tokens_seen": 63862832, "step": 109995 }, { "epoch": 16.383675901102176, "grad_norm": 2.040411949157715, "learning_rate": 4.816907035032797e-06, "loss": 0.5925, "num_input_tokens_seen": 63865616, "step": 110000 }, { "epoch": 16.384420613643133, "grad_norm": 1.3613593578338623, "learning_rate": 4.814989693280703e-06, "loss": 0.4825, "num_input_tokens_seen": 63868496, "step": 110005 }, { "epoch": 16.385165326184094, "grad_norm": 2.368227481842041, "learning_rate": 4.81307269253384e-06, "loss": 0.6236, "num_input_tokens_seen": 63871792, "step": 110010 }, { "epoch": 16.38591003872505, "grad_norm": 1.6279758214950562, "learning_rate": 4.811156032824593e-06, "loss": 0.6378, "num_input_tokens_seen": 63874352, "step": 110015 }, { "epoch": 16.38665475126601, "grad_norm": 1.5224413871765137, "learning_rate": 4.8092397141853515e-06, "loss": 0.5611, "num_input_tokens_seen": 63877264, "step": 110020 }, { "epoch": 16.38739946380697, "grad_norm": 1.6420437097549438, "learning_rate": 4.807323736648475e-06, "loss": 0.5269, "num_input_tokens_seen": 63880368, "step": 110025 }, { "epoch": 16.38814417634793, "grad_norm": 1.531522512435913, "learning_rate": 4.80540810024635e-06, "loss": 0.4274, "num_input_tokens_seen": 63883088, "step": 110030 }, { "epoch": 16.38888888888889, "grad_norm": 1.20182466506958, "learning_rate": 4.8034928050113256e-06, "loss": 0.6566, "num_input_tokens_seen": 63885744, "step": 110035 }, { "epoch": 16.38963360142985, "grad_norm": 1.79960036277771, "learning_rate": 4.8015778509757665e-06, "loss": 0.6168, "num_input_tokens_seen": 63888624, "step": 110040 }, { "epoch": 16.390378313970807, "grad_norm": 1.2320562601089478, "learning_rate": 4.799663238172022e-06, "loss": 0.6657, "num_input_tokens_seen": 63891664, "step": 110045 }, { "epoch": 16.391123026511767, "grad_norm": 1.2334915399551392, "learning_rate": 4.7977489666324285e-06, "loss": 0.4893, "num_input_tokens_seen": 63894416, "step": 110050 }, { "epoch": 16.391867739052724, "grad_norm": 1.473010778427124, "learning_rate": 4.7958350363893424e-06, "loss": 0.5876, "num_input_tokens_seen": 63897520, "step": 110055 }, { "epoch": 16.392612451593685, "grad_norm": 2.482779026031494, "learning_rate": 4.793921447475083e-06, "loss": 0.6851, "num_input_tokens_seen": 63900336, "step": 110060 }, { "epoch": 16.393357164134645, "grad_norm": 1.4304119348526, "learning_rate": 4.7920081999219875e-06, "loss": 0.4822, "num_input_tokens_seen": 63903280, "step": 110065 }, { "epoch": 16.394101876675602, "grad_norm": 2.272700548171997, "learning_rate": 4.790095293762379e-06, "loss": 0.4775, "num_input_tokens_seen": 63906224, "step": 110070 }, { "epoch": 16.394846589216563, "grad_norm": 1.7729603052139282, "learning_rate": 4.788182729028565e-06, "loss": 0.6388, "num_input_tokens_seen": 63908944, "step": 110075 }, { "epoch": 16.39559130175752, "grad_norm": 2.424100160598755, "learning_rate": 4.786270505752866e-06, "loss": 0.6363, "num_input_tokens_seen": 63911728, "step": 110080 }, { "epoch": 16.39633601429848, "grad_norm": 2.0232326984405518, "learning_rate": 4.784358623967572e-06, "loss": 0.5343, "num_input_tokens_seen": 63914480, "step": 110085 }, { "epoch": 16.39708072683944, "grad_norm": 1.8975462913513184, "learning_rate": 4.782447083705002e-06, "loss": 0.5258, "num_input_tokens_seen": 63917360, "step": 110090 }, { "epoch": 16.397825439380398, "grad_norm": 2.069594144821167, "learning_rate": 4.780535884997433e-06, "loss": 0.5816, "num_input_tokens_seen": 63920368, "step": 110095 }, { "epoch": 16.39857015192136, "grad_norm": 1.9460707902908325, "learning_rate": 4.7786250278771675e-06, "loss": 0.6581, "num_input_tokens_seen": 63923312, "step": 110100 }, { "epoch": 16.39931486446232, "grad_norm": 2.1631133556365967, "learning_rate": 4.776714512376474e-06, "loss": 0.5816, "num_input_tokens_seen": 63926032, "step": 110105 }, { "epoch": 16.400059577003276, "grad_norm": 1.3697222471237183, "learning_rate": 4.774804338527639e-06, "loss": 0.4894, "num_input_tokens_seen": 63928944, "step": 110110 }, { "epoch": 16.400804289544237, "grad_norm": 2.7834300994873047, "learning_rate": 4.772894506362924e-06, "loss": 0.6022, "num_input_tokens_seen": 63931824, "step": 110115 }, { "epoch": 16.401549002085194, "grad_norm": 1.511201024055481, "learning_rate": 4.770985015914603e-06, "loss": 0.5887, "num_input_tokens_seen": 63934672, "step": 110120 }, { "epoch": 16.402293714626154, "grad_norm": 1.5429489612579346, "learning_rate": 4.769075867214931e-06, "loss": 0.5229, "num_input_tokens_seen": 63937488, "step": 110125 }, { "epoch": 16.403038427167115, "grad_norm": 2.5474162101745605, "learning_rate": 4.767167060296163e-06, "loss": 0.6128, "num_input_tokens_seen": 63940624, "step": 110130 }, { "epoch": 16.40378313970807, "grad_norm": 1.2840937376022339, "learning_rate": 4.7652585951905415e-06, "loss": 0.5826, "num_input_tokens_seen": 63943792, "step": 110135 }, { "epoch": 16.404527852249032, "grad_norm": 2.1692698001861572, "learning_rate": 4.763350471930303e-06, "loss": 0.643, "num_input_tokens_seen": 63946544, "step": 110140 }, { "epoch": 16.405272564789993, "grad_norm": 1.2512949705123901, "learning_rate": 4.761442690547699e-06, "loss": 0.4187, "num_input_tokens_seen": 63949648, "step": 110145 }, { "epoch": 16.40601727733095, "grad_norm": 1.4970166683197021, "learning_rate": 4.759535251074942e-06, "loss": 0.5861, "num_input_tokens_seen": 63952720, "step": 110150 }, { "epoch": 16.40676198987191, "grad_norm": 1.317933201789856, "learning_rate": 4.7576281535442745e-06, "loss": 0.6706, "num_input_tokens_seen": 63955984, "step": 110155 }, { "epoch": 16.407506702412867, "grad_norm": 1.3782751560211182, "learning_rate": 4.755721397987906e-06, "loss": 0.5377, "num_input_tokens_seen": 63959184, "step": 110160 }, { "epoch": 16.408251414953828, "grad_norm": 2.0130820274353027, "learning_rate": 4.753814984438043e-06, "loss": 0.6073, "num_input_tokens_seen": 63961904, "step": 110165 }, { "epoch": 16.408996127494788, "grad_norm": 3.0952701568603516, "learning_rate": 4.7519089129269026e-06, "loss": 0.5465, "num_input_tokens_seen": 63964496, "step": 110170 }, { "epoch": 16.409740840035745, "grad_norm": 1.3359332084655762, "learning_rate": 4.750003183486676e-06, "loss": 0.5493, "num_input_tokens_seen": 63967312, "step": 110175 }, { "epoch": 16.410485552576706, "grad_norm": 3.368255853652954, "learning_rate": 4.748097796149573e-06, "loss": 0.7606, "num_input_tokens_seen": 63970192, "step": 110180 }, { "epoch": 16.411230265117666, "grad_norm": 3.005089521408081, "learning_rate": 4.746192750947767e-06, "loss": 0.6938, "num_input_tokens_seen": 63973264, "step": 110185 }, { "epoch": 16.411974977658623, "grad_norm": 1.7704346179962158, "learning_rate": 4.744288047913456e-06, "loss": 0.557, "num_input_tokens_seen": 63976144, "step": 110190 }, { "epoch": 16.412719690199584, "grad_norm": 2.0017073154449463, "learning_rate": 4.742383687078811e-06, "loss": 0.3822, "num_input_tokens_seen": 63978864, "step": 110195 }, { "epoch": 16.41346440274054, "grad_norm": 1.2928622961044312, "learning_rate": 4.7404796684760055e-06, "loss": 0.6088, "num_input_tokens_seen": 63981744, "step": 110200 }, { "epoch": 16.4142091152815, "grad_norm": 3.1894052028656006, "learning_rate": 4.738575992137203e-06, "loss": 0.662, "num_input_tokens_seen": 63984880, "step": 110205 }, { "epoch": 16.414953827822462, "grad_norm": 1.8732701539993286, "learning_rate": 4.736672658094562e-06, "loss": 0.5481, "num_input_tokens_seen": 63987760, "step": 110210 }, { "epoch": 16.41569854036342, "grad_norm": 3.9430224895477295, "learning_rate": 4.734769666380248e-06, "loss": 0.5699, "num_input_tokens_seen": 63990576, "step": 110215 }, { "epoch": 16.41644325290438, "grad_norm": 1.5848788022994995, "learning_rate": 4.732867017026396e-06, "loss": 0.5866, "num_input_tokens_seen": 63993616, "step": 110220 }, { "epoch": 16.417187965445336, "grad_norm": 2.688585042953491, "learning_rate": 4.730964710065164e-06, "loss": 0.6199, "num_input_tokens_seen": 63996304, "step": 110225 }, { "epoch": 16.417932677986297, "grad_norm": 1.7444876432418823, "learning_rate": 4.729062745528678e-06, "loss": 0.5254, "num_input_tokens_seen": 63999152, "step": 110230 }, { "epoch": 16.418677390527257, "grad_norm": 1.3571736812591553, "learning_rate": 4.727161123449078e-06, "loss": 0.6279, "num_input_tokens_seen": 64002032, "step": 110235 }, { "epoch": 16.419422103068214, "grad_norm": 1.4003604650497437, "learning_rate": 4.72525984385849e-06, "loss": 0.6711, "num_input_tokens_seen": 64004880, "step": 110240 }, { "epoch": 16.420166815609175, "grad_norm": 1.2557642459869385, "learning_rate": 4.7233589067890215e-06, "loss": 0.6458, "num_input_tokens_seen": 64007984, "step": 110245 }, { "epoch": 16.420911528150135, "grad_norm": 2.1366190910339355, "learning_rate": 4.721458312272803e-06, "loss": 0.5074, "num_input_tokens_seen": 64011056, "step": 110250 }, { "epoch": 16.421656240691092, "grad_norm": 1.1809040307998657, "learning_rate": 4.719558060341931e-06, "loss": 0.487, "num_input_tokens_seen": 64014064, "step": 110255 }, { "epoch": 16.422400953232053, "grad_norm": 2.3635621070861816, "learning_rate": 4.717658151028517e-06, "loss": 0.6437, "num_input_tokens_seen": 64017168, "step": 110260 }, { "epoch": 16.42314566577301, "grad_norm": 1.1480093002319336, "learning_rate": 4.715758584364657e-06, "loss": 0.5403, "num_input_tokens_seen": 64020016, "step": 110265 }, { "epoch": 16.42389037831397, "grad_norm": 2.2480883598327637, "learning_rate": 4.713859360382439e-06, "loss": 0.6112, "num_input_tokens_seen": 64022896, "step": 110270 }, { "epoch": 16.42463509085493, "grad_norm": 2.097832679748535, "learning_rate": 4.7119604791139414e-06, "loss": 0.6877, "num_input_tokens_seen": 64026032, "step": 110275 }, { "epoch": 16.425379803395888, "grad_norm": 1.6430355310440063, "learning_rate": 4.7100619405912625e-06, "loss": 0.5208, "num_input_tokens_seen": 64029136, "step": 110280 }, { "epoch": 16.42612451593685, "grad_norm": 2.029186487197876, "learning_rate": 4.708163744846461e-06, "loss": 0.4557, "num_input_tokens_seen": 64031824, "step": 110285 }, { "epoch": 16.42686922847781, "grad_norm": 0.9957593083381653, "learning_rate": 4.706265891911604e-06, "loss": 0.5348, "num_input_tokens_seen": 64034896, "step": 110290 }, { "epoch": 16.427613941018766, "grad_norm": 1.855006217956543, "learning_rate": 4.704368381818766e-06, "loss": 0.6523, "num_input_tokens_seen": 64037808, "step": 110295 }, { "epoch": 16.428358653559727, "grad_norm": 1.0727471113204956, "learning_rate": 4.70247121459999e-06, "loss": 0.5423, "num_input_tokens_seen": 64040848, "step": 110300 }, { "epoch": 16.429103366100684, "grad_norm": 1.320888638496399, "learning_rate": 4.700574390287341e-06, "loss": 0.5186, "num_input_tokens_seen": 64043696, "step": 110305 }, { "epoch": 16.429848078641644, "grad_norm": 2.3251469135284424, "learning_rate": 4.698677908912846e-06, "loss": 0.6712, "num_input_tokens_seen": 64046416, "step": 110310 }, { "epoch": 16.430592791182605, "grad_norm": 1.67600417137146, "learning_rate": 4.696781770508566e-06, "loss": 0.6259, "num_input_tokens_seen": 64049520, "step": 110315 }, { "epoch": 16.43133750372356, "grad_norm": 1.7527297735214233, "learning_rate": 4.694885975106511e-06, "loss": 0.5415, "num_input_tokens_seen": 64052592, "step": 110320 }, { "epoch": 16.432082216264522, "grad_norm": 1.5436006784439087, "learning_rate": 4.6929905227387295e-06, "loss": 0.6279, "num_input_tokens_seen": 64055408, "step": 110325 }, { "epoch": 16.432826928805483, "grad_norm": 1.315399408340454, "learning_rate": 4.691095413437235e-06, "loss": 0.6162, "num_input_tokens_seen": 64058096, "step": 110330 }, { "epoch": 16.43357164134644, "grad_norm": 1.74583101272583, "learning_rate": 4.6892006472340405e-06, "loss": 0.604, "num_input_tokens_seen": 64060784, "step": 110335 }, { "epoch": 16.4343163538874, "grad_norm": 1.0139414072036743, "learning_rate": 4.687306224161159e-06, "loss": 0.6794, "num_input_tokens_seen": 64063568, "step": 110340 }, { "epoch": 16.435061066428357, "grad_norm": 1.1913931369781494, "learning_rate": 4.685412144250586e-06, "loss": 0.7111, "num_input_tokens_seen": 64066224, "step": 110345 }, { "epoch": 16.435805778969318, "grad_norm": 2.376283645629883, "learning_rate": 4.683518407534338e-06, "loss": 0.7485, "num_input_tokens_seen": 64069264, "step": 110350 }, { "epoch": 16.43655049151028, "grad_norm": 1.3704475164413452, "learning_rate": 4.6816250140443884e-06, "loss": 0.5207, "num_input_tokens_seen": 64071856, "step": 110355 }, { "epoch": 16.437295204051235, "grad_norm": 2.3623692989349365, "learning_rate": 4.679731963812742e-06, "loss": 0.6729, "num_input_tokens_seen": 64074704, "step": 110360 }, { "epoch": 16.438039916592196, "grad_norm": 2.1271660327911377, "learning_rate": 4.6778392568713695e-06, "loss": 0.5845, "num_input_tokens_seen": 64077712, "step": 110365 }, { "epoch": 16.438784629133156, "grad_norm": 2.6868128776550293, "learning_rate": 4.675946893252242e-06, "loss": 0.591, "num_input_tokens_seen": 64080656, "step": 110370 }, { "epoch": 16.439529341674113, "grad_norm": 1.308694839477539, "learning_rate": 4.674054872987344e-06, "loss": 0.4639, "num_input_tokens_seen": 64083600, "step": 110375 }, { "epoch": 16.440274054215074, "grad_norm": 1.8855572938919067, "learning_rate": 4.67216319610862e-06, "loss": 0.8435, "num_input_tokens_seen": 64086672, "step": 110380 }, { "epoch": 16.44101876675603, "grad_norm": 1.1153064966201782, "learning_rate": 4.670271862648049e-06, "loss": 0.5183, "num_input_tokens_seen": 64089552, "step": 110385 }, { "epoch": 16.44176347929699, "grad_norm": 1.205429196357727, "learning_rate": 4.668380872637562e-06, "loss": 0.5979, "num_input_tokens_seen": 64092304, "step": 110390 }, { "epoch": 16.442508191837952, "grad_norm": 1.3186228275299072, "learning_rate": 4.666490226109127e-06, "loss": 0.6123, "num_input_tokens_seen": 64095216, "step": 110395 }, { "epoch": 16.44325290437891, "grad_norm": 1.236075520515442, "learning_rate": 4.66459992309467e-06, "loss": 0.4898, "num_input_tokens_seen": 64098192, "step": 110400 }, { "epoch": 16.44399761691987, "grad_norm": 0.7380169034004211, "learning_rate": 4.662709963626133e-06, "loss": 0.4944, "num_input_tokens_seen": 64101040, "step": 110405 }, { "epoch": 16.44474232946083, "grad_norm": 1.9247149229049683, "learning_rate": 4.660820347735437e-06, "loss": 0.4908, "num_input_tokens_seen": 64104112, "step": 110410 }, { "epoch": 16.445487042001787, "grad_norm": 2.0068747997283936, "learning_rate": 4.658931075454507e-06, "loss": 0.5425, "num_input_tokens_seen": 64107088, "step": 110415 }, { "epoch": 16.446231754542747, "grad_norm": 0.9025630950927734, "learning_rate": 4.657042146815266e-06, "loss": 0.5032, "num_input_tokens_seen": 64110000, "step": 110420 }, { "epoch": 16.446976467083704, "grad_norm": 1.4815647602081299, "learning_rate": 4.655153561849618e-06, "loss": 0.3946, "num_input_tokens_seen": 64112752, "step": 110425 }, { "epoch": 16.447721179624665, "grad_norm": 2.588226318359375, "learning_rate": 4.6532653205894786e-06, "loss": 0.5752, "num_input_tokens_seen": 64115568, "step": 110430 }, { "epoch": 16.448465892165625, "grad_norm": 1.9125727415084839, "learning_rate": 4.651377423066736e-06, "loss": 0.6543, "num_input_tokens_seen": 64118320, "step": 110435 }, { "epoch": 16.449210604706582, "grad_norm": 1.3408317565917969, "learning_rate": 4.649489869313295e-06, "loss": 0.5513, "num_input_tokens_seen": 64121072, "step": 110440 }, { "epoch": 16.449955317247543, "grad_norm": 1.1951467990875244, "learning_rate": 4.647602659361042e-06, "loss": 0.4378, "num_input_tokens_seen": 64124016, "step": 110445 }, { "epoch": 16.4507000297885, "grad_norm": 1.4560208320617676, "learning_rate": 4.645715793241848e-06, "loss": 0.4678, "num_input_tokens_seen": 64127280, "step": 110450 }, { "epoch": 16.45144474232946, "grad_norm": 1.5308634042739868, "learning_rate": 4.6438292709876065e-06, "loss": 0.5033, "num_input_tokens_seen": 64130192, "step": 110455 }, { "epoch": 16.45218945487042, "grad_norm": 1.7694069147109985, "learning_rate": 4.64194309263018e-06, "loss": 0.695, "num_input_tokens_seen": 64133360, "step": 110460 }, { "epoch": 16.452934167411378, "grad_norm": 2.6673226356506348, "learning_rate": 4.6400572582014325e-06, "loss": 0.364, "num_input_tokens_seen": 64136560, "step": 110465 }, { "epoch": 16.45367887995234, "grad_norm": 2.8043949604034424, "learning_rate": 4.638171767733221e-06, "loss": 0.5959, "num_input_tokens_seen": 64139760, "step": 110470 }, { "epoch": 16.4544235924933, "grad_norm": 1.411317229270935, "learning_rate": 4.636286621257407e-06, "loss": 0.3289, "num_input_tokens_seen": 64142672, "step": 110475 }, { "epoch": 16.455168305034256, "grad_norm": 2.5088608264923096, "learning_rate": 4.634401818805828e-06, "loss": 0.5437, "num_input_tokens_seen": 64145488, "step": 110480 }, { "epoch": 16.455913017575217, "grad_norm": 1.491420030593872, "learning_rate": 4.632517360410338e-06, "loss": 0.4682, "num_input_tokens_seen": 64148464, "step": 110485 }, { "epoch": 16.456657730116174, "grad_norm": 0.8000432848930359, "learning_rate": 4.630633246102767e-06, "loss": 0.4891, "num_input_tokens_seen": 64151376, "step": 110490 }, { "epoch": 16.457402442657134, "grad_norm": 2.115973711013794, "learning_rate": 4.62874947591494e-06, "loss": 0.7849, "num_input_tokens_seen": 64154352, "step": 110495 }, { "epoch": 16.458147155198095, "grad_norm": 1.032375693321228, "learning_rate": 4.62686604987869e-06, "loss": 0.5696, "num_input_tokens_seen": 64157232, "step": 110500 }, { "epoch": 16.45889186773905, "grad_norm": 1.0765970945358276, "learning_rate": 4.624982968025826e-06, "loss": 0.464, "num_input_tokens_seen": 64160400, "step": 110505 }, { "epoch": 16.459636580280012, "grad_norm": 2.0160796642303467, "learning_rate": 4.623100230388172e-06, "loss": 0.6227, "num_input_tokens_seen": 64163088, "step": 110510 }, { "epoch": 16.460381292820973, "grad_norm": 1.7006711959838867, "learning_rate": 4.621217836997524e-06, "loss": 0.5972, "num_input_tokens_seen": 64166000, "step": 110515 }, { "epoch": 16.46112600536193, "grad_norm": 1.0814110040664673, "learning_rate": 4.619335787885695e-06, "loss": 0.4704, "num_input_tokens_seen": 64169040, "step": 110520 }, { "epoch": 16.46187071790289, "grad_norm": 2.221127510070801, "learning_rate": 4.617454083084474e-06, "loss": 0.54, "num_input_tokens_seen": 64171696, "step": 110525 }, { "epoch": 16.462615430443847, "grad_norm": 1.5882532596588135, "learning_rate": 4.615572722625649e-06, "loss": 0.673, "num_input_tokens_seen": 64174672, "step": 110530 }, { "epoch": 16.463360142984808, "grad_norm": 3.1203041076660156, "learning_rate": 4.6136917065410065e-06, "loss": 0.7222, "num_input_tokens_seen": 64177488, "step": 110535 }, { "epoch": 16.46410485552577, "grad_norm": 2.5018227100372314, "learning_rate": 4.611811034862318e-06, "loss": 0.5028, "num_input_tokens_seen": 64180592, "step": 110540 }, { "epoch": 16.464849568066725, "grad_norm": 1.459936499595642, "learning_rate": 4.609930707621366e-06, "loss": 0.5794, "num_input_tokens_seen": 64183376, "step": 110545 }, { "epoch": 16.465594280607686, "grad_norm": 2.316312313079834, "learning_rate": 4.608050724849902e-06, "loss": 0.4623, "num_input_tokens_seen": 64186160, "step": 110550 }, { "epoch": 16.466338993148646, "grad_norm": 1.6306198835372925, "learning_rate": 4.6061710865797055e-06, "loss": 0.5157, "num_input_tokens_seen": 64188976, "step": 110555 }, { "epoch": 16.467083705689603, "grad_norm": 1.7298026084899902, "learning_rate": 4.604291792842513e-06, "loss": 0.493, "num_input_tokens_seen": 64192368, "step": 110560 }, { "epoch": 16.467828418230564, "grad_norm": 1.7238049507141113, "learning_rate": 4.602412843670087e-06, "loss": 0.6522, "num_input_tokens_seen": 64194960, "step": 110565 }, { "epoch": 16.46857313077152, "grad_norm": 1.1918545961380005, "learning_rate": 4.600534239094165e-06, "loss": 0.5719, "num_input_tokens_seen": 64197968, "step": 110570 }, { "epoch": 16.46931784331248, "grad_norm": 2.0548181533813477, "learning_rate": 4.598655979146479e-06, "loss": 0.6231, "num_input_tokens_seen": 64200720, "step": 110575 }, { "epoch": 16.470062555853442, "grad_norm": 2.171633243560791, "learning_rate": 4.59677806385877e-06, "loss": 0.5962, "num_input_tokens_seen": 64203568, "step": 110580 }, { "epoch": 16.4708072683944, "grad_norm": 2.743119478225708, "learning_rate": 4.5949004932627545e-06, "loss": 0.5274, "num_input_tokens_seen": 64206448, "step": 110585 }, { "epoch": 16.47155198093536, "grad_norm": 1.3607162237167358, "learning_rate": 4.593023267390162e-06, "loss": 0.7854, "num_input_tokens_seen": 64209488, "step": 110590 }, { "epoch": 16.472296693476316, "grad_norm": 2.02424693107605, "learning_rate": 4.591146386272699e-06, "loss": 0.7, "num_input_tokens_seen": 64212432, "step": 110595 }, { "epoch": 16.473041406017277, "grad_norm": 2.0175907611846924, "learning_rate": 4.5892698499420764e-06, "loss": 0.4745, "num_input_tokens_seen": 64215568, "step": 110600 }, { "epoch": 16.473786118558237, "grad_norm": 1.4350898265838623, "learning_rate": 4.5873936584299946e-06, "loss": 0.4059, "num_input_tokens_seen": 64218512, "step": 110605 }, { "epoch": 16.474530831099194, "grad_norm": 2.7094058990478516, "learning_rate": 4.5855178117681444e-06, "loss": 0.6355, "num_input_tokens_seen": 64221520, "step": 110610 }, { "epoch": 16.475275543640155, "grad_norm": 2.5269994735717773, "learning_rate": 4.583642309988229e-06, "loss": 0.7709, "num_input_tokens_seen": 64224176, "step": 110615 }, { "epoch": 16.476020256181116, "grad_norm": 1.6489105224609375, "learning_rate": 4.581767153121922e-06, "loss": 0.571, "num_input_tokens_seen": 64226896, "step": 110620 }, { "epoch": 16.476764968722073, "grad_norm": 1.2096503973007202, "learning_rate": 4.579892341200911e-06, "loss": 0.6195, "num_input_tokens_seen": 64229904, "step": 110625 }, { "epoch": 16.477509681263033, "grad_norm": 1.740118145942688, "learning_rate": 4.578017874256857e-06, "loss": 0.5837, "num_input_tokens_seen": 64232816, "step": 110630 }, { "epoch": 16.47825439380399, "grad_norm": 1.202361822128296, "learning_rate": 4.5761437523214435e-06, "loss": 0.6213, "num_input_tokens_seen": 64236208, "step": 110635 }, { "epoch": 16.47899910634495, "grad_norm": 2.009883165359497, "learning_rate": 4.574269975426318e-06, "loss": 0.6236, "num_input_tokens_seen": 64239344, "step": 110640 }, { "epoch": 16.47974381888591, "grad_norm": 0.9040000438690186, "learning_rate": 4.572396543603147e-06, "loss": 0.6169, "num_input_tokens_seen": 64242096, "step": 110645 }, { "epoch": 16.480488531426868, "grad_norm": 2.4813685417175293, "learning_rate": 4.570523456883574e-06, "loss": 0.775, "num_input_tokens_seen": 64244880, "step": 110650 }, { "epoch": 16.48123324396783, "grad_norm": 1.285569667816162, "learning_rate": 4.568650715299236e-06, "loss": 0.5574, "num_input_tokens_seen": 64247536, "step": 110655 }, { "epoch": 16.48197795650879, "grad_norm": 1.1825791597366333, "learning_rate": 4.566778318881787e-06, "loss": 0.6189, "num_input_tokens_seen": 64250736, "step": 110660 }, { "epoch": 16.482722669049746, "grad_norm": 2.206993579864502, "learning_rate": 4.56490626766285e-06, "loss": 0.6034, "num_input_tokens_seen": 64253648, "step": 110665 }, { "epoch": 16.483467381590707, "grad_norm": 2.002319574356079, "learning_rate": 4.563034561674054e-06, "loss": 0.5828, "num_input_tokens_seen": 64256368, "step": 110670 }, { "epoch": 16.484212094131664, "grad_norm": 1.7717300653457642, "learning_rate": 4.561163200947008e-06, "loss": 0.7639, "num_input_tokens_seen": 64259280, "step": 110675 }, { "epoch": 16.484956806672624, "grad_norm": 1.110838532447815, "learning_rate": 4.559292185513347e-06, "loss": 0.5679, "num_input_tokens_seen": 64262160, "step": 110680 }, { "epoch": 16.485701519213585, "grad_norm": 1.8580867052078247, "learning_rate": 4.557421515404667e-06, "loss": 0.6566, "num_input_tokens_seen": 64265040, "step": 110685 }, { "epoch": 16.48644623175454, "grad_norm": 1.568708062171936, "learning_rate": 4.555551190652568e-06, "loss": 0.6916, "num_input_tokens_seen": 64267952, "step": 110690 }, { "epoch": 16.487190944295502, "grad_norm": 1.7324199676513672, "learning_rate": 4.55368121128866e-06, "loss": 0.4585, "num_input_tokens_seen": 64271184, "step": 110695 }, { "epoch": 16.487935656836463, "grad_norm": 0.8893350958824158, "learning_rate": 4.55181157734452e-06, "loss": 0.4589, "num_input_tokens_seen": 64273904, "step": 110700 }, { "epoch": 16.48868036937742, "grad_norm": 1.3630595207214355, "learning_rate": 4.549942288851747e-06, "loss": 0.5819, "num_input_tokens_seen": 64276880, "step": 110705 }, { "epoch": 16.48942508191838, "grad_norm": 0.8954018950462341, "learning_rate": 4.5480733458419074e-06, "loss": 0.5901, "num_input_tokens_seen": 64279984, "step": 110710 }, { "epoch": 16.490169794459337, "grad_norm": 1.337020993232727, "learning_rate": 4.5462047483465886e-06, "loss": 0.4965, "num_input_tokens_seen": 64283056, "step": 110715 }, { "epoch": 16.490914507000298, "grad_norm": 1.455444574356079, "learning_rate": 4.5443364963973475e-06, "loss": 0.5471, "num_input_tokens_seen": 64285680, "step": 110720 }, { "epoch": 16.49165921954126, "grad_norm": 1.9014112949371338, "learning_rate": 4.542468590025756e-06, "loss": 0.4817, "num_input_tokens_seen": 64288272, "step": 110725 }, { "epoch": 16.492403932082215, "grad_norm": 1.1091338396072388, "learning_rate": 4.540601029263367e-06, "loss": 0.4287, "num_input_tokens_seen": 64290928, "step": 110730 }, { "epoch": 16.493148644623176, "grad_norm": 1.420145869255066, "learning_rate": 4.538733814141729e-06, "loss": 0.4189, "num_input_tokens_seen": 64293872, "step": 110735 }, { "epoch": 16.493893357164133, "grad_norm": 1.872459888458252, "learning_rate": 4.536866944692386e-06, "loss": 0.777, "num_input_tokens_seen": 64296944, "step": 110740 }, { "epoch": 16.494638069705093, "grad_norm": 1.774596095085144, "learning_rate": 4.535000420946875e-06, "loss": 0.5833, "num_input_tokens_seen": 64299856, "step": 110745 }, { "epoch": 16.495382782246054, "grad_norm": 1.8312616348266602, "learning_rate": 4.533134242936735e-06, "loss": 0.5705, "num_input_tokens_seen": 64302768, "step": 110750 }, { "epoch": 16.49612749478701, "grad_norm": 1.4316688776016235, "learning_rate": 4.531268410693488e-06, "loss": 0.4376, "num_input_tokens_seen": 64305936, "step": 110755 }, { "epoch": 16.49687220732797, "grad_norm": 1.0015548467636108, "learning_rate": 4.52940292424866e-06, "loss": 0.5235, "num_input_tokens_seen": 64308880, "step": 110760 }, { "epoch": 16.497616919868932, "grad_norm": 1.1128342151641846, "learning_rate": 4.527537783633764e-06, "loss": 0.5113, "num_input_tokens_seen": 64311952, "step": 110765 }, { "epoch": 16.49836163240989, "grad_norm": 1.623652458190918, "learning_rate": 4.525672988880308e-06, "loss": 0.6208, "num_input_tokens_seen": 64314672, "step": 110770 }, { "epoch": 16.49910634495085, "grad_norm": 2.7052557468414307, "learning_rate": 4.5238085400198e-06, "loss": 0.5804, "num_input_tokens_seen": 64317552, "step": 110775 }, { "epoch": 16.499851057491806, "grad_norm": 0.9066700339317322, "learning_rate": 4.521944437083731e-06, "loss": 0.5886, "num_input_tokens_seen": 64320464, "step": 110780 }, { "epoch": 16.500595770032767, "grad_norm": 1.7778903245925903, "learning_rate": 4.520080680103603e-06, "loss": 0.5451, "num_input_tokens_seen": 64323472, "step": 110785 }, { "epoch": 16.501340482573728, "grad_norm": 1.5861097574234009, "learning_rate": 4.5182172691108996e-06, "loss": 0.6022, "num_input_tokens_seen": 64326224, "step": 110790 }, { "epoch": 16.502085195114685, "grad_norm": 2.189072608947754, "learning_rate": 4.5163542041370965e-06, "loss": 0.5672, "num_input_tokens_seen": 64329104, "step": 110795 }, { "epoch": 16.502829907655645, "grad_norm": 1.5181928873062134, "learning_rate": 4.514491485213665e-06, "loss": 0.698, "num_input_tokens_seen": 64332176, "step": 110800 }, { "epoch": 16.503574620196606, "grad_norm": 1.9557304382324219, "learning_rate": 4.512629112372085e-06, "loss": 0.5457, "num_input_tokens_seen": 64335184, "step": 110805 }, { "epoch": 16.504319332737563, "grad_norm": 4.83226203918457, "learning_rate": 4.510767085643814e-06, "loss": 0.7004, "num_input_tokens_seen": 64338032, "step": 110810 }, { "epoch": 16.505064045278523, "grad_norm": 2.2364773750305176, "learning_rate": 4.508905405060301e-06, "loss": 0.5516, "num_input_tokens_seen": 64340784, "step": 110815 }, { "epoch": 16.50580875781948, "grad_norm": 1.5417159795761108, "learning_rate": 4.5070440706530135e-06, "loss": 0.5521, "num_input_tokens_seen": 64343664, "step": 110820 }, { "epoch": 16.50655347036044, "grad_norm": 1.7099558115005493, "learning_rate": 4.505183082453382e-06, "loss": 0.5261, "num_input_tokens_seen": 64346672, "step": 110825 }, { "epoch": 16.5072981829014, "grad_norm": 2.0234761238098145, "learning_rate": 4.503322440492858e-06, "loss": 0.7129, "num_input_tokens_seen": 64349616, "step": 110830 }, { "epoch": 16.508042895442358, "grad_norm": 1.6739373207092285, "learning_rate": 4.501462144802862e-06, "loss": 0.44, "num_input_tokens_seen": 64352816, "step": 110835 }, { "epoch": 16.50878760798332, "grad_norm": 2.4534826278686523, "learning_rate": 4.4996021954148375e-06, "loss": 0.5559, "num_input_tokens_seen": 64355856, "step": 110840 }, { "epoch": 16.50953232052428, "grad_norm": 1.3590153455734253, "learning_rate": 4.497742592360196e-06, "loss": 0.4588, "num_input_tokens_seen": 64359120, "step": 110845 }, { "epoch": 16.510277033065236, "grad_norm": 2.2214279174804688, "learning_rate": 4.495883335670351e-06, "loss": 0.656, "num_input_tokens_seen": 64361616, "step": 110850 }, { "epoch": 16.511021745606197, "grad_norm": 1.0985163450241089, "learning_rate": 4.494024425376722e-06, "loss": 0.4964, "num_input_tokens_seen": 64364624, "step": 110855 }, { "epoch": 16.511766458147154, "grad_norm": 1.5389539003372192, "learning_rate": 4.4921658615107106e-06, "loss": 0.6748, "num_input_tokens_seen": 64367472, "step": 110860 }, { "epoch": 16.512511170688114, "grad_norm": 4.351070880889893, "learning_rate": 4.490307644103717e-06, "loss": 0.5563, "num_input_tokens_seen": 64370224, "step": 110865 }, { "epoch": 16.513255883229075, "grad_norm": 1.1289451122283936, "learning_rate": 4.48844977318712e-06, "loss": 0.5213, "num_input_tokens_seen": 64373008, "step": 110870 }, { "epoch": 16.51400059577003, "grad_norm": 0.9167060256004333, "learning_rate": 4.486592248792323e-06, "loss": 0.533, "num_input_tokens_seen": 64375664, "step": 110875 }, { "epoch": 16.514745308310992, "grad_norm": 1.8294404745101929, "learning_rate": 4.484735070950696e-06, "loss": 0.5602, "num_input_tokens_seen": 64378352, "step": 110880 }, { "epoch": 16.515490020851953, "grad_norm": 1.3583065271377563, "learning_rate": 4.482878239693628e-06, "loss": 0.6565, "num_input_tokens_seen": 64381200, "step": 110885 }, { "epoch": 16.51623473339291, "grad_norm": 1.2164506912231445, "learning_rate": 4.481021755052476e-06, "loss": 0.5869, "num_input_tokens_seen": 64384304, "step": 110890 }, { "epoch": 16.51697944593387, "grad_norm": 1.9014825820922852, "learning_rate": 4.479165617058603e-06, "loss": 0.5987, "num_input_tokens_seen": 64387024, "step": 110895 }, { "epoch": 16.517724158474827, "grad_norm": 1.103971004486084, "learning_rate": 4.4773098257433754e-06, "loss": 0.4564, "num_input_tokens_seen": 64389904, "step": 110900 }, { "epoch": 16.518468871015788, "grad_norm": 1.57120680809021, "learning_rate": 4.4754543811381335e-06, "loss": 0.6991, "num_input_tokens_seen": 64392880, "step": 110905 }, { "epoch": 16.51921358355675, "grad_norm": 1.4694072008132935, "learning_rate": 4.473599283274235e-06, "loss": 0.6859, "num_input_tokens_seen": 64395632, "step": 110910 }, { "epoch": 16.519958296097705, "grad_norm": 2.3598036766052246, "learning_rate": 4.471744532183012e-06, "loss": 0.7044, "num_input_tokens_seen": 64398608, "step": 110915 }, { "epoch": 16.520703008638666, "grad_norm": 1.5707073211669922, "learning_rate": 4.469890127895804e-06, "loss": 0.5381, "num_input_tokens_seen": 64401264, "step": 110920 }, { "epoch": 16.521447721179626, "grad_norm": 1.9780027866363525, "learning_rate": 4.468036070443938e-06, "loss": 0.5087, "num_input_tokens_seen": 64404304, "step": 110925 }, { "epoch": 16.522192433720583, "grad_norm": 2.1408708095550537, "learning_rate": 4.466182359858734e-06, "loss": 0.5698, "num_input_tokens_seen": 64407248, "step": 110930 }, { "epoch": 16.522937146261544, "grad_norm": 2.128277540206909, "learning_rate": 4.4643289961715076e-06, "loss": 0.7556, "num_input_tokens_seen": 64410256, "step": 110935 }, { "epoch": 16.5236818588025, "grad_norm": 0.6202754974365234, "learning_rate": 4.462475979413569e-06, "loss": 0.7827, "num_input_tokens_seen": 64413200, "step": 110940 }, { "epoch": 16.52442657134346, "grad_norm": 1.6993260383605957, "learning_rate": 4.46062330961623e-06, "loss": 0.554, "num_input_tokens_seen": 64415984, "step": 110945 }, { "epoch": 16.525171283884422, "grad_norm": 2.018415927886963, "learning_rate": 4.458770986810776e-06, "loss": 0.5289, "num_input_tokens_seen": 64418800, "step": 110950 }, { "epoch": 16.52591599642538, "grad_norm": 2.4303219318389893, "learning_rate": 4.456919011028518e-06, "loss": 0.6758, "num_input_tokens_seen": 64421552, "step": 110955 }, { "epoch": 16.52666070896634, "grad_norm": 2.3485770225524902, "learning_rate": 4.4550673823007284e-06, "loss": 0.6637, "num_input_tokens_seen": 64424400, "step": 110960 }, { "epoch": 16.527405421507297, "grad_norm": 2.2603745460510254, "learning_rate": 4.4532161006587e-06, "loss": 0.5185, "num_input_tokens_seen": 64427216, "step": 110965 }, { "epoch": 16.528150134048257, "grad_norm": 3.168999195098877, "learning_rate": 4.4513651661337e-06, "loss": 0.6401, "num_input_tokens_seen": 64430032, "step": 110970 }, { "epoch": 16.528894846589218, "grad_norm": 1.4569323062896729, "learning_rate": 4.449514578757e-06, "loss": 0.4601, "num_input_tokens_seen": 64432976, "step": 110975 }, { "epoch": 16.529639559130175, "grad_norm": 2.3034329414367676, "learning_rate": 4.447664338559867e-06, "loss": 0.6833, "num_input_tokens_seen": 64435664, "step": 110980 }, { "epoch": 16.530384271671135, "grad_norm": 0.9273971319198608, "learning_rate": 4.445814445573551e-06, "loss": 0.5717, "num_input_tokens_seen": 64438352, "step": 110985 }, { "epoch": 16.531128984212096, "grad_norm": 3.029362440109253, "learning_rate": 4.443964899829317e-06, "loss": 0.4621, "num_input_tokens_seen": 64441360, "step": 110990 }, { "epoch": 16.531873696753053, "grad_norm": 1.714962363243103, "learning_rate": 4.442115701358401e-06, "loss": 0.5441, "num_input_tokens_seen": 64444528, "step": 110995 }, { "epoch": 16.532618409294013, "grad_norm": 1.5868468284606934, "learning_rate": 4.440266850192049e-06, "loss": 0.6471, "num_input_tokens_seen": 64447440, "step": 111000 }, { "epoch": 16.53336312183497, "grad_norm": 1.1468877792358398, "learning_rate": 4.4384183463614865e-06, "loss": 0.3833, "num_input_tokens_seen": 64450352, "step": 111005 }, { "epoch": 16.53410783437593, "grad_norm": 1.9041231870651245, "learning_rate": 4.436570189897951e-06, "loss": 0.5319, "num_input_tokens_seen": 64453072, "step": 111010 }, { "epoch": 16.53485254691689, "grad_norm": 1.3861656188964844, "learning_rate": 4.434722380832665e-06, "loss": 0.5482, "num_input_tokens_seen": 64456240, "step": 111015 }, { "epoch": 16.535597259457848, "grad_norm": 1.5233170986175537, "learning_rate": 4.432874919196836e-06, "loss": 0.6246, "num_input_tokens_seen": 64459088, "step": 111020 }, { "epoch": 16.53634197199881, "grad_norm": 4.136610507965088, "learning_rate": 4.4310278050216895e-06, "loss": 0.5925, "num_input_tokens_seen": 64461776, "step": 111025 }, { "epoch": 16.53708668453977, "grad_norm": 1.445144534111023, "learning_rate": 4.429181038338415e-06, "loss": 0.6221, "num_input_tokens_seen": 64464624, "step": 111030 }, { "epoch": 16.537831397080726, "grad_norm": 1.805910587310791, "learning_rate": 4.427334619178225e-06, "loss": 0.836, "num_input_tokens_seen": 64467280, "step": 111035 }, { "epoch": 16.538576109621687, "grad_norm": 1.0546715259552002, "learning_rate": 4.425488547572304e-06, "loss": 0.3801, "num_input_tokens_seen": 64470032, "step": 111040 }, { "epoch": 16.539320822162644, "grad_norm": 1.7073887586593628, "learning_rate": 4.4236428235518465e-06, "loss": 0.6017, "num_input_tokens_seen": 64473104, "step": 111045 }, { "epoch": 16.540065534703604, "grad_norm": 1.0309971570968628, "learning_rate": 4.421797447148032e-06, "loss": 0.4391, "num_input_tokens_seen": 64475824, "step": 111050 }, { "epoch": 16.540810247244565, "grad_norm": 1.9259471893310547, "learning_rate": 4.419952418392029e-06, "loss": 0.742, "num_input_tokens_seen": 64478768, "step": 111055 }, { "epoch": 16.541554959785522, "grad_norm": 1.2053622007369995, "learning_rate": 4.418107737315019e-06, "loss": 0.5325, "num_input_tokens_seen": 64481616, "step": 111060 }, { "epoch": 16.542299672326482, "grad_norm": 1.6452361345291138, "learning_rate": 4.416263403948159e-06, "loss": 0.6752, "num_input_tokens_seen": 64484528, "step": 111065 }, { "epoch": 16.543044384867443, "grad_norm": 3.055117607116699, "learning_rate": 4.41441941832261e-06, "loss": 0.6536, "num_input_tokens_seen": 64487664, "step": 111070 }, { "epoch": 16.5437890974084, "grad_norm": 2.624192953109741, "learning_rate": 4.412575780469516e-06, "loss": 0.6402, "num_input_tokens_seen": 64490544, "step": 111075 }, { "epoch": 16.54453380994936, "grad_norm": 2.4071359634399414, "learning_rate": 4.410732490420036e-06, "loss": 0.7261, "num_input_tokens_seen": 64493872, "step": 111080 }, { "epoch": 16.545278522490317, "grad_norm": 1.941985845565796, "learning_rate": 4.4088895482053e-06, "loss": 0.6204, "num_input_tokens_seen": 64496880, "step": 111085 }, { "epoch": 16.546023235031278, "grad_norm": 3.7631399631500244, "learning_rate": 4.40704695385645e-06, "loss": 0.6923, "num_input_tokens_seen": 64499536, "step": 111090 }, { "epoch": 16.54676794757224, "grad_norm": 1.0449823141098022, "learning_rate": 4.405204707404614e-06, "loss": 0.7003, "num_input_tokens_seen": 64502416, "step": 111095 }, { "epoch": 16.547512660113195, "grad_norm": 1.8422268629074097, "learning_rate": 4.403362808880909e-06, "loss": 0.468, "num_input_tokens_seen": 64505264, "step": 111100 }, { "epoch": 16.548257372654156, "grad_norm": 1.466568946838379, "learning_rate": 4.40152125831646e-06, "loss": 0.4881, "num_input_tokens_seen": 64508272, "step": 111105 }, { "epoch": 16.549002085195113, "grad_norm": 1.1449742317199707, "learning_rate": 4.3996800557423665e-06, "loss": 0.6846, "num_input_tokens_seen": 64511120, "step": 111110 }, { "epoch": 16.549746797736073, "grad_norm": 3.359710693359375, "learning_rate": 4.397839201189749e-06, "loss": 0.5587, "num_input_tokens_seen": 64514608, "step": 111115 }, { "epoch": 16.550491510277034, "grad_norm": 2.2343616485595703, "learning_rate": 4.395998694689699e-06, "loss": 0.657, "num_input_tokens_seen": 64517424, "step": 111120 }, { "epoch": 16.55123622281799, "grad_norm": 1.6792826652526855, "learning_rate": 4.39415853627331e-06, "loss": 0.636, "num_input_tokens_seen": 64520304, "step": 111125 }, { "epoch": 16.55198093535895, "grad_norm": 1.9745304584503174, "learning_rate": 4.3923187259716615e-06, "loss": 0.5112, "num_input_tokens_seen": 64522832, "step": 111130 }, { "epoch": 16.552725647899912, "grad_norm": 2.854250907897949, "learning_rate": 4.390479263815852e-06, "loss": 0.5695, "num_input_tokens_seen": 64525744, "step": 111135 }, { "epoch": 16.55347036044087, "grad_norm": 2.6805686950683594, "learning_rate": 4.388640149836948e-06, "loss": 0.6117, "num_input_tokens_seen": 64528688, "step": 111140 }, { "epoch": 16.55421507298183, "grad_norm": 1.8175222873687744, "learning_rate": 4.3868013840660135e-06, "loss": 0.7119, "num_input_tokens_seen": 64531632, "step": 111145 }, { "epoch": 16.554959785522787, "grad_norm": 0.9710594415664673, "learning_rate": 4.3849629665341255e-06, "loss": 0.6406, "num_input_tokens_seen": 64534384, "step": 111150 }, { "epoch": 16.555704498063747, "grad_norm": 1.9263468980789185, "learning_rate": 4.383124897272331e-06, "loss": 0.5619, "num_input_tokens_seen": 64537008, "step": 111155 }, { "epoch": 16.556449210604708, "grad_norm": 3.0357587337493896, "learning_rate": 4.381287176311694e-06, "loss": 0.6877, "num_input_tokens_seen": 64539440, "step": 111160 }, { "epoch": 16.557193923145665, "grad_norm": 1.1933703422546387, "learning_rate": 4.379449803683247e-06, "loss": 0.5441, "num_input_tokens_seen": 64542224, "step": 111165 }, { "epoch": 16.557938635686625, "grad_norm": 1.3399180173873901, "learning_rate": 4.377612779418041e-06, "loss": 0.4391, "num_input_tokens_seen": 64545168, "step": 111170 }, { "epoch": 16.558683348227586, "grad_norm": 2.4461112022399902, "learning_rate": 4.375776103547114e-06, "loss": 0.6321, "num_input_tokens_seen": 64548272, "step": 111175 }, { "epoch": 16.559428060768543, "grad_norm": 1.5316001176834106, "learning_rate": 4.373939776101476e-06, "loss": 0.5126, "num_input_tokens_seen": 64550864, "step": 111180 }, { "epoch": 16.560172773309503, "grad_norm": 2.6877264976501465, "learning_rate": 4.37210379711217e-06, "loss": 0.5734, "num_input_tokens_seen": 64553904, "step": 111185 }, { "epoch": 16.56091748585046, "grad_norm": 1.5487251281738281, "learning_rate": 4.370268166610206e-06, "loss": 0.7271, "num_input_tokens_seen": 64557072, "step": 111190 }, { "epoch": 16.56166219839142, "grad_norm": 2.9066014289855957, "learning_rate": 4.368432884626594e-06, "loss": 0.5866, "num_input_tokens_seen": 64560176, "step": 111195 }, { "epoch": 16.56240691093238, "grad_norm": 1.3069703578948975, "learning_rate": 4.366597951192333e-06, "loss": 0.7289, "num_input_tokens_seen": 64562736, "step": 111200 }, { "epoch": 16.56315162347334, "grad_norm": 1.503767967224121, "learning_rate": 4.364763366338437e-06, "loss": 0.526, "num_input_tokens_seen": 64565328, "step": 111205 }, { "epoch": 16.5638963360143, "grad_norm": 1.1724650859832764, "learning_rate": 4.362929130095888e-06, "loss": 0.573, "num_input_tokens_seen": 64568176, "step": 111210 }, { "epoch": 16.56464104855526, "grad_norm": 2.1284477710723877, "learning_rate": 4.361095242495672e-06, "loss": 0.6118, "num_input_tokens_seen": 64571056, "step": 111215 }, { "epoch": 16.565385761096216, "grad_norm": 1.113999366760254, "learning_rate": 4.359261703568781e-06, "loss": 0.4983, "num_input_tokens_seen": 64573904, "step": 111220 }, { "epoch": 16.566130473637177, "grad_norm": 1.8815616369247437, "learning_rate": 4.357428513346179e-06, "loss": 0.5501, "num_input_tokens_seen": 64576656, "step": 111225 }, { "epoch": 16.566875186178134, "grad_norm": 3.261910915374756, "learning_rate": 4.35559567185885e-06, "loss": 0.5398, "num_input_tokens_seen": 64579376, "step": 111230 }, { "epoch": 16.567619898719094, "grad_norm": 4.762688636779785, "learning_rate": 4.353763179137743e-06, "loss": 0.721, "num_input_tokens_seen": 64582448, "step": 111235 }, { "epoch": 16.568364611260055, "grad_norm": 1.8044475317001343, "learning_rate": 4.351931035213827e-06, "loss": 0.6985, "num_input_tokens_seen": 64585424, "step": 111240 }, { "epoch": 16.569109323801012, "grad_norm": 1.1067721843719482, "learning_rate": 4.350099240118047e-06, "loss": 0.6145, "num_input_tokens_seen": 64588240, "step": 111245 }, { "epoch": 16.569854036341972, "grad_norm": 1.5459073781967163, "learning_rate": 4.348267793881358e-06, "loss": 0.4624, "num_input_tokens_seen": 64591024, "step": 111250 }, { "epoch": 16.57059874888293, "grad_norm": 0.6742302179336548, "learning_rate": 4.346436696534698e-06, "loss": 0.5583, "num_input_tokens_seen": 64593872, "step": 111255 }, { "epoch": 16.57134346142389, "grad_norm": 0.9342301487922668, "learning_rate": 4.344605948108993e-06, "loss": 0.4448, "num_input_tokens_seen": 64596528, "step": 111260 }, { "epoch": 16.57208817396485, "grad_norm": 1.0406239032745361, "learning_rate": 4.342775548635181e-06, "loss": 0.5768, "num_input_tokens_seen": 64599216, "step": 111265 }, { "epoch": 16.572832886505807, "grad_norm": 1.430620789527893, "learning_rate": 4.340945498144175e-06, "loss": 0.4698, "num_input_tokens_seen": 64601808, "step": 111270 }, { "epoch": 16.573577599046768, "grad_norm": 1.6906123161315918, "learning_rate": 4.3391157966669036e-06, "loss": 0.607, "num_input_tokens_seen": 64604560, "step": 111275 }, { "epoch": 16.57432231158773, "grad_norm": 1.5245496034622192, "learning_rate": 4.337286444234265e-06, "loss": 0.5468, "num_input_tokens_seen": 64607536, "step": 111280 }, { "epoch": 16.575067024128685, "grad_norm": 1.1179636716842651, "learning_rate": 4.335457440877177e-06, "loss": 0.5417, "num_input_tokens_seen": 64610224, "step": 111285 }, { "epoch": 16.575811736669646, "grad_norm": 2.048337459564209, "learning_rate": 4.333628786626534e-06, "loss": 0.5654, "num_input_tokens_seen": 64613168, "step": 111290 }, { "epoch": 16.576556449210603, "grad_norm": 1.0443378686904907, "learning_rate": 4.331800481513223e-06, "loss": 0.5446, "num_input_tokens_seen": 64616176, "step": 111295 }, { "epoch": 16.577301161751564, "grad_norm": 1.3221642971038818, "learning_rate": 4.329972525568141e-06, "loss": 0.5208, "num_input_tokens_seen": 64619056, "step": 111300 }, { "epoch": 16.578045874292524, "grad_norm": 1.4456900358200073, "learning_rate": 4.3281449188221605e-06, "loss": 0.5379, "num_input_tokens_seen": 64622064, "step": 111305 }, { "epoch": 16.57879058683348, "grad_norm": 1.1968045234680176, "learning_rate": 4.326317661306168e-06, "loss": 0.5909, "num_input_tokens_seen": 64625232, "step": 111310 }, { "epoch": 16.57953529937444, "grad_norm": 1.5470914840698242, "learning_rate": 4.324490753051019e-06, "loss": 0.6319, "num_input_tokens_seen": 64628048, "step": 111315 }, { "epoch": 16.580280011915402, "grad_norm": 1.7157363891601562, "learning_rate": 4.322664194087591e-06, "loss": 0.5104, "num_input_tokens_seen": 64630896, "step": 111320 }, { "epoch": 16.58102472445636, "grad_norm": 1.2184474468231201, "learning_rate": 4.320837984446738e-06, "loss": 0.5646, "num_input_tokens_seen": 64633456, "step": 111325 }, { "epoch": 16.58176943699732, "grad_norm": 1.08844792842865, "learning_rate": 4.319012124159308e-06, "loss": 0.6019, "num_input_tokens_seen": 64636432, "step": 111330 }, { "epoch": 16.582514149538277, "grad_norm": 2.549882411956787, "learning_rate": 4.317186613256149e-06, "loss": 0.5733, "num_input_tokens_seen": 64639280, "step": 111335 }, { "epoch": 16.583258862079237, "grad_norm": 0.7587395906448364, "learning_rate": 4.3153614517680965e-06, "loss": 0.4791, "num_input_tokens_seen": 64642416, "step": 111340 }, { "epoch": 16.584003574620198, "grad_norm": 1.2588105201721191, "learning_rate": 4.313536639725996e-06, "loss": 0.7793, "num_input_tokens_seen": 64646512, "step": 111345 }, { "epoch": 16.584748287161155, "grad_norm": 1.2456817626953125, "learning_rate": 4.311712177160662e-06, "loss": 0.6551, "num_input_tokens_seen": 64649232, "step": 111350 }, { "epoch": 16.585492999702115, "grad_norm": 1.3438160419464111, "learning_rate": 4.30988806410293e-06, "loss": 0.5911, "num_input_tokens_seen": 64652208, "step": 111355 }, { "epoch": 16.586237712243076, "grad_norm": 1.4103763103485107, "learning_rate": 4.308064300583603e-06, "loss": 0.585, "num_input_tokens_seen": 64654896, "step": 111360 }, { "epoch": 16.586982424784033, "grad_norm": 1.1419001817703247, "learning_rate": 4.3062408866335085e-06, "loss": 0.5877, "num_input_tokens_seen": 64657776, "step": 111365 }, { "epoch": 16.587727137324993, "grad_norm": 2.3731157779693604, "learning_rate": 4.30441782228344e-06, "loss": 0.5558, "num_input_tokens_seen": 64660784, "step": 111370 }, { "epoch": 16.58847184986595, "grad_norm": 1.5140416622161865, "learning_rate": 4.302595107564192e-06, "loss": 0.6897, "num_input_tokens_seen": 64663760, "step": 111375 }, { "epoch": 16.58921656240691, "grad_norm": 1.4104591608047485, "learning_rate": 4.300772742506571e-06, "loss": 0.6263, "num_input_tokens_seen": 64666544, "step": 111380 }, { "epoch": 16.58996127494787, "grad_norm": 1.6680033206939697, "learning_rate": 4.2989507271413515e-06, "loss": 0.5847, "num_input_tokens_seen": 64669552, "step": 111385 }, { "epoch": 16.59070598748883, "grad_norm": 1.2426170110702515, "learning_rate": 4.297129061499324e-06, "loss": 0.5843, "num_input_tokens_seen": 64672528, "step": 111390 }, { "epoch": 16.59145070002979, "grad_norm": 2.391998767852783, "learning_rate": 4.29530774561126e-06, "loss": 0.7395, "num_input_tokens_seen": 64675440, "step": 111395 }, { "epoch": 16.59219541257075, "grad_norm": 1.4402387142181396, "learning_rate": 4.29348677950793e-06, "loss": 0.6216, "num_input_tokens_seen": 64678448, "step": 111400 }, { "epoch": 16.592940125111706, "grad_norm": 1.4508774280548096, "learning_rate": 4.291666163220087e-06, "loss": 0.7676, "num_input_tokens_seen": 64681104, "step": 111405 }, { "epoch": 16.593684837652667, "grad_norm": 1.297204852104187, "learning_rate": 4.289845896778505e-06, "loss": 0.6763, "num_input_tokens_seen": 64683920, "step": 111410 }, { "epoch": 16.594429550193624, "grad_norm": 0.684615433216095, "learning_rate": 4.2880259802139276e-06, "loss": 0.5538, "num_input_tokens_seen": 64686576, "step": 111415 }, { "epoch": 16.595174262734584, "grad_norm": 1.2309008836746216, "learning_rate": 4.286206413557092e-06, "loss": 0.5448, "num_input_tokens_seen": 64689424, "step": 111420 }, { "epoch": 16.595918975275545, "grad_norm": 1.1665736436843872, "learning_rate": 4.284387196838755e-06, "loss": 0.5579, "num_input_tokens_seen": 64692432, "step": 111425 }, { "epoch": 16.596663687816502, "grad_norm": 1.1465489864349365, "learning_rate": 4.282568330089637e-06, "loss": 0.5534, "num_input_tokens_seen": 64695280, "step": 111430 }, { "epoch": 16.597408400357462, "grad_norm": 1.8100556135177612, "learning_rate": 4.280749813340473e-06, "loss": 0.5348, "num_input_tokens_seen": 64698128, "step": 111435 }, { "epoch": 16.598153112898423, "grad_norm": 1.1409574747085571, "learning_rate": 4.278931646621981e-06, "loss": 0.5869, "num_input_tokens_seen": 64700944, "step": 111440 }, { "epoch": 16.59889782543938, "grad_norm": 1.7499693632125854, "learning_rate": 4.2771138299648825e-06, "loss": 0.5053, "num_input_tokens_seen": 64703920, "step": 111445 }, { "epoch": 16.59964253798034, "grad_norm": 1.7766311168670654, "learning_rate": 4.275296363399883e-06, "loss": 0.5159, "num_input_tokens_seen": 64706704, "step": 111450 }, { "epoch": 16.600387250521297, "grad_norm": 1.1327946186065674, "learning_rate": 4.27347924695769e-06, "loss": 0.4491, "num_input_tokens_seen": 64709584, "step": 111455 }, { "epoch": 16.601131963062258, "grad_norm": 0.9582917094230652, "learning_rate": 4.271662480668995e-06, "loss": 0.4259, "num_input_tokens_seen": 64712336, "step": 111460 }, { "epoch": 16.60187667560322, "grad_norm": 1.5077002048492432, "learning_rate": 4.269846064564498e-06, "loss": 0.4806, "num_input_tokens_seen": 64715120, "step": 111465 }, { "epoch": 16.602621388144176, "grad_norm": 0.8427306413650513, "learning_rate": 4.268029998674883e-06, "loss": 0.4636, "num_input_tokens_seen": 64717936, "step": 111470 }, { "epoch": 16.603366100685136, "grad_norm": 1.8254433870315552, "learning_rate": 4.266214283030825e-06, "loss": 0.5051, "num_input_tokens_seen": 64721040, "step": 111475 }, { "epoch": 16.604110813226093, "grad_norm": 1.5651657581329346, "learning_rate": 4.2643989176630095e-06, "loss": 0.5447, "num_input_tokens_seen": 64723856, "step": 111480 }, { "epoch": 16.604855525767054, "grad_norm": 1.8275126218795776, "learning_rate": 4.262583902602094e-06, "loss": 0.6349, "num_input_tokens_seen": 64726608, "step": 111485 }, { "epoch": 16.605600238308014, "grad_norm": 2.1497344970703125, "learning_rate": 4.2607692378787535e-06, "loss": 0.7566, "num_input_tokens_seen": 64729616, "step": 111490 }, { "epoch": 16.60634495084897, "grad_norm": 2.0553481578826904, "learning_rate": 4.258954923523636e-06, "loss": 0.5227, "num_input_tokens_seen": 64732272, "step": 111495 }, { "epoch": 16.60708966338993, "grad_norm": 2.151904821395874, "learning_rate": 4.25714095956739e-06, "loss": 0.6253, "num_input_tokens_seen": 64735056, "step": 111500 }, { "epoch": 16.607834375930892, "grad_norm": 1.420487642288208, "learning_rate": 4.255327346040672e-06, "loss": 0.5959, "num_input_tokens_seen": 64737776, "step": 111505 }, { "epoch": 16.60857908847185, "grad_norm": 1.2855510711669922, "learning_rate": 4.253514082974108e-06, "loss": 0.4714, "num_input_tokens_seen": 64740944, "step": 111510 }, { "epoch": 16.60932380101281, "grad_norm": 1.8570631742477417, "learning_rate": 4.251701170398342e-06, "loss": 0.6471, "num_input_tokens_seen": 64744176, "step": 111515 }, { "epoch": 16.610068513553767, "grad_norm": 1.5602136850357056, "learning_rate": 4.2498886083439995e-06, "loss": 0.4913, "num_input_tokens_seen": 64746960, "step": 111520 }, { "epoch": 16.610813226094727, "grad_norm": 2.6061930656433105, "learning_rate": 4.2480763968416996e-06, "loss": 0.6178, "num_input_tokens_seen": 64749968, "step": 111525 }, { "epoch": 16.611557938635688, "grad_norm": 2.0258970260620117, "learning_rate": 4.246264535922051e-06, "loss": 0.6742, "num_input_tokens_seen": 64753168, "step": 111530 }, { "epoch": 16.612302651176645, "grad_norm": 1.4684253931045532, "learning_rate": 4.244453025615674e-06, "loss": 0.6802, "num_input_tokens_seen": 64756240, "step": 111535 }, { "epoch": 16.613047363717605, "grad_norm": 2.3989391326904297, "learning_rate": 4.242641865953173e-06, "loss": 0.4979, "num_input_tokens_seen": 64759056, "step": 111540 }, { "epoch": 16.613792076258566, "grad_norm": 1.097597360610962, "learning_rate": 4.240831056965131e-06, "loss": 0.469, "num_input_tokens_seen": 64761904, "step": 111545 }, { "epoch": 16.614536788799523, "grad_norm": 1.3452874422073364, "learning_rate": 4.239020598682155e-06, "loss": 0.3477, "num_input_tokens_seen": 64764784, "step": 111550 }, { "epoch": 16.615281501340483, "grad_norm": 1.2763051986694336, "learning_rate": 4.237210491134821e-06, "loss": 0.5012, "num_input_tokens_seen": 64767792, "step": 111555 }, { "epoch": 16.61602621388144, "grad_norm": 1.8585771322250366, "learning_rate": 4.23540073435372e-06, "loss": 0.5936, "num_input_tokens_seen": 64770992, "step": 111560 }, { "epoch": 16.6167709264224, "grad_norm": 1.6133614778518677, "learning_rate": 4.2335913283694126e-06, "loss": 0.7887, "num_input_tokens_seen": 64773840, "step": 111565 }, { "epoch": 16.61751563896336, "grad_norm": 1.1642367839813232, "learning_rate": 4.231782273212481e-06, "loss": 0.6547, "num_input_tokens_seen": 64776976, "step": 111570 }, { "epoch": 16.61826035150432, "grad_norm": 2.4153921604156494, "learning_rate": 4.2299735689134784e-06, "loss": 0.6161, "num_input_tokens_seen": 64779760, "step": 111575 }, { "epoch": 16.61900506404528, "grad_norm": 1.743497371673584, "learning_rate": 4.228165215502958e-06, "loss": 0.5673, "num_input_tokens_seen": 64782832, "step": 111580 }, { "epoch": 16.61974977658624, "grad_norm": 1.593144178390503, "learning_rate": 4.226357213011478e-06, "loss": 0.4701, "num_input_tokens_seen": 64785840, "step": 111585 }, { "epoch": 16.620494489127196, "grad_norm": 1.6187487840652466, "learning_rate": 4.224549561469582e-06, "loss": 0.5976, "num_input_tokens_seen": 64788752, "step": 111590 }, { "epoch": 16.621239201668157, "grad_norm": 1.811023235321045, "learning_rate": 4.222742260907806e-06, "loss": 0.5114, "num_input_tokens_seen": 64791664, "step": 111595 }, { "epoch": 16.621983914209114, "grad_norm": 1.5905929803848267, "learning_rate": 4.220935311356675e-06, "loss": 0.5744, "num_input_tokens_seen": 64794416, "step": 111600 }, { "epoch": 16.622728626750074, "grad_norm": 0.8864444494247437, "learning_rate": 4.219128712846729e-06, "loss": 0.4166, "num_input_tokens_seen": 64797616, "step": 111605 }, { "epoch": 16.623473339291035, "grad_norm": 2.4130945205688477, "learning_rate": 4.217322465408477e-06, "loss": 0.5854, "num_input_tokens_seen": 64800944, "step": 111610 }, { "epoch": 16.624218051831992, "grad_norm": 2.04779052734375, "learning_rate": 4.2155165690724476e-06, "loss": 0.5456, "num_input_tokens_seen": 64803792, "step": 111615 }, { "epoch": 16.624962764372953, "grad_norm": 1.7066253423690796, "learning_rate": 4.213711023869138e-06, "loss": 0.5067, "num_input_tokens_seen": 64806480, "step": 111620 }, { "epoch": 16.62570747691391, "grad_norm": 1.450578212738037, "learning_rate": 4.211905829829049e-06, "loss": 0.6532, "num_input_tokens_seen": 64809392, "step": 111625 }, { "epoch": 16.62645218945487, "grad_norm": 1.415446400642395, "learning_rate": 4.21010098698269e-06, "loss": 0.8325, "num_input_tokens_seen": 64813744, "step": 111630 }, { "epoch": 16.62719690199583, "grad_norm": 1.3209935426712036, "learning_rate": 4.208296495360539e-06, "loss": 0.542, "num_input_tokens_seen": 64816304, "step": 111635 }, { "epoch": 16.627941614536788, "grad_norm": 1.4648616313934326, "learning_rate": 4.206492354993094e-06, "loss": 0.5591, "num_input_tokens_seen": 64818928, "step": 111640 }, { "epoch": 16.628686327077748, "grad_norm": 1.5287913084030151, "learning_rate": 4.204688565910819e-06, "loss": 0.6643, "num_input_tokens_seen": 64821808, "step": 111645 }, { "epoch": 16.62943103961871, "grad_norm": 1.8275179862976074, "learning_rate": 4.202885128144202e-06, "loss": 0.4906, "num_input_tokens_seen": 64824688, "step": 111650 }, { "epoch": 16.630175752159666, "grad_norm": 2.4449074268341064, "learning_rate": 4.201082041723703e-06, "loss": 0.669, "num_input_tokens_seen": 64828016, "step": 111655 }, { "epoch": 16.630920464700626, "grad_norm": 3.2344729900360107, "learning_rate": 4.1992793066797845e-06, "loss": 0.4364, "num_input_tokens_seen": 64830768, "step": 111660 }, { "epoch": 16.631665177241583, "grad_norm": 1.1553363800048828, "learning_rate": 4.197476923042901e-06, "loss": 0.5268, "num_input_tokens_seen": 64833968, "step": 111665 }, { "epoch": 16.632409889782544, "grad_norm": 1.2937020063400269, "learning_rate": 4.195674890843495e-06, "loss": 0.6151, "num_input_tokens_seen": 64836624, "step": 111670 }, { "epoch": 16.633154602323504, "grad_norm": 0.9315527081489563, "learning_rate": 4.193873210112026e-06, "loss": 0.545, "num_input_tokens_seen": 64839312, "step": 111675 }, { "epoch": 16.63389931486446, "grad_norm": 2.120856285095215, "learning_rate": 4.192071880878914e-06, "loss": 0.5182, "num_input_tokens_seen": 64842192, "step": 111680 }, { "epoch": 16.63464402740542, "grad_norm": 1.2180427312850952, "learning_rate": 4.1902709031746094e-06, "loss": 0.4881, "num_input_tokens_seen": 64845104, "step": 111685 }, { "epoch": 16.635388739946382, "grad_norm": 1.6291581392288208, "learning_rate": 4.188470277029516e-06, "loss": 0.6346, "num_input_tokens_seen": 64848304, "step": 111690 }, { "epoch": 16.63613345248734, "grad_norm": 1.0233737230300903, "learning_rate": 4.1866700024740745e-06, "loss": 0.5425, "num_input_tokens_seen": 64851344, "step": 111695 }, { "epoch": 16.6368781650283, "grad_norm": 1.4029639959335327, "learning_rate": 4.184870079538692e-06, "loss": 0.6135, "num_input_tokens_seen": 64854224, "step": 111700 }, { "epoch": 16.637622877569257, "grad_norm": 1.4445298910140991, "learning_rate": 4.183070508253764e-06, "loss": 0.4777, "num_input_tokens_seen": 64857072, "step": 111705 }, { "epoch": 16.638367590110217, "grad_norm": 2.3100109100341797, "learning_rate": 4.18127128864971e-06, "loss": 0.7769, "num_input_tokens_seen": 64860112, "step": 111710 }, { "epoch": 16.639112302651178, "grad_norm": 2.730987310409546, "learning_rate": 4.179472420756911e-06, "loss": 0.6966, "num_input_tokens_seen": 64863056, "step": 111715 }, { "epoch": 16.639857015192135, "grad_norm": 2.023038387298584, "learning_rate": 4.177673904605773e-06, "loss": 0.5308, "num_input_tokens_seen": 64865936, "step": 111720 }, { "epoch": 16.640601727733095, "grad_norm": 1.6216120719909668, "learning_rate": 4.1758757402266675e-06, "loss": 0.5744, "num_input_tokens_seen": 64868464, "step": 111725 }, { "epoch": 16.641346440274056, "grad_norm": 2.0971486568450928, "learning_rate": 4.1740779276499805e-06, "loss": 0.5372, "num_input_tokens_seen": 64871472, "step": 111730 }, { "epoch": 16.642091152815013, "grad_norm": 1.5857977867126465, "learning_rate": 4.172280466906079e-06, "loss": 0.5635, "num_input_tokens_seen": 64874160, "step": 111735 }, { "epoch": 16.642835865355973, "grad_norm": 2.248584508895874, "learning_rate": 4.170483358025323e-06, "loss": 0.8221, "num_input_tokens_seen": 64877232, "step": 111740 }, { "epoch": 16.64358057789693, "grad_norm": 1.8469165563583374, "learning_rate": 4.168686601038091e-06, "loss": 0.5544, "num_input_tokens_seen": 64880016, "step": 111745 }, { "epoch": 16.64432529043789, "grad_norm": 1.6181507110595703, "learning_rate": 4.1668901959747155e-06, "loss": 0.5174, "num_input_tokens_seen": 64882768, "step": 111750 }, { "epoch": 16.64507000297885, "grad_norm": 1.5415652990341187, "learning_rate": 4.165094142865566e-06, "loss": 0.5432, "num_input_tokens_seen": 64885936, "step": 111755 }, { "epoch": 16.64581471551981, "grad_norm": 2.0458829402923584, "learning_rate": 4.163298441740968e-06, "loss": 0.5758, "num_input_tokens_seen": 64888656, "step": 111760 }, { "epoch": 16.64655942806077, "grad_norm": 1.9949699640274048, "learning_rate": 4.161503092631272e-06, "loss": 0.5735, "num_input_tokens_seen": 64891568, "step": 111765 }, { "epoch": 16.647304140601726, "grad_norm": 1.7931132316589355, "learning_rate": 4.159708095566794e-06, "loss": 0.5661, "num_input_tokens_seen": 64894384, "step": 111770 }, { "epoch": 16.648048853142686, "grad_norm": 3.408470630645752, "learning_rate": 4.157913450577875e-06, "loss": 0.6812, "num_input_tokens_seen": 64897296, "step": 111775 }, { "epoch": 16.648793565683647, "grad_norm": 1.826924204826355, "learning_rate": 4.1561191576948235e-06, "loss": 0.5258, "num_input_tokens_seen": 64899984, "step": 111780 }, { "epoch": 16.649538278224604, "grad_norm": 0.8160297274589539, "learning_rate": 4.1543252169479546e-06, "loss": 0.4678, "num_input_tokens_seen": 64902672, "step": 111785 }, { "epoch": 16.650282990765565, "grad_norm": 2.199130058288574, "learning_rate": 4.15253162836757e-06, "loss": 0.4643, "num_input_tokens_seen": 64905328, "step": 111790 }, { "epoch": 16.651027703306525, "grad_norm": 1.4757132530212402, "learning_rate": 4.1507383919839795e-06, "loss": 0.5727, "num_input_tokens_seen": 64908080, "step": 111795 }, { "epoch": 16.651772415847482, "grad_norm": 1.617774486541748, "learning_rate": 4.148945507827476e-06, "loss": 0.5959, "num_input_tokens_seen": 64910960, "step": 111800 }, { "epoch": 16.652517128388443, "grad_norm": 1.6613177061080933, "learning_rate": 4.147152975928336e-06, "loss": 0.675, "num_input_tokens_seen": 64913776, "step": 111805 }, { "epoch": 16.6532618409294, "grad_norm": 2.726126194000244, "learning_rate": 4.1453607963168604e-06, "loss": 0.5516, "num_input_tokens_seen": 64916656, "step": 111810 }, { "epoch": 16.65400655347036, "grad_norm": 1.7091902494430542, "learning_rate": 4.1435689690233205e-06, "loss": 0.6588, "num_input_tokens_seen": 64919440, "step": 111815 }, { "epoch": 16.65475126601132, "grad_norm": 1.4820642471313477, "learning_rate": 4.141777494077978e-06, "loss": 0.6643, "num_input_tokens_seen": 64922288, "step": 111820 }, { "epoch": 16.655495978552278, "grad_norm": 1.6353601217269897, "learning_rate": 4.139986371511109e-06, "loss": 0.7597, "num_input_tokens_seen": 64925488, "step": 111825 }, { "epoch": 16.656240691093238, "grad_norm": 4.066806793212891, "learning_rate": 4.138195601352968e-06, "loss": 0.5871, "num_input_tokens_seen": 64928496, "step": 111830 }, { "epoch": 16.6569854036342, "grad_norm": 1.5288631916046143, "learning_rate": 4.1364051836338125e-06, "loss": 0.517, "num_input_tokens_seen": 64931312, "step": 111835 }, { "epoch": 16.657730116175156, "grad_norm": 1.5684945583343506, "learning_rate": 4.134615118383878e-06, "loss": 0.6079, "num_input_tokens_seen": 64934448, "step": 111840 }, { "epoch": 16.658474828716116, "grad_norm": 1.8980063199996948, "learning_rate": 4.132825405633425e-06, "loss": 0.5705, "num_input_tokens_seen": 64937776, "step": 111845 }, { "epoch": 16.659219541257073, "grad_norm": 1.3960376977920532, "learning_rate": 4.131036045412675e-06, "loss": 0.6361, "num_input_tokens_seen": 64940912, "step": 111850 }, { "epoch": 16.659964253798034, "grad_norm": 1.3258908987045288, "learning_rate": 4.1292470377518625e-06, "loss": 0.5678, "num_input_tokens_seen": 64943696, "step": 111855 }, { "epoch": 16.660708966338994, "grad_norm": 1.0689113140106201, "learning_rate": 4.12745838268121e-06, "loss": 0.6633, "num_input_tokens_seen": 64946608, "step": 111860 }, { "epoch": 16.66145367887995, "grad_norm": 1.7705597877502441, "learning_rate": 4.125670080230926e-06, "loss": 0.6727, "num_input_tokens_seen": 64949712, "step": 111865 }, { "epoch": 16.66219839142091, "grad_norm": 0.9254872798919678, "learning_rate": 4.123882130431236e-06, "loss": 0.7821, "num_input_tokens_seen": 64952336, "step": 111870 }, { "epoch": 16.662943103961872, "grad_norm": 1.6592801809310913, "learning_rate": 4.122094533312337e-06, "loss": 0.7592, "num_input_tokens_seen": 64955568, "step": 111875 }, { "epoch": 16.66368781650283, "grad_norm": 1.0678716897964478, "learning_rate": 4.120307288904435e-06, "loss": 0.6669, "num_input_tokens_seen": 64958416, "step": 111880 }, { "epoch": 16.66443252904379, "grad_norm": 1.388143539428711, "learning_rate": 4.118520397237715e-06, "loss": 0.4962, "num_input_tokens_seen": 64961168, "step": 111885 }, { "epoch": 16.665177241584747, "grad_norm": 1.2643027305603027, "learning_rate": 4.1167338583423755e-06, "loss": 0.4, "num_input_tokens_seen": 64963792, "step": 111890 }, { "epoch": 16.665921954125707, "grad_norm": 1.8661845922470093, "learning_rate": 4.114947672248593e-06, "loss": 0.5019, "num_input_tokens_seen": 64966608, "step": 111895 }, { "epoch": 16.666666666666668, "grad_norm": 2.085651397705078, "learning_rate": 4.113161838986537e-06, "loss": 0.3982, "num_input_tokens_seen": 64969296, "step": 111900 }, { "epoch": 16.667411379207625, "grad_norm": 2.745119333267212, "learning_rate": 4.111376358586388e-06, "loss": 0.7786, "num_input_tokens_seen": 64972240, "step": 111905 }, { "epoch": 16.668156091748585, "grad_norm": 1.1521871089935303, "learning_rate": 4.109591231078303e-06, "loss": 0.4947, "num_input_tokens_seen": 64975024, "step": 111910 }, { "epoch": 16.668900804289546, "grad_norm": 1.5262079238891602, "learning_rate": 4.107806456492444e-06, "loss": 0.5017, "num_input_tokens_seen": 64978000, "step": 111915 }, { "epoch": 16.669645516830503, "grad_norm": 0.9858382344245911, "learning_rate": 4.106022034858961e-06, "loss": 0.6529, "num_input_tokens_seen": 64981104, "step": 111920 }, { "epoch": 16.670390229371463, "grad_norm": 1.1398028135299683, "learning_rate": 4.104237966208002e-06, "loss": 0.5539, "num_input_tokens_seen": 64984048, "step": 111925 }, { "epoch": 16.67113494191242, "grad_norm": 1.8062268495559692, "learning_rate": 4.102454250569698e-06, "loss": 0.4422, "num_input_tokens_seen": 64986992, "step": 111930 }, { "epoch": 16.67187965445338, "grad_norm": 2.9634592533111572, "learning_rate": 4.1006708879741975e-06, "loss": 0.4835, "num_input_tokens_seen": 64989744, "step": 111935 }, { "epoch": 16.67262436699434, "grad_norm": 1.491766095161438, "learning_rate": 4.098887878451621e-06, "loss": 0.5927, "num_input_tokens_seen": 64992496, "step": 111940 }, { "epoch": 16.6733690795353, "grad_norm": 2.0290138721466064, "learning_rate": 4.097105222032083e-06, "loss": 0.4946, "num_input_tokens_seen": 64995952, "step": 111945 }, { "epoch": 16.67411379207626, "grad_norm": 1.689675211906433, "learning_rate": 4.095322918745717e-06, "loss": 0.554, "num_input_tokens_seen": 64998960, "step": 111950 }, { "epoch": 16.67485850461722, "grad_norm": 1.1643551588058472, "learning_rate": 4.093540968622614e-06, "loss": 0.7698, "num_input_tokens_seen": 65001936, "step": 111955 }, { "epoch": 16.675603217158177, "grad_norm": 1.1966280937194824, "learning_rate": 4.091759371692896e-06, "loss": 0.6575, "num_input_tokens_seen": 65005392, "step": 111960 }, { "epoch": 16.676347929699137, "grad_norm": 3.6701974868774414, "learning_rate": 4.089978127986646e-06, "loss": 0.7626, "num_input_tokens_seen": 65008144, "step": 111965 }, { "epoch": 16.677092642240094, "grad_norm": 1.2555738687515259, "learning_rate": 4.08819723753397e-06, "loss": 0.627, "num_input_tokens_seen": 65011152, "step": 111970 }, { "epoch": 16.677837354781055, "grad_norm": 1.3832439184188843, "learning_rate": 4.086416700364948e-06, "loss": 0.591, "num_input_tokens_seen": 65013808, "step": 111975 }, { "epoch": 16.678582067322015, "grad_norm": 1.6756274700164795, "learning_rate": 4.084636516509654e-06, "loss": 0.5944, "num_input_tokens_seen": 65016944, "step": 111980 }, { "epoch": 16.679326779862972, "grad_norm": 2.183577060699463, "learning_rate": 4.0828566859981765e-06, "loss": 0.5209, "num_input_tokens_seen": 65019696, "step": 111985 }, { "epoch": 16.680071492403933, "grad_norm": 1.5950372219085693, "learning_rate": 4.081077208860573e-06, "loss": 0.7638, "num_input_tokens_seen": 65022544, "step": 111990 }, { "epoch": 16.68081620494489, "grad_norm": 1.653537631034851, "learning_rate": 4.079298085126912e-06, "loss": 0.595, "num_input_tokens_seen": 65025488, "step": 111995 }, { "epoch": 16.68156091748585, "grad_norm": 1.212886095046997, "learning_rate": 4.0775193148272385e-06, "loss": 0.6022, "num_input_tokens_seen": 65028336, "step": 112000 }, { "epoch": 16.68230563002681, "grad_norm": 3.222486972808838, "learning_rate": 4.07574089799162e-06, "loss": 0.6871, "num_input_tokens_seen": 65031184, "step": 112005 }, { "epoch": 16.683050342567768, "grad_norm": 1.1597795486450195, "learning_rate": 4.073962834650083e-06, "loss": 0.5228, "num_input_tokens_seen": 65033904, "step": 112010 }, { "epoch": 16.683795055108728, "grad_norm": 2.0039405822753906, "learning_rate": 4.072185124832684e-06, "loss": 0.7454, "num_input_tokens_seen": 65037040, "step": 112015 }, { "epoch": 16.68453976764969, "grad_norm": 1.4451484680175781, "learning_rate": 4.070407768569448e-06, "loss": 0.4229, "num_input_tokens_seen": 65040016, "step": 112020 }, { "epoch": 16.685284480190646, "grad_norm": 0.7616029977798462, "learning_rate": 4.068630765890393e-06, "loss": 0.5836, "num_input_tokens_seen": 65042704, "step": 112025 }, { "epoch": 16.686029192731606, "grad_norm": 2.7812957763671875, "learning_rate": 4.0668541168255556e-06, "loss": 0.5651, "num_input_tokens_seen": 65045360, "step": 112030 }, { "epoch": 16.686773905272563, "grad_norm": 2.169928789138794, "learning_rate": 4.065077821404934e-06, "loss": 0.5959, "num_input_tokens_seen": 65048464, "step": 112035 }, { "epoch": 16.687518617813524, "grad_norm": 1.5535993576049805, "learning_rate": 4.0633018796585514e-06, "loss": 0.5026, "num_input_tokens_seen": 65051376, "step": 112040 }, { "epoch": 16.688263330354484, "grad_norm": 1.5521910190582275, "learning_rate": 4.061526291616399e-06, "loss": 0.5266, "num_input_tokens_seen": 65054256, "step": 112045 }, { "epoch": 16.68900804289544, "grad_norm": 1.6029467582702637, "learning_rate": 4.059751057308486e-06, "loss": 0.6102, "num_input_tokens_seen": 65057008, "step": 112050 }, { "epoch": 16.689752755436402, "grad_norm": 2.1423697471618652, "learning_rate": 4.057976176764797e-06, "loss": 0.5253, "num_input_tokens_seen": 65060048, "step": 112055 }, { "epoch": 16.690497467977362, "grad_norm": 1.7345597743988037, "learning_rate": 4.056201650015315e-06, "loss": 0.6495, "num_input_tokens_seen": 65063056, "step": 112060 }, { "epoch": 16.69124218051832, "grad_norm": 0.92550128698349, "learning_rate": 4.054427477090019e-06, "loss": 0.6943, "num_input_tokens_seen": 65066000, "step": 112065 }, { "epoch": 16.69198689305928, "grad_norm": 1.224947214126587, "learning_rate": 4.0526536580188766e-06, "loss": 0.6981, "num_input_tokens_seen": 65068816, "step": 112070 }, { "epoch": 16.692731605600237, "grad_norm": 2.1210670471191406, "learning_rate": 4.050880192831868e-06, "loss": 0.6826, "num_input_tokens_seen": 65071760, "step": 112075 }, { "epoch": 16.693476318141197, "grad_norm": 0.7776117920875549, "learning_rate": 4.04910708155894e-06, "loss": 0.5656, "num_input_tokens_seen": 65074768, "step": 112080 }, { "epoch": 16.694221030682158, "grad_norm": 1.3697246313095093, "learning_rate": 4.047334324230059e-06, "loss": 0.5423, "num_input_tokens_seen": 65077808, "step": 112085 }, { "epoch": 16.694965743223115, "grad_norm": 1.8977569341659546, "learning_rate": 4.045561920875165e-06, "loss": 0.6299, "num_input_tokens_seen": 65080720, "step": 112090 }, { "epoch": 16.695710455764075, "grad_norm": 0.8050650358200073, "learning_rate": 4.043789871524212e-06, "loss": 0.692, "num_input_tokens_seen": 65083440, "step": 112095 }, { "epoch": 16.696455168305036, "grad_norm": 2.049342155456543, "learning_rate": 4.042018176207127e-06, "loss": 0.5765, "num_input_tokens_seen": 65086608, "step": 112100 }, { "epoch": 16.697199880845993, "grad_norm": 1.4076666831970215, "learning_rate": 4.04024683495384e-06, "loss": 0.5638, "num_input_tokens_seen": 65089232, "step": 112105 }, { "epoch": 16.697944593386953, "grad_norm": 1.8941898345947266, "learning_rate": 4.038475847794287e-06, "loss": 0.5817, "num_input_tokens_seen": 65091792, "step": 112110 }, { "epoch": 16.69868930592791, "grad_norm": 2.1700620651245117, "learning_rate": 4.036705214758379e-06, "loss": 0.5753, "num_input_tokens_seen": 65094832, "step": 112115 }, { "epoch": 16.69943401846887, "grad_norm": 0.6032728552818298, "learning_rate": 4.0349349358760255e-06, "loss": 0.6427, "num_input_tokens_seen": 65097808, "step": 112120 }, { "epoch": 16.70017873100983, "grad_norm": 1.9933162927627563, "learning_rate": 4.0331650111771426e-06, "loss": 0.471, "num_input_tokens_seen": 65100784, "step": 112125 }, { "epoch": 16.70092344355079, "grad_norm": 4.007033824920654, "learning_rate": 4.031395440691629e-06, "loss": 0.8063, "num_input_tokens_seen": 65103696, "step": 112130 }, { "epoch": 16.70166815609175, "grad_norm": 2.4798519611358643, "learning_rate": 4.02962622444937e-06, "loss": 0.5856, "num_input_tokens_seen": 65106448, "step": 112135 }, { "epoch": 16.702412868632706, "grad_norm": 0.9919107556343079, "learning_rate": 4.0278573624802695e-06, "loss": 0.4413, "num_input_tokens_seen": 65109264, "step": 112140 }, { "epoch": 16.703157581173667, "grad_norm": 1.5280972719192505, "learning_rate": 4.026088854814205e-06, "loss": 0.5633, "num_input_tokens_seen": 65112240, "step": 112145 }, { "epoch": 16.703902293714627, "grad_norm": 2.078899383544922, "learning_rate": 4.024320701481044e-06, "loss": 0.4603, "num_input_tokens_seen": 65115408, "step": 112150 }, { "epoch": 16.704647006255584, "grad_norm": 1.4278963804244995, "learning_rate": 4.0225529025106735e-06, "loss": 0.8785, "num_input_tokens_seen": 65118320, "step": 112155 }, { "epoch": 16.705391718796545, "grad_norm": 2.6631271839141846, "learning_rate": 4.020785457932946e-06, "loss": 0.5504, "num_input_tokens_seen": 65121296, "step": 112160 }, { "epoch": 16.706136431337505, "grad_norm": 1.7502716779708862, "learning_rate": 4.01901836777773e-06, "loss": 0.5103, "num_input_tokens_seen": 65124080, "step": 112165 }, { "epoch": 16.706881143878462, "grad_norm": 1.9850963354110718, "learning_rate": 4.01725163207487e-06, "loss": 0.7167, "num_input_tokens_seen": 65126736, "step": 112170 }, { "epoch": 16.707625856419423, "grad_norm": 2.918139696121216, "learning_rate": 4.015485250854223e-06, "loss": 0.6097, "num_input_tokens_seen": 65129648, "step": 112175 }, { "epoch": 16.70837056896038, "grad_norm": 2.616785764694214, "learning_rate": 4.013719224145623e-06, "loss": 0.5398, "num_input_tokens_seen": 65132848, "step": 112180 }, { "epoch": 16.70911528150134, "grad_norm": 1.8469040393829346, "learning_rate": 4.011953551978911e-06, "loss": 0.5272, "num_input_tokens_seen": 65135504, "step": 112185 }, { "epoch": 16.7098599940423, "grad_norm": 1.1751195192337036, "learning_rate": 4.0101882343839105e-06, "loss": 0.5408, "num_input_tokens_seen": 65138288, "step": 112190 }, { "epoch": 16.710604706583258, "grad_norm": 1.78449285030365, "learning_rate": 4.0084232713904405e-06, "loss": 0.4131, "num_input_tokens_seen": 65140944, "step": 112195 }, { "epoch": 16.71134941912422, "grad_norm": 1.1690089702606201, "learning_rate": 4.006658663028331e-06, "loss": 0.6211, "num_input_tokens_seen": 65144048, "step": 112200 }, { "epoch": 16.71209413166518, "grad_norm": 1.1681978702545166, "learning_rate": 4.004894409327381e-06, "loss": 0.5952, "num_input_tokens_seen": 65146800, "step": 112205 }, { "epoch": 16.712838844206136, "grad_norm": 1.1291054487228394, "learning_rate": 4.0031305103174076e-06, "loss": 0.6003, "num_input_tokens_seen": 65149712, "step": 112210 }, { "epoch": 16.713583556747096, "grad_norm": 1.5509275197982788, "learning_rate": 4.001366966028197e-06, "loss": 0.6976, "num_input_tokens_seen": 65152528, "step": 112215 }, { "epoch": 16.714328269288053, "grad_norm": 1.6400614976882935, "learning_rate": 3.999603776489555e-06, "loss": 0.6177, "num_input_tokens_seen": 65155248, "step": 112220 }, { "epoch": 16.715072981829014, "grad_norm": 0.8807314038276672, "learning_rate": 3.997840941731265e-06, "loss": 0.6874, "num_input_tokens_seen": 65158128, "step": 112225 }, { "epoch": 16.715817694369974, "grad_norm": 1.4697972536087036, "learning_rate": 3.996078461783098e-06, "loss": 0.7821, "num_input_tokens_seen": 65160912, "step": 112230 }, { "epoch": 16.71656240691093, "grad_norm": 2.291365146636963, "learning_rate": 3.994316336674847e-06, "loss": 0.7307, "num_input_tokens_seen": 65163760, "step": 112235 }, { "epoch": 16.717307119451892, "grad_norm": 2.960486650466919, "learning_rate": 3.992554566436263e-06, "loss": 0.6548, "num_input_tokens_seen": 65166512, "step": 112240 }, { "epoch": 16.718051831992852, "grad_norm": 0.9929346442222595, "learning_rate": 3.990793151097128e-06, "loss": 0.6602, "num_input_tokens_seen": 65169776, "step": 112245 }, { "epoch": 16.71879654453381, "grad_norm": 3.042524814605713, "learning_rate": 3.989032090687189e-06, "loss": 0.5672, "num_input_tokens_seen": 65172752, "step": 112250 }, { "epoch": 16.71954125707477, "grad_norm": 1.4756371974945068, "learning_rate": 3.987271385236197e-06, "loss": 0.5209, "num_input_tokens_seen": 65176240, "step": 112255 }, { "epoch": 16.720285969615727, "grad_norm": 1.2774208784103394, "learning_rate": 3.9855110347739e-06, "loss": 0.6689, "num_input_tokens_seen": 65179408, "step": 112260 }, { "epoch": 16.721030682156687, "grad_norm": 1.2861465215682983, "learning_rate": 3.983751039330028e-06, "loss": 0.7619, "num_input_tokens_seen": 65182096, "step": 112265 }, { "epoch": 16.721775394697648, "grad_norm": 1.6599069833755493, "learning_rate": 3.981991398934329e-06, "loss": 0.5882, "num_input_tokens_seen": 65185072, "step": 112270 }, { "epoch": 16.722520107238605, "grad_norm": 2.2468762397766113, "learning_rate": 3.980232113616519e-06, "loss": 0.7057, "num_input_tokens_seen": 65187984, "step": 112275 }, { "epoch": 16.723264819779565, "grad_norm": 1.1013528108596802, "learning_rate": 3.978473183406328e-06, "loss": 0.6139, "num_input_tokens_seen": 65190960, "step": 112280 }, { "epoch": 16.724009532320522, "grad_norm": 1.7598165273666382, "learning_rate": 3.9767146083334625e-06, "loss": 0.5721, "num_input_tokens_seen": 65194000, "step": 112285 }, { "epoch": 16.724754244861483, "grad_norm": 1.5978617668151855, "learning_rate": 3.974956388427642e-06, "loss": 0.508, "num_input_tokens_seen": 65197136, "step": 112290 }, { "epoch": 16.725498957402444, "grad_norm": 1.0328885316848755, "learning_rate": 3.97319852371856e-06, "loss": 0.5523, "num_input_tokens_seen": 65200016, "step": 112295 }, { "epoch": 16.7262436699434, "grad_norm": 1.0405900478363037, "learning_rate": 3.971441014235921e-06, "loss": 0.5484, "num_input_tokens_seen": 65202736, "step": 112300 }, { "epoch": 16.72698838248436, "grad_norm": 1.1953530311584473, "learning_rate": 3.969683860009415e-06, "loss": 0.5042, "num_input_tokens_seen": 65205680, "step": 112305 }, { "epoch": 16.72773309502532, "grad_norm": 1.2841142416000366, "learning_rate": 3.967927061068721e-06, "loss": 0.513, "num_input_tokens_seen": 65208400, "step": 112310 }, { "epoch": 16.72847780756628, "grad_norm": 1.7933273315429688, "learning_rate": 3.966170617443529e-06, "loss": 0.5162, "num_input_tokens_seen": 65211312, "step": 112315 }, { "epoch": 16.72922252010724, "grad_norm": 1.652365803718567, "learning_rate": 3.964414529163507e-06, "loss": 0.4189, "num_input_tokens_seen": 65213936, "step": 112320 }, { "epoch": 16.7299672326482, "grad_norm": 1.9559314250946045, "learning_rate": 3.96265879625832e-06, "loss": 0.5067, "num_input_tokens_seen": 65216656, "step": 112325 }, { "epoch": 16.730711945189157, "grad_norm": 2.144710063934326, "learning_rate": 3.960903418757628e-06, "loss": 0.5825, "num_input_tokens_seen": 65219600, "step": 112330 }, { "epoch": 16.731456657730117, "grad_norm": 2.9144105911254883, "learning_rate": 3.959148396691092e-06, "loss": 0.6122, "num_input_tokens_seen": 65222608, "step": 112335 }, { "epoch": 16.732201370271074, "grad_norm": 2.2227985858917236, "learning_rate": 3.957393730088363e-06, "loss": 0.4354, "num_input_tokens_seen": 65225520, "step": 112340 }, { "epoch": 16.732946082812035, "grad_norm": 1.3054569959640503, "learning_rate": 3.9556394189790705e-06, "loss": 0.5353, "num_input_tokens_seen": 65228560, "step": 112345 }, { "epoch": 16.733690795352995, "grad_norm": 1.5270503759384155, "learning_rate": 3.95388546339287e-06, "loss": 0.9303, "num_input_tokens_seen": 65231152, "step": 112350 }, { "epoch": 16.734435507893952, "grad_norm": 3.4794230461120605, "learning_rate": 3.9521318633593765e-06, "loss": 0.6917, "num_input_tokens_seen": 65234128, "step": 112355 }, { "epoch": 16.735180220434913, "grad_norm": 1.8959206342697144, "learning_rate": 3.950378618908232e-06, "loss": 0.4239, "num_input_tokens_seen": 65236656, "step": 112360 }, { "epoch": 16.73592493297587, "grad_norm": 2.3441214561462402, "learning_rate": 3.948625730069039e-06, "loss": 0.5338, "num_input_tokens_seen": 65239792, "step": 112365 }, { "epoch": 16.73666964551683, "grad_norm": 2.4329538345336914, "learning_rate": 3.946873196871423e-06, "loss": 0.6875, "num_input_tokens_seen": 65242832, "step": 112370 }, { "epoch": 16.73741435805779, "grad_norm": 2.2947707176208496, "learning_rate": 3.945121019344983e-06, "loss": 0.6282, "num_input_tokens_seen": 65245776, "step": 112375 }, { "epoch": 16.738159070598748, "grad_norm": 2.099250555038452, "learning_rate": 3.943369197519328e-06, "loss": 0.6709, "num_input_tokens_seen": 65248624, "step": 112380 }, { "epoch": 16.73890378313971, "grad_norm": 2.9289169311523438, "learning_rate": 3.941617731424052e-06, "loss": 0.8387, "num_input_tokens_seen": 65251440, "step": 112385 }, { "epoch": 16.73964849568067, "grad_norm": 3.2339582443237305, "learning_rate": 3.9398666210887395e-06, "loss": 0.7658, "num_input_tokens_seen": 65254320, "step": 112390 }, { "epoch": 16.740393208221626, "grad_norm": 1.071319341659546, "learning_rate": 3.938115866542977e-06, "loss": 0.6424, "num_input_tokens_seen": 65257200, "step": 112395 }, { "epoch": 16.741137920762586, "grad_norm": 1.805873990058899, "learning_rate": 3.936365467816333e-06, "loss": 0.664, "num_input_tokens_seen": 65260144, "step": 112400 }, { "epoch": 16.741882633303543, "grad_norm": 1.3884249925613403, "learning_rate": 3.934615424938395e-06, "loss": 0.536, "num_input_tokens_seen": 65262864, "step": 112405 }, { "epoch": 16.742627345844504, "grad_norm": 3.92295503616333, "learning_rate": 3.932865737938712e-06, "loss": 0.6675, "num_input_tokens_seen": 65265616, "step": 112410 }, { "epoch": 16.743372058385464, "grad_norm": 1.4161624908447266, "learning_rate": 3.9311164068468576e-06, "loss": 0.636, "num_input_tokens_seen": 65268528, "step": 112415 }, { "epoch": 16.74411677092642, "grad_norm": 1.856217861175537, "learning_rate": 3.929367431692377e-06, "loss": 0.4937, "num_input_tokens_seen": 65271472, "step": 112420 }, { "epoch": 16.744861483467382, "grad_norm": 2.308671712875366, "learning_rate": 3.927618812504813e-06, "loss": 0.6418, "num_input_tokens_seen": 65274416, "step": 112425 }, { "epoch": 16.745606196008342, "grad_norm": 2.0098702907562256, "learning_rate": 3.925870549313718e-06, "loss": 0.5762, "num_input_tokens_seen": 65277584, "step": 112430 }, { "epoch": 16.7463509085493, "grad_norm": 1.8170350790023804, "learning_rate": 3.9241226421486145e-06, "loss": 0.5657, "num_input_tokens_seen": 65280528, "step": 112435 }, { "epoch": 16.74709562109026, "grad_norm": 2.916684627532959, "learning_rate": 3.922375091039046e-06, "loss": 0.5924, "num_input_tokens_seen": 65283312, "step": 112440 }, { "epoch": 16.747840333631217, "grad_norm": 2.3360252380371094, "learning_rate": 3.9206278960145255e-06, "loss": 0.6661, "num_input_tokens_seen": 65286224, "step": 112445 }, { "epoch": 16.748585046172177, "grad_norm": 1.5621562004089355, "learning_rate": 3.918881057104567e-06, "loss": 0.6883, "num_input_tokens_seen": 65289296, "step": 112450 }, { "epoch": 16.749329758713138, "grad_norm": 1.1337392330169678, "learning_rate": 3.917134574338696e-06, "loss": 0.6077, "num_input_tokens_seen": 65291856, "step": 112455 }, { "epoch": 16.750074471254095, "grad_norm": 3.0521609783172607, "learning_rate": 3.915388447746407e-06, "loss": 0.8348, "num_input_tokens_seen": 65294704, "step": 112460 }, { "epoch": 16.750819183795056, "grad_norm": 1.6668928861618042, "learning_rate": 3.913642677357201e-06, "loss": 0.7884, "num_input_tokens_seen": 65297520, "step": 112465 }, { "epoch": 16.751563896336016, "grad_norm": 1.0831769704818726, "learning_rate": 3.911897263200564e-06, "loss": 0.5033, "num_input_tokens_seen": 65300496, "step": 112470 }, { "epoch": 16.752308608876973, "grad_norm": 1.8811888694763184, "learning_rate": 3.910152205305998e-06, "loss": 0.6053, "num_input_tokens_seen": 65303344, "step": 112475 }, { "epoch": 16.753053321417934, "grad_norm": 1.2407467365264893, "learning_rate": 3.908407503702966e-06, "loss": 0.4759, "num_input_tokens_seen": 65306320, "step": 112480 }, { "epoch": 16.75379803395889, "grad_norm": 1.488976240158081, "learning_rate": 3.906663158420962e-06, "loss": 0.4168, "num_input_tokens_seen": 65309136, "step": 112485 }, { "epoch": 16.75454274649985, "grad_norm": 1.1741199493408203, "learning_rate": 3.904919169489438e-06, "loss": 0.449, "num_input_tokens_seen": 65311952, "step": 112490 }, { "epoch": 16.75528745904081, "grad_norm": 1.5097675323486328, "learning_rate": 3.90317553693787e-06, "loss": 0.6825, "num_input_tokens_seen": 65314896, "step": 112495 }, { "epoch": 16.75603217158177, "grad_norm": 1.9302908182144165, "learning_rate": 3.90143226079571e-06, "loss": 0.4563, "num_input_tokens_seen": 65317456, "step": 112500 }, { "epoch": 16.75677688412273, "grad_norm": 1.1155365705490112, "learning_rate": 3.899689341092402e-06, "loss": 0.6337, "num_input_tokens_seen": 65320240, "step": 112505 }, { "epoch": 16.757521596663686, "grad_norm": 1.5101829767227173, "learning_rate": 3.897946777857406e-06, "loss": 0.7306, "num_input_tokens_seen": 65323376, "step": 112510 }, { "epoch": 16.758266309204647, "grad_norm": 0.9024537801742554, "learning_rate": 3.896204571120149e-06, "loss": 0.563, "num_input_tokens_seen": 65326416, "step": 112515 }, { "epoch": 16.759011021745607, "grad_norm": 2.301159620285034, "learning_rate": 3.894462720910067e-06, "loss": 0.3794, "num_input_tokens_seen": 65329200, "step": 112520 }, { "epoch": 16.759755734286564, "grad_norm": 1.0812045335769653, "learning_rate": 3.892721227256582e-06, "loss": 0.5643, "num_input_tokens_seen": 65331888, "step": 112525 }, { "epoch": 16.760500446827525, "grad_norm": 2.522764205932617, "learning_rate": 3.890980090189126e-06, "loss": 0.5653, "num_input_tokens_seen": 65335056, "step": 112530 }, { "epoch": 16.761245159368485, "grad_norm": 1.2824015617370605, "learning_rate": 3.8892393097370975e-06, "loss": 0.4997, "num_input_tokens_seen": 65338064, "step": 112535 }, { "epoch": 16.761989871909442, "grad_norm": 1.2181297540664673, "learning_rate": 3.887498885929924e-06, "loss": 0.623, "num_input_tokens_seen": 65341008, "step": 112540 }, { "epoch": 16.762734584450403, "grad_norm": 1.6968961954116821, "learning_rate": 3.8857588187969975e-06, "loss": 0.8021, "num_input_tokens_seen": 65343568, "step": 112545 }, { "epoch": 16.76347929699136, "grad_norm": 1.7192630767822266, "learning_rate": 3.884019108367712e-06, "loss": 0.5715, "num_input_tokens_seen": 65346320, "step": 112550 }, { "epoch": 16.76422400953232, "grad_norm": 1.4095042943954468, "learning_rate": 3.882279754671467e-06, "loss": 0.5369, "num_input_tokens_seen": 65349168, "step": 112555 }, { "epoch": 16.76496872207328, "grad_norm": 1.1116219758987427, "learning_rate": 3.880540757737636e-06, "loss": 0.5281, "num_input_tokens_seen": 65352208, "step": 112560 }, { "epoch": 16.765713434614238, "grad_norm": 1.6641961336135864, "learning_rate": 3.878802117595609e-06, "loss": 0.618, "num_input_tokens_seen": 65355248, "step": 112565 }, { "epoch": 16.7664581471552, "grad_norm": 1.2605063915252686, "learning_rate": 3.877063834274749e-06, "loss": 0.4781, "num_input_tokens_seen": 65358128, "step": 112570 }, { "epoch": 16.76720285969616, "grad_norm": 3.2354350090026855, "learning_rate": 3.8753259078044365e-06, "loss": 0.7513, "num_input_tokens_seen": 65361232, "step": 112575 }, { "epoch": 16.767947572237116, "grad_norm": 1.7403974533081055, "learning_rate": 3.8735883382140184e-06, "loss": 0.5241, "num_input_tokens_seen": 65364016, "step": 112580 }, { "epoch": 16.768692284778076, "grad_norm": 2.110701560974121, "learning_rate": 3.871851125532855e-06, "loss": 0.5578, "num_input_tokens_seen": 65366832, "step": 112585 }, { "epoch": 16.769436997319033, "grad_norm": 1.3479764461517334, "learning_rate": 3.870114269790292e-06, "loss": 0.5435, "num_input_tokens_seen": 65369488, "step": 112590 }, { "epoch": 16.770181709859994, "grad_norm": 1.0772581100463867, "learning_rate": 3.8683777710156685e-06, "loss": 0.5085, "num_input_tokens_seen": 65372176, "step": 112595 }, { "epoch": 16.770926422400954, "grad_norm": 1.185996413230896, "learning_rate": 3.866641629238329e-06, "loss": 0.5913, "num_input_tokens_seen": 65374960, "step": 112600 }, { "epoch": 16.77167113494191, "grad_norm": 3.8452024459838867, "learning_rate": 3.864905844487596e-06, "loss": 0.8683, "num_input_tokens_seen": 65377936, "step": 112605 }, { "epoch": 16.772415847482872, "grad_norm": 1.420048475265503, "learning_rate": 3.8631704167928025e-06, "loss": 0.6105, "num_input_tokens_seen": 65380976, "step": 112610 }, { "epoch": 16.773160560023832, "grad_norm": 1.7396783828735352, "learning_rate": 3.861435346183259e-06, "loss": 0.5575, "num_input_tokens_seen": 65384176, "step": 112615 }, { "epoch": 16.77390527256479, "grad_norm": 1.3695995807647705, "learning_rate": 3.859700632688285e-06, "loss": 0.6217, "num_input_tokens_seen": 65387088, "step": 112620 }, { "epoch": 16.77464998510575, "grad_norm": 1.1738141775131226, "learning_rate": 3.857966276337183e-06, "loss": 0.5237, "num_input_tokens_seen": 65390096, "step": 112625 }, { "epoch": 16.775394697646707, "grad_norm": 0.7991380095481873, "learning_rate": 3.856232277159247e-06, "loss": 0.4115, "num_input_tokens_seen": 65392976, "step": 112630 }, { "epoch": 16.776139410187668, "grad_norm": 2.2135424613952637, "learning_rate": 3.8544986351837845e-06, "loss": 0.5865, "num_input_tokens_seen": 65395696, "step": 112635 }, { "epoch": 16.776884122728628, "grad_norm": 1.7144196033477783, "learning_rate": 3.852765350440069e-06, "loss": 0.5567, "num_input_tokens_seen": 65398640, "step": 112640 }, { "epoch": 16.777628835269585, "grad_norm": 1.4694616794586182, "learning_rate": 3.8510324229573956e-06, "loss": 0.6329, "num_input_tokens_seen": 65401520, "step": 112645 }, { "epoch": 16.778373547810546, "grad_norm": 3.384155750274658, "learning_rate": 3.849299852765034e-06, "loss": 0.4817, "num_input_tokens_seen": 65404080, "step": 112650 }, { "epoch": 16.779118260351503, "grad_norm": 1.6660726070404053, "learning_rate": 3.847567639892255e-06, "loss": 0.5442, "num_input_tokens_seen": 65407024, "step": 112655 }, { "epoch": 16.779862972892463, "grad_norm": 1.815053105354309, "learning_rate": 3.845835784368324e-06, "loss": 0.7612, "num_input_tokens_seen": 65410256, "step": 112660 }, { "epoch": 16.780607685433424, "grad_norm": 2.2602450847625732, "learning_rate": 3.844104286222492e-06, "loss": 0.7337, "num_input_tokens_seen": 65413232, "step": 112665 }, { "epoch": 16.78135239797438, "grad_norm": 1.7897977828979492, "learning_rate": 3.8423731454840185e-06, "loss": 0.5744, "num_input_tokens_seen": 65416048, "step": 112670 }, { "epoch": 16.78209711051534, "grad_norm": 3.002021551132202, "learning_rate": 3.840642362182143e-06, "loss": 0.7721, "num_input_tokens_seen": 65418704, "step": 112675 }, { "epoch": 16.7828418230563, "grad_norm": 2.306042194366455, "learning_rate": 3.838911936346115e-06, "loss": 0.8989, "num_input_tokens_seen": 65421392, "step": 112680 }, { "epoch": 16.78358653559726, "grad_norm": 3.039292812347412, "learning_rate": 3.837181868005158e-06, "loss": 0.4993, "num_input_tokens_seen": 65424144, "step": 112685 }, { "epoch": 16.78433124813822, "grad_norm": 1.2596232891082764, "learning_rate": 3.83545215718851e-06, "loss": 0.5686, "num_input_tokens_seen": 65427088, "step": 112690 }, { "epoch": 16.785075960679176, "grad_norm": 2.1134791374206543, "learning_rate": 3.83372280392538e-06, "loss": 0.6013, "num_input_tokens_seen": 65429872, "step": 112695 }, { "epoch": 16.785820673220137, "grad_norm": 1.6343400478363037, "learning_rate": 3.831993808244996e-06, "loss": 0.586, "num_input_tokens_seen": 65432912, "step": 112700 }, { "epoch": 16.786565385761097, "grad_norm": 1.273154377937317, "learning_rate": 3.830265170176564e-06, "loss": 0.5999, "num_input_tokens_seen": 65435760, "step": 112705 }, { "epoch": 16.787310098302054, "grad_norm": 1.84005868434906, "learning_rate": 3.828536889749282e-06, "loss": 0.6889, "num_input_tokens_seen": 65438448, "step": 112710 }, { "epoch": 16.788054810843015, "grad_norm": 1.0602736473083496, "learning_rate": 3.826808966992354e-06, "loss": 0.5151, "num_input_tokens_seen": 65441360, "step": 112715 }, { "epoch": 16.788799523383975, "grad_norm": 1.7528767585754395, "learning_rate": 3.825081401934971e-06, "loss": 0.5614, "num_input_tokens_seen": 65444400, "step": 112720 }, { "epoch": 16.789544235924932, "grad_norm": 1.0156984329223633, "learning_rate": 3.823354194606316e-06, "loss": 0.365, "num_input_tokens_seen": 65447504, "step": 112725 }, { "epoch": 16.790288948465893, "grad_norm": 1.3386069536209106, "learning_rate": 3.821627345035561e-06, "loss": 0.5713, "num_input_tokens_seen": 65450544, "step": 112730 }, { "epoch": 16.79103366100685, "grad_norm": 1.8418160676956177, "learning_rate": 3.819900853251896e-06, "loss": 0.7095, "num_input_tokens_seen": 65453456, "step": 112735 }, { "epoch": 16.79177837354781, "grad_norm": 1.2429701089859009, "learning_rate": 3.818174719284473e-06, "loss": 0.4887, "num_input_tokens_seen": 65456400, "step": 112740 }, { "epoch": 16.79252308608877, "grad_norm": 1.0693659782409668, "learning_rate": 3.816448943162465e-06, "loss": 0.6219, "num_input_tokens_seen": 65459280, "step": 112745 }, { "epoch": 16.793267798629728, "grad_norm": 1.826037883758545, "learning_rate": 3.814723524915023e-06, "loss": 0.6083, "num_input_tokens_seen": 65462128, "step": 112750 }, { "epoch": 16.79401251117069, "grad_norm": 1.7585654258728027, "learning_rate": 3.812998464571288e-06, "loss": 0.4998, "num_input_tokens_seen": 65464944, "step": 112755 }, { "epoch": 16.79475722371165, "grad_norm": 1.8389605283737183, "learning_rate": 3.8112737621604168e-06, "loss": 0.4905, "num_input_tokens_seen": 65468112, "step": 112760 }, { "epoch": 16.795501936252606, "grad_norm": 1.4717768430709839, "learning_rate": 3.8095494177115358e-06, "loss": 0.6952, "num_input_tokens_seen": 65470992, "step": 112765 }, { "epoch": 16.796246648793566, "grad_norm": 1.56065833568573, "learning_rate": 3.8078254312537836e-06, "loss": 0.5634, "num_input_tokens_seen": 65473648, "step": 112770 }, { "epoch": 16.796991361334523, "grad_norm": 2.9836158752441406, "learning_rate": 3.806101802816284e-06, "loss": 0.7471, "num_input_tokens_seen": 65476528, "step": 112775 }, { "epoch": 16.797736073875484, "grad_norm": 1.2680810689926147, "learning_rate": 3.804378532428149e-06, "loss": 0.5273, "num_input_tokens_seen": 65479376, "step": 112780 }, { "epoch": 16.798480786416444, "grad_norm": 3.6340410709381104, "learning_rate": 3.8026556201185003e-06, "loss": 0.8018, "num_input_tokens_seen": 65482064, "step": 112785 }, { "epoch": 16.7992254989574, "grad_norm": 1.4210177659988403, "learning_rate": 3.8009330659164425e-06, "loss": 0.5562, "num_input_tokens_seen": 65484848, "step": 112790 }, { "epoch": 16.799970211498362, "grad_norm": 1.0525037050247192, "learning_rate": 3.799210869851072e-06, "loss": 0.6333, "num_input_tokens_seen": 65487728, "step": 112795 }, { "epoch": 16.800714924039323, "grad_norm": 2.54290771484375, "learning_rate": 3.7974890319514816e-06, "loss": 0.5102, "num_input_tokens_seen": 65490768, "step": 112800 }, { "epoch": 16.80145963658028, "grad_norm": 3.4701311588287354, "learning_rate": 3.7957675522467735e-06, "loss": 0.6469, "num_input_tokens_seen": 65493712, "step": 112805 }, { "epoch": 16.80220434912124, "grad_norm": 2.0610103607177734, "learning_rate": 3.7940464307660126e-06, "loss": 0.7456, "num_input_tokens_seen": 65496368, "step": 112810 }, { "epoch": 16.802949061662197, "grad_norm": 1.6609420776367188, "learning_rate": 3.792325667538291e-06, "loss": 0.5972, "num_input_tokens_seen": 65499280, "step": 112815 }, { "epoch": 16.803693774203158, "grad_norm": 1.0329375267028809, "learning_rate": 3.790605262592667e-06, "loss": 0.5611, "num_input_tokens_seen": 65502128, "step": 112820 }, { "epoch": 16.804438486744118, "grad_norm": 1.8768608570098877, "learning_rate": 3.7888852159582154e-06, "loss": 0.5198, "num_input_tokens_seen": 65504848, "step": 112825 }, { "epoch": 16.805183199285075, "grad_norm": 1.8620474338531494, "learning_rate": 3.7871655276639937e-06, "loss": 0.5779, "num_input_tokens_seen": 65507856, "step": 112830 }, { "epoch": 16.805927911826036, "grad_norm": 2.6951098442077637, "learning_rate": 3.7854461977390426e-06, "loss": 0.5413, "num_input_tokens_seen": 65510832, "step": 112835 }, { "epoch": 16.806672624366996, "grad_norm": 1.6381542682647705, "learning_rate": 3.7837272262124216e-06, "loss": 0.6882, "num_input_tokens_seen": 65513712, "step": 112840 }, { "epoch": 16.807417336907953, "grad_norm": 2.5183887481689453, "learning_rate": 3.7820086131131667e-06, "loss": 0.6261, "num_input_tokens_seen": 65516560, "step": 112845 }, { "epoch": 16.808162049448914, "grad_norm": 1.1912732124328613, "learning_rate": 3.780290358470309e-06, "loss": 0.6972, "num_input_tokens_seen": 65519376, "step": 112850 }, { "epoch": 16.80890676198987, "grad_norm": 3.0681493282318115, "learning_rate": 3.7785724623128766e-06, "loss": 0.8749, "num_input_tokens_seen": 65522384, "step": 112855 }, { "epoch": 16.80965147453083, "grad_norm": 1.4306334257125854, "learning_rate": 3.776854924669898e-06, "loss": 0.5594, "num_input_tokens_seen": 65525296, "step": 112860 }, { "epoch": 16.81039618707179, "grad_norm": 2.1295676231384277, "learning_rate": 3.775137745570387e-06, "loss": 0.5533, "num_input_tokens_seen": 65527888, "step": 112865 }, { "epoch": 16.81114089961275, "grad_norm": 2.2405130863189697, "learning_rate": 3.7734209250433445e-06, "loss": 0.6601, "num_input_tokens_seen": 65530736, "step": 112870 }, { "epoch": 16.81188561215371, "grad_norm": 3.080906629562378, "learning_rate": 3.7717044631177867e-06, "loss": 0.7173, "num_input_tokens_seen": 65533840, "step": 112875 }, { "epoch": 16.812630324694666, "grad_norm": 1.9450125694274902, "learning_rate": 3.7699883598227016e-06, "loss": 0.5482, "num_input_tokens_seen": 65536784, "step": 112880 }, { "epoch": 16.813375037235627, "grad_norm": 1.9710617065429688, "learning_rate": 3.768272615187091e-06, "loss": 0.5902, "num_input_tokens_seen": 65539728, "step": 112885 }, { "epoch": 16.814119749776587, "grad_norm": 1.5321617126464844, "learning_rate": 3.766557229239931e-06, "loss": 0.6002, "num_input_tokens_seen": 65542352, "step": 112890 }, { "epoch": 16.814864462317544, "grad_norm": 0.8171889781951904, "learning_rate": 3.7648422020102105e-06, "loss": 0.5739, "num_input_tokens_seen": 65545392, "step": 112895 }, { "epoch": 16.815609174858505, "grad_norm": 1.6502095460891724, "learning_rate": 3.7631275335268946e-06, "loss": 0.5301, "num_input_tokens_seen": 65548208, "step": 112900 }, { "epoch": 16.816353887399465, "grad_norm": 1.3641760349273682, "learning_rate": 3.7614132238189605e-06, "loss": 0.5712, "num_input_tokens_seen": 65550736, "step": 112905 }, { "epoch": 16.817098599940422, "grad_norm": 1.5130443572998047, "learning_rate": 3.7596992729153623e-06, "loss": 0.6152, "num_input_tokens_seen": 65553584, "step": 112910 }, { "epoch": 16.817843312481383, "grad_norm": 0.8697932362556458, "learning_rate": 3.757985680845058e-06, "loss": 0.5562, "num_input_tokens_seen": 65556496, "step": 112915 }, { "epoch": 16.81858802502234, "grad_norm": 3.0971198081970215, "learning_rate": 3.7562724476369963e-06, "loss": 0.7398, "num_input_tokens_seen": 65559344, "step": 112920 }, { "epoch": 16.8193327375633, "grad_norm": 1.929953694343567, "learning_rate": 3.7545595733201126e-06, "loss": 0.6231, "num_input_tokens_seen": 65562288, "step": 112925 }, { "epoch": 16.82007745010426, "grad_norm": 2.5799720287323, "learning_rate": 3.7528470579233614e-06, "loss": 0.7292, "num_input_tokens_seen": 65565136, "step": 112930 }, { "epoch": 16.820822162645218, "grad_norm": 1.1621206998825073, "learning_rate": 3.751134901475656e-06, "loss": 0.4696, "num_input_tokens_seen": 65568624, "step": 112935 }, { "epoch": 16.82156687518618, "grad_norm": 1.2753362655639648, "learning_rate": 3.7494231040059364e-06, "loss": 0.5261, "num_input_tokens_seen": 65571248, "step": 112940 }, { "epoch": 16.82231158772714, "grad_norm": 1.4021235704421997, "learning_rate": 3.7477116655431162e-06, "loss": 0.6024, "num_input_tokens_seen": 65574384, "step": 112945 }, { "epoch": 16.823056300268096, "grad_norm": 2.221266746520996, "learning_rate": 3.7460005861161024e-06, "loss": 0.437, "num_input_tokens_seen": 65577392, "step": 112950 }, { "epoch": 16.823801012809056, "grad_norm": 2.1906118392944336, "learning_rate": 3.7442898657538113e-06, "loss": 0.5903, "num_input_tokens_seen": 65580624, "step": 112955 }, { "epoch": 16.824545725350013, "grad_norm": 1.4501949548721313, "learning_rate": 3.7425795044851357e-06, "loss": 0.4671, "num_input_tokens_seen": 65583504, "step": 112960 }, { "epoch": 16.825290437890974, "grad_norm": 1.124941349029541, "learning_rate": 3.7408695023389785e-06, "loss": 0.6679, "num_input_tokens_seen": 65586256, "step": 112965 }, { "epoch": 16.826035150431935, "grad_norm": 1.2290602922439575, "learning_rate": 3.7391598593442184e-06, "loss": 0.619, "num_input_tokens_seen": 65589168, "step": 112970 }, { "epoch": 16.82677986297289, "grad_norm": 1.3396625518798828, "learning_rate": 3.7374505755297494e-06, "loss": 0.9434, "num_input_tokens_seen": 65592048, "step": 112975 }, { "epoch": 16.827524575513852, "grad_norm": 1.1240077018737793, "learning_rate": 3.735741650924443e-06, "loss": 0.6327, "num_input_tokens_seen": 65594864, "step": 112980 }, { "epoch": 16.828269288054813, "grad_norm": 2.8218979835510254, "learning_rate": 3.7340330855571704e-06, "loss": 0.6726, "num_input_tokens_seen": 65597776, "step": 112985 }, { "epoch": 16.82901400059577, "grad_norm": 2.77301025390625, "learning_rate": 3.7323248794567942e-06, "loss": 0.6547, "num_input_tokens_seen": 65600592, "step": 112990 }, { "epoch": 16.82975871313673, "grad_norm": 0.8189020156860352, "learning_rate": 3.7306170326521674e-06, "loss": 0.5021, "num_input_tokens_seen": 65603216, "step": 112995 }, { "epoch": 16.830503425677687, "grad_norm": 1.699442744255066, "learning_rate": 3.7289095451721546e-06, "loss": 0.5542, "num_input_tokens_seen": 65605968, "step": 113000 }, { "epoch": 16.831248138218648, "grad_norm": 1.2480870485305786, "learning_rate": 3.7272024170455914e-06, "loss": 0.3668, "num_input_tokens_seen": 65608784, "step": 113005 }, { "epoch": 16.831992850759608, "grad_norm": 1.9931129217147827, "learning_rate": 3.7254956483013278e-06, "loss": 0.5001, "num_input_tokens_seen": 65611792, "step": 113010 }, { "epoch": 16.832737563300565, "grad_norm": 1.0186635255813599, "learning_rate": 3.7237892389681866e-06, "loss": 0.562, "num_input_tokens_seen": 65614768, "step": 113015 }, { "epoch": 16.833482275841526, "grad_norm": 2.443505048751831, "learning_rate": 3.7220831890750067e-06, "loss": 0.6339, "num_input_tokens_seen": 65617712, "step": 113020 }, { "epoch": 16.834226988382483, "grad_norm": 1.5184590816497803, "learning_rate": 3.7203774986506067e-06, "loss": 0.7444, "num_input_tokens_seen": 65620720, "step": 113025 }, { "epoch": 16.834971700923443, "grad_norm": 1.9285366535186768, "learning_rate": 3.718672167723797e-06, "loss": 0.4854, "num_input_tokens_seen": 65623568, "step": 113030 }, { "epoch": 16.835716413464404, "grad_norm": 2.3289968967437744, "learning_rate": 3.7169671963233952e-06, "loss": 0.6251, "num_input_tokens_seen": 65626256, "step": 113035 }, { "epoch": 16.83646112600536, "grad_norm": 1.1777514219284058, "learning_rate": 3.7152625844781963e-06, "loss": 0.494, "num_input_tokens_seen": 65629232, "step": 113040 }, { "epoch": 16.83720583854632, "grad_norm": 1.1794289350509644, "learning_rate": 3.7135583322170098e-06, "loss": 0.6385, "num_input_tokens_seen": 65632144, "step": 113045 }, { "epoch": 16.837950551087282, "grad_norm": 1.18585205078125, "learning_rate": 3.7118544395686213e-06, "loss": 0.7661, "num_input_tokens_seen": 65635056, "step": 113050 }, { "epoch": 16.83869526362824, "grad_norm": 1.7025883197784424, "learning_rate": 3.710150906561813e-06, "loss": 0.5926, "num_input_tokens_seen": 65637840, "step": 113055 }, { "epoch": 16.8394399761692, "grad_norm": 2.601501941680908, "learning_rate": 3.708447733225362e-06, "loss": 0.8775, "num_input_tokens_seen": 65641040, "step": 113060 }, { "epoch": 16.840184688710156, "grad_norm": 1.258732557296753, "learning_rate": 3.706744919588054e-06, "loss": 0.5053, "num_input_tokens_seen": 65643760, "step": 113065 }, { "epoch": 16.840929401251117, "grad_norm": 2.0761873722076416, "learning_rate": 3.7050424656786486e-06, "loss": 0.4972, "num_input_tokens_seen": 65646768, "step": 113070 }, { "epoch": 16.841674113792077, "grad_norm": 3.9242074489593506, "learning_rate": 3.7033403715259014e-06, "loss": 0.921, "num_input_tokens_seen": 65649840, "step": 113075 }, { "epoch": 16.842418826333034, "grad_norm": 0.7793087959289551, "learning_rate": 3.7016386371585804e-06, "loss": 0.5542, "num_input_tokens_seen": 65652432, "step": 113080 }, { "epoch": 16.843163538873995, "grad_norm": 1.0702874660491943, "learning_rate": 3.6999372626054212e-06, "loss": 0.7676, "num_input_tokens_seen": 65655536, "step": 113085 }, { "epoch": 16.843908251414955, "grad_norm": 1.19428288936615, "learning_rate": 3.6982362478951786e-06, "loss": 0.6256, "num_input_tokens_seen": 65658384, "step": 113090 }, { "epoch": 16.844652963955912, "grad_norm": 2.3500454425811768, "learning_rate": 3.6965355930565796e-06, "loss": 0.6424, "num_input_tokens_seen": 65661232, "step": 113095 }, { "epoch": 16.845397676496873, "grad_norm": 1.4160959720611572, "learning_rate": 3.6948352981183683e-06, "loss": 0.3792, "num_input_tokens_seen": 65663920, "step": 113100 }, { "epoch": 16.84614238903783, "grad_norm": 1.1298049688339233, "learning_rate": 3.693135363109257e-06, "loss": 0.4299, "num_input_tokens_seen": 65666704, "step": 113105 }, { "epoch": 16.84688710157879, "grad_norm": 4.317020416259766, "learning_rate": 3.6914357880579647e-06, "loss": 0.601, "num_input_tokens_seen": 65669808, "step": 113110 }, { "epoch": 16.84763181411975, "grad_norm": 1.1230579614639282, "learning_rate": 3.689736572993213e-06, "loss": 0.7195, "num_input_tokens_seen": 65672752, "step": 113115 }, { "epoch": 16.848376526660708, "grad_norm": 1.6753896474838257, "learning_rate": 3.6880377179437014e-06, "loss": 0.5056, "num_input_tokens_seen": 65675280, "step": 113120 }, { "epoch": 16.84912123920167, "grad_norm": 1.7170084714889526, "learning_rate": 3.6863392229381342e-06, "loss": 0.6199, "num_input_tokens_seen": 65678256, "step": 113125 }, { "epoch": 16.84986595174263, "grad_norm": 0.9441230893135071, "learning_rate": 3.684641088005197e-06, "loss": 0.5075, "num_input_tokens_seen": 65681200, "step": 113130 }, { "epoch": 16.850610664283586, "grad_norm": 1.5716235637664795, "learning_rate": 3.6829433131735895e-06, "loss": 0.4718, "num_input_tokens_seen": 65684208, "step": 113135 }, { "epoch": 16.851355376824547, "grad_norm": 2.0889225006103516, "learning_rate": 3.6812458984719857e-06, "loss": 0.6352, "num_input_tokens_seen": 65687344, "step": 113140 }, { "epoch": 16.852100089365504, "grad_norm": 0.9500895738601685, "learning_rate": 3.6795488439290706e-06, "loss": 0.5238, "num_input_tokens_seen": 65690480, "step": 113145 }, { "epoch": 16.852844801906464, "grad_norm": 1.0062068700790405, "learning_rate": 3.6778521495735073e-06, "loss": 0.4644, "num_input_tokens_seen": 65693232, "step": 113150 }, { "epoch": 16.853589514447425, "grad_norm": 1.780674934387207, "learning_rate": 3.6761558154339544e-06, "loss": 0.4681, "num_input_tokens_seen": 65696240, "step": 113155 }, { "epoch": 16.85433422698838, "grad_norm": 1.9633954763412476, "learning_rate": 3.6744598415390823e-06, "loss": 0.6311, "num_input_tokens_seen": 65699344, "step": 113160 }, { "epoch": 16.855078939529342, "grad_norm": 1.6809862852096558, "learning_rate": 3.672764227917533e-06, "loss": 0.545, "num_input_tokens_seen": 65702640, "step": 113165 }, { "epoch": 16.8558236520703, "grad_norm": 0.9983080625534058, "learning_rate": 3.6710689745979606e-06, "loss": 0.5709, "num_input_tokens_seen": 65705712, "step": 113170 }, { "epoch": 16.85656836461126, "grad_norm": 2.510655164718628, "learning_rate": 3.6693740816090005e-06, "loss": 0.7542, "num_input_tokens_seen": 65708464, "step": 113175 }, { "epoch": 16.85731307715222, "grad_norm": 1.1756325960159302, "learning_rate": 3.6676795489792857e-06, "loss": 0.6648, "num_input_tokens_seen": 65711504, "step": 113180 }, { "epoch": 16.858057789693177, "grad_norm": 1.398719310760498, "learning_rate": 3.665985376737438e-06, "loss": 0.5913, "num_input_tokens_seen": 65714320, "step": 113185 }, { "epoch": 16.858802502234138, "grad_norm": 1.3436598777770996, "learning_rate": 3.6642915649120894e-06, "loss": 0.53, "num_input_tokens_seen": 65717136, "step": 113190 }, { "epoch": 16.859547214775098, "grad_norm": 1.7366381883621216, "learning_rate": 3.662598113531851e-06, "loss": 0.6125, "num_input_tokens_seen": 65719920, "step": 113195 }, { "epoch": 16.860291927316055, "grad_norm": 1.145198106765747, "learning_rate": 3.6609050226253273e-06, "loss": 0.6445, "num_input_tokens_seen": 65722640, "step": 113200 }, { "epoch": 16.861036639857016, "grad_norm": 0.9556400179862976, "learning_rate": 3.6592122922211287e-06, "loss": 0.5091, "num_input_tokens_seen": 65725488, "step": 113205 }, { "epoch": 16.861781352397973, "grad_norm": 1.409790277481079, "learning_rate": 3.6575199223478436e-06, "loss": 0.8095, "num_input_tokens_seen": 65728304, "step": 113210 }, { "epoch": 16.862526064938933, "grad_norm": 1.520954966545105, "learning_rate": 3.655827913034071e-06, "loss": 0.8096, "num_input_tokens_seen": 65731088, "step": 113215 }, { "epoch": 16.863270777479894, "grad_norm": 1.4640038013458252, "learning_rate": 3.6541362643083887e-06, "loss": 0.5187, "num_input_tokens_seen": 65734000, "step": 113220 }, { "epoch": 16.86401549002085, "grad_norm": 2.565261125564575, "learning_rate": 3.6524449761993874e-06, "loss": 0.498, "num_input_tokens_seen": 65736784, "step": 113225 }, { "epoch": 16.86476020256181, "grad_norm": 3.156615972518921, "learning_rate": 3.6507540487356274e-06, "loss": 0.8207, "num_input_tokens_seen": 65739568, "step": 113230 }, { "epoch": 16.865504915102772, "grad_norm": 3.4530818462371826, "learning_rate": 3.6490634819456775e-06, "loss": 0.572, "num_input_tokens_seen": 65742448, "step": 113235 }, { "epoch": 16.86624962764373, "grad_norm": 1.140429139137268, "learning_rate": 3.6473732758581013e-06, "loss": 0.3738, "num_input_tokens_seen": 65745232, "step": 113240 }, { "epoch": 16.86699434018469, "grad_norm": 1.6801365613937378, "learning_rate": 3.6456834305014565e-06, "loss": 0.5969, "num_input_tokens_seen": 65748080, "step": 113245 }, { "epoch": 16.867739052725646, "grad_norm": 3.241105794906616, "learning_rate": 3.6439939459042836e-06, "loss": 0.7626, "num_input_tokens_seen": 65751280, "step": 113250 }, { "epoch": 16.868483765266607, "grad_norm": 2.3962478637695312, "learning_rate": 3.6423048220951216e-06, "loss": 0.5899, "num_input_tokens_seen": 65754608, "step": 113255 }, { "epoch": 16.869228477807567, "grad_norm": 2.1835503578186035, "learning_rate": 3.6406160591025196e-06, "loss": 0.5591, "num_input_tokens_seen": 65757616, "step": 113260 }, { "epoch": 16.869973190348524, "grad_norm": 1.0789196491241455, "learning_rate": 3.638927656954996e-06, "loss": 0.4406, "num_input_tokens_seen": 65760912, "step": 113265 }, { "epoch": 16.870717902889485, "grad_norm": 1.71230947971344, "learning_rate": 3.6372396156810817e-06, "loss": 0.5077, "num_input_tokens_seen": 65764048, "step": 113270 }, { "epoch": 16.871462615430445, "grad_norm": 1.3259779214859009, "learning_rate": 3.6355519353092947e-06, "loss": 0.6711, "num_input_tokens_seen": 65766832, "step": 113275 }, { "epoch": 16.872207327971402, "grad_norm": 1.5150227546691895, "learning_rate": 3.6338646158681377e-06, "loss": 0.699, "num_input_tokens_seen": 65769744, "step": 113280 }, { "epoch": 16.872952040512363, "grad_norm": 1.1500478982925415, "learning_rate": 3.6321776573861266e-06, "loss": 0.5745, "num_input_tokens_seen": 65772496, "step": 113285 }, { "epoch": 16.87369675305332, "grad_norm": 1.9750458002090454, "learning_rate": 3.6304910598917526e-06, "loss": 0.5106, "num_input_tokens_seen": 65775536, "step": 113290 }, { "epoch": 16.87444146559428, "grad_norm": 3.0179498195648193, "learning_rate": 3.6288048234135203e-06, "loss": 0.6815, "num_input_tokens_seen": 65778512, "step": 113295 }, { "epoch": 16.87518617813524, "grad_norm": 2.0752298831939697, "learning_rate": 3.6271189479799017e-06, "loss": 0.6113, "num_input_tokens_seen": 65781360, "step": 113300 }, { "epoch": 16.875930890676198, "grad_norm": 1.2238001823425293, "learning_rate": 3.625433433619391e-06, "loss": 0.5449, "num_input_tokens_seen": 65784464, "step": 113305 }, { "epoch": 16.87667560321716, "grad_norm": 2.0939269065856934, "learning_rate": 3.623748280360459e-06, "loss": 0.8115, "num_input_tokens_seen": 65787408, "step": 113310 }, { "epoch": 16.87742031575812, "grad_norm": 1.2688783407211304, "learning_rate": 3.622063488231575e-06, "loss": 0.4714, "num_input_tokens_seen": 65790096, "step": 113315 }, { "epoch": 16.878165028299076, "grad_norm": 2.7245795726776123, "learning_rate": 3.6203790572612002e-06, "loss": 0.5243, "num_input_tokens_seen": 65792912, "step": 113320 }, { "epoch": 16.878909740840037, "grad_norm": 1.2892465591430664, "learning_rate": 3.6186949874777858e-06, "loss": 0.5578, "num_input_tokens_seen": 65795696, "step": 113325 }, { "epoch": 16.879654453380994, "grad_norm": 2.0666401386260986, "learning_rate": 3.617011278909796e-06, "loss": 0.7038, "num_input_tokens_seen": 65798608, "step": 113330 }, { "epoch": 16.880399165921954, "grad_norm": 0.9133644700050354, "learning_rate": 3.6153279315856603e-06, "loss": 0.4635, "num_input_tokens_seen": 65801488, "step": 113335 }, { "epoch": 16.881143878462915, "grad_norm": 1.029942512512207, "learning_rate": 3.613644945533831e-06, "loss": 0.6252, "num_input_tokens_seen": 65804176, "step": 113340 }, { "epoch": 16.88188859100387, "grad_norm": 1.6608506441116333, "learning_rate": 3.61196232078273e-06, "loss": 0.5983, "num_input_tokens_seen": 65806896, "step": 113345 }, { "epoch": 16.882633303544832, "grad_norm": 1.5533466339111328, "learning_rate": 3.610280057360793e-06, "loss": 0.4185, "num_input_tokens_seen": 65809840, "step": 113350 }, { "epoch": 16.883378016085793, "grad_norm": 2.0654478073120117, "learning_rate": 3.608598155296433e-06, "loss": 0.5276, "num_input_tokens_seen": 65812720, "step": 113355 }, { "epoch": 16.88412272862675, "grad_norm": 2.2102787494659424, "learning_rate": 3.606916614618061e-06, "loss": 0.5033, "num_input_tokens_seen": 65815792, "step": 113360 }, { "epoch": 16.88486744116771, "grad_norm": 3.3257951736450195, "learning_rate": 3.605235435354096e-06, "loss": 0.7558, "num_input_tokens_seen": 65818704, "step": 113365 }, { "epoch": 16.885612153708667, "grad_norm": 1.2506260871887207, "learning_rate": 3.6035546175329283e-06, "loss": 0.3908, "num_input_tokens_seen": 65821680, "step": 113370 }, { "epoch": 16.886356866249628, "grad_norm": 2.3336944580078125, "learning_rate": 3.6018741611829637e-06, "loss": 0.4591, "num_input_tokens_seen": 65824368, "step": 113375 }, { "epoch": 16.88710157879059, "grad_norm": 1.0604716539382935, "learning_rate": 3.6001940663325876e-06, "loss": 0.5038, "num_input_tokens_seen": 65827472, "step": 113380 }, { "epoch": 16.887846291331545, "grad_norm": 1.071851134300232, "learning_rate": 3.598514333010183e-06, "loss": 0.6659, "num_input_tokens_seen": 65830544, "step": 113385 }, { "epoch": 16.888591003872506, "grad_norm": 1.2611793279647827, "learning_rate": 3.5968349612441277e-06, "loss": 0.4196, "num_input_tokens_seen": 65833680, "step": 113390 }, { "epoch": 16.889335716413463, "grad_norm": 1.0378352403640747, "learning_rate": 3.595155951062787e-06, "loss": 0.4935, "num_input_tokens_seen": 65836528, "step": 113395 }, { "epoch": 16.890080428954423, "grad_norm": 1.4645757675170898, "learning_rate": 3.5934773024945355e-06, "loss": 0.5083, "num_input_tokens_seen": 65839600, "step": 113400 }, { "epoch": 16.890825141495384, "grad_norm": 2.392566680908203, "learning_rate": 3.591799015567726e-06, "loss": 0.522, "num_input_tokens_seen": 65842672, "step": 113405 }, { "epoch": 16.89156985403634, "grad_norm": 1.139037847518921, "learning_rate": 3.5901210903107165e-06, "loss": 0.6334, "num_input_tokens_seen": 65845488, "step": 113410 }, { "epoch": 16.8923145665773, "grad_norm": 3.441343069076538, "learning_rate": 3.5884435267518476e-06, "loss": 0.7054, "num_input_tokens_seen": 65848304, "step": 113415 }, { "epoch": 16.893059279118262, "grad_norm": 1.6040030717849731, "learning_rate": 3.586766324919466e-06, "loss": 0.7345, "num_input_tokens_seen": 65851248, "step": 113420 }, { "epoch": 16.89380399165922, "grad_norm": 1.5991860628128052, "learning_rate": 3.5850894848418997e-06, "loss": 0.6937, "num_input_tokens_seen": 65854096, "step": 113425 }, { "epoch": 16.89454870420018, "grad_norm": 2.176018238067627, "learning_rate": 3.5834130065474897e-06, "loss": 0.5008, "num_input_tokens_seen": 65856816, "step": 113430 }, { "epoch": 16.895293416741136, "grad_norm": 1.97401762008667, "learning_rate": 3.5817368900645466e-06, "loss": 0.6311, "num_input_tokens_seen": 65859536, "step": 113435 }, { "epoch": 16.896038129282097, "grad_norm": 1.389379620552063, "learning_rate": 3.5800611354213866e-06, "loss": 0.6198, "num_input_tokens_seen": 65862256, "step": 113440 }, { "epoch": 16.896782841823057, "grad_norm": 1.3466601371765137, "learning_rate": 3.5783857426463286e-06, "loss": 0.5495, "num_input_tokens_seen": 65865072, "step": 113445 }, { "epoch": 16.897527554364014, "grad_norm": 1.6930205821990967, "learning_rate": 3.5767107117676698e-06, "loss": 0.7344, "num_input_tokens_seen": 65867792, "step": 113450 }, { "epoch": 16.898272266904975, "grad_norm": 1.0961374044418335, "learning_rate": 3.575036042813712e-06, "loss": 0.5735, "num_input_tokens_seen": 65870608, "step": 113455 }, { "epoch": 16.899016979445936, "grad_norm": 1.7358226776123047, "learning_rate": 3.5733617358127384e-06, "loss": 0.4947, "num_input_tokens_seen": 65873584, "step": 113460 }, { "epoch": 16.899761691986892, "grad_norm": 3.8037455081939697, "learning_rate": 3.5716877907930465e-06, "loss": 0.701, "num_input_tokens_seen": 65876784, "step": 113465 }, { "epoch": 16.900506404527853, "grad_norm": 2.222385883331299, "learning_rate": 3.5700142077829122e-06, "loss": 0.4678, "num_input_tokens_seen": 65879664, "step": 113470 }, { "epoch": 16.90125111706881, "grad_norm": 1.36923086643219, "learning_rate": 3.5683409868106004e-06, "loss": 0.5515, "num_input_tokens_seen": 65882512, "step": 113475 }, { "epoch": 16.90199582960977, "grad_norm": 2.1130425930023193, "learning_rate": 3.566668127904391e-06, "loss": 0.6766, "num_input_tokens_seen": 65885520, "step": 113480 }, { "epoch": 16.90274054215073, "grad_norm": 1.1796883344650269, "learning_rate": 3.5649956310925355e-06, "loss": 0.6384, "num_input_tokens_seen": 65888144, "step": 113485 }, { "epoch": 16.903485254691688, "grad_norm": 1.2575037479400635, "learning_rate": 3.563323496403298e-06, "loss": 0.6541, "num_input_tokens_seen": 65890896, "step": 113490 }, { "epoch": 16.90422996723265, "grad_norm": 1.9113714694976807, "learning_rate": 3.5616517238649172e-06, "loss": 0.6037, "num_input_tokens_seen": 65894000, "step": 113495 }, { "epoch": 16.90497467977361, "grad_norm": 2.2306673526763916, "learning_rate": 3.5599803135056455e-06, "loss": 0.817, "num_input_tokens_seen": 65896816, "step": 113500 }, { "epoch": 16.905719392314566, "grad_norm": 1.0024068355560303, "learning_rate": 3.558309265353718e-06, "loss": 0.6914, "num_input_tokens_seen": 65899568, "step": 113505 }, { "epoch": 16.906464104855527, "grad_norm": 1.9988112449645996, "learning_rate": 3.556638579437363e-06, "loss": 0.5121, "num_input_tokens_seen": 65902384, "step": 113510 }, { "epoch": 16.907208817396484, "grad_norm": 4.397140026092529, "learning_rate": 3.554968255784799e-06, "loss": 0.6319, "num_input_tokens_seen": 65905264, "step": 113515 }, { "epoch": 16.907953529937444, "grad_norm": 1.1969599723815918, "learning_rate": 3.5532982944242567e-06, "loss": 0.4402, "num_input_tokens_seen": 65908144, "step": 113520 }, { "epoch": 16.908698242478405, "grad_norm": 1.5583454370498657, "learning_rate": 3.5516286953839406e-06, "loss": 0.7261, "num_input_tokens_seen": 65911056, "step": 113525 }, { "epoch": 16.90944295501936, "grad_norm": 1.1489768028259277, "learning_rate": 3.549959458692054e-06, "loss": 0.3851, "num_input_tokens_seen": 65913936, "step": 113530 }, { "epoch": 16.910187667560322, "grad_norm": 1.5248340368270874, "learning_rate": 3.5482905843768065e-06, "loss": 0.7798, "num_input_tokens_seen": 65917424, "step": 113535 }, { "epoch": 16.91093238010128, "grad_norm": 1.4721906185150146, "learning_rate": 3.5466220724663824e-06, "loss": 0.6949, "num_input_tokens_seen": 65920272, "step": 113540 }, { "epoch": 16.91167709264224, "grad_norm": 1.9703562259674072, "learning_rate": 3.5449539229889807e-06, "loss": 0.5412, "num_input_tokens_seen": 65923248, "step": 113545 }, { "epoch": 16.9124218051832, "grad_norm": 1.1830614805221558, "learning_rate": 3.543286135972773e-06, "loss": 0.5848, "num_input_tokens_seen": 65926000, "step": 113550 }, { "epoch": 16.913166517724157, "grad_norm": 1.3853769302368164, "learning_rate": 3.541618711445935e-06, "loss": 0.4634, "num_input_tokens_seen": 65929744, "step": 113555 }, { "epoch": 16.913911230265118, "grad_norm": 1.3634099960327148, "learning_rate": 3.5399516494366457e-06, "loss": 0.7277, "num_input_tokens_seen": 65932432, "step": 113560 }, { "epoch": 16.91465594280608, "grad_norm": 1.4554836750030518, "learning_rate": 3.5382849499730558e-06, "loss": 0.5852, "num_input_tokens_seen": 65935152, "step": 113565 }, { "epoch": 16.915400655347035, "grad_norm": 1.565551996231079, "learning_rate": 3.536618613083337e-06, "loss": 0.5657, "num_input_tokens_seen": 65937776, "step": 113570 }, { "epoch": 16.916145367887996, "grad_norm": 0.7612650394439697, "learning_rate": 3.53495263879563e-06, "loss": 0.4848, "num_input_tokens_seen": 65940336, "step": 113575 }, { "epoch": 16.916890080428953, "grad_norm": 1.5372679233551025, "learning_rate": 3.5332870271380797e-06, "loss": 0.4953, "num_input_tokens_seen": 65943184, "step": 113580 }, { "epoch": 16.917634792969913, "grad_norm": 1.1686455011367798, "learning_rate": 3.5316217781388243e-06, "loss": 0.4322, "num_input_tokens_seen": 65946032, "step": 113585 }, { "epoch": 16.918379505510874, "grad_norm": 1.1051489114761353, "learning_rate": 3.5299568918260054e-06, "loss": 0.5826, "num_input_tokens_seen": 65948816, "step": 113590 }, { "epoch": 16.91912421805183, "grad_norm": 2.205073595046997, "learning_rate": 3.528292368227745e-06, "loss": 0.625, "num_input_tokens_seen": 65951792, "step": 113595 }, { "epoch": 16.91986893059279, "grad_norm": 2.393935203552246, "learning_rate": 3.526628207372154e-06, "loss": 0.5913, "num_input_tokens_seen": 65954864, "step": 113600 }, { "epoch": 16.920613643133752, "grad_norm": 2.891939640045166, "learning_rate": 3.5249644092873624e-06, "loss": 0.5098, "num_input_tokens_seen": 65957552, "step": 113605 }, { "epoch": 16.92135835567471, "grad_norm": 1.955958366394043, "learning_rate": 3.5233009740014617e-06, "loss": 0.7084, "num_input_tokens_seen": 65960240, "step": 113610 }, { "epoch": 16.92210306821567, "grad_norm": 3.859286069869995, "learning_rate": 3.521637901542571e-06, "loss": 0.5644, "num_input_tokens_seen": 65963088, "step": 113615 }, { "epoch": 16.922847780756626, "grad_norm": 1.965624451637268, "learning_rate": 3.5199751919387735e-06, "loss": 0.5993, "num_input_tokens_seen": 65966000, "step": 113620 }, { "epoch": 16.923592493297587, "grad_norm": 1.147481083869934, "learning_rate": 3.518312845218169e-06, "loss": 0.5855, "num_input_tokens_seen": 65969072, "step": 113625 }, { "epoch": 16.924337205838548, "grad_norm": 1.5356751680374146, "learning_rate": 3.516650861408835e-06, "loss": 0.6166, "num_input_tokens_seen": 65972080, "step": 113630 }, { "epoch": 16.925081918379504, "grad_norm": 1.248311161994934, "learning_rate": 3.514989240538846e-06, "loss": 0.5216, "num_input_tokens_seen": 65974928, "step": 113635 }, { "epoch": 16.925826630920465, "grad_norm": 2.5782268047332764, "learning_rate": 3.513327982636283e-06, "loss": 0.5316, "num_input_tokens_seen": 65977808, "step": 113640 }, { "epoch": 16.926571343461426, "grad_norm": 1.1016453504562378, "learning_rate": 3.5116670877292034e-06, "loss": 0.5467, "num_input_tokens_seen": 65980528, "step": 113645 }, { "epoch": 16.927316056002383, "grad_norm": 1.6773681640625, "learning_rate": 3.5100065558456714e-06, "loss": 0.5382, "num_input_tokens_seen": 65983056, "step": 113650 }, { "epoch": 16.928060768543343, "grad_norm": 0.9307501316070557, "learning_rate": 3.5083463870137306e-06, "loss": 0.4828, "num_input_tokens_seen": 65986160, "step": 113655 }, { "epoch": 16.9288054810843, "grad_norm": 1.149012565612793, "learning_rate": 3.50668658126144e-06, "loss": 0.6248, "num_input_tokens_seen": 65989488, "step": 113660 }, { "epoch": 16.92955019362526, "grad_norm": 2.210451602935791, "learning_rate": 3.5050271386168287e-06, "loss": 0.6622, "num_input_tokens_seen": 65992496, "step": 113665 }, { "epoch": 16.93029490616622, "grad_norm": 1.7356594800949097, "learning_rate": 3.503368059107942e-06, "loss": 0.5265, "num_input_tokens_seen": 65995664, "step": 113670 }, { "epoch": 16.931039618707178, "grad_norm": 1.2759321928024292, "learning_rate": 3.5017093427628045e-06, "loss": 0.5915, "num_input_tokens_seen": 65998352, "step": 113675 }, { "epoch": 16.93178433124814, "grad_norm": 1.4664655923843384, "learning_rate": 3.5000509896094323e-06, "loss": 0.5979, "num_input_tokens_seen": 66001168, "step": 113680 }, { "epoch": 16.932529043789096, "grad_norm": 1.0443023443222046, "learning_rate": 3.4983929996758535e-06, "loss": 0.5345, "num_input_tokens_seen": 66004016, "step": 113685 }, { "epoch": 16.933273756330056, "grad_norm": 1.524578332901001, "learning_rate": 3.496735372990065e-06, "loss": 0.5411, "num_input_tokens_seen": 66006704, "step": 113690 }, { "epoch": 16.934018468871017, "grad_norm": 2.0648491382598877, "learning_rate": 3.4950781095800828e-06, "loss": 0.6756, "num_input_tokens_seen": 66009968, "step": 113695 }, { "epoch": 16.934763181411974, "grad_norm": 1.1126368045806885, "learning_rate": 3.493421209473896e-06, "loss": 0.5182, "num_input_tokens_seen": 66013008, "step": 113700 }, { "epoch": 16.935507893952934, "grad_norm": 1.2971280813217163, "learning_rate": 3.4917646726995018e-06, "loss": 0.717, "num_input_tokens_seen": 66015888, "step": 113705 }, { "epoch": 16.936252606493895, "grad_norm": 1.5187327861785889, "learning_rate": 3.490108499284886e-06, "loss": 0.6446, "num_input_tokens_seen": 66018704, "step": 113710 }, { "epoch": 16.93699731903485, "grad_norm": 2.544736623764038, "learning_rate": 3.488452689258026e-06, "loss": 0.7333, "num_input_tokens_seen": 66021712, "step": 113715 }, { "epoch": 16.937742031575812, "grad_norm": 1.516248106956482, "learning_rate": 3.4867972426468915e-06, "loss": 0.5455, "num_input_tokens_seen": 66024272, "step": 113720 }, { "epoch": 16.93848674411677, "grad_norm": 1.5925990343093872, "learning_rate": 3.4851421594794486e-06, "loss": 0.6639, "num_input_tokens_seen": 66027056, "step": 113725 }, { "epoch": 16.93923145665773, "grad_norm": 1.810408115386963, "learning_rate": 3.483487439783667e-06, "loss": 0.5699, "num_input_tokens_seen": 66029680, "step": 113730 }, { "epoch": 16.93997616919869, "grad_norm": 1.0930707454681396, "learning_rate": 3.4818330835874937e-06, "loss": 0.5101, "num_input_tokens_seen": 66032624, "step": 113735 }, { "epoch": 16.940720881739647, "grad_norm": 1.340315580368042, "learning_rate": 3.4801790909188837e-06, "loss": 0.6355, "num_input_tokens_seen": 66035600, "step": 113740 }, { "epoch": 16.941465594280608, "grad_norm": 1.06544828414917, "learning_rate": 3.4785254618057707e-06, "loss": 0.5576, "num_input_tokens_seen": 66038544, "step": 113745 }, { "epoch": 16.94221030682157, "grad_norm": 2.1096580028533936, "learning_rate": 3.4768721962761015e-06, "loss": 0.6233, "num_input_tokens_seen": 66041456, "step": 113750 }, { "epoch": 16.942955019362525, "grad_norm": 1.9678555727005005, "learning_rate": 3.4752192943578038e-06, "loss": 0.631, "num_input_tokens_seen": 66044432, "step": 113755 }, { "epoch": 16.943699731903486, "grad_norm": 1.872193455696106, "learning_rate": 3.4735667560787916e-06, "loss": 0.6193, "num_input_tokens_seen": 66047440, "step": 113760 }, { "epoch": 16.944444444444443, "grad_norm": 1.7384918928146362, "learning_rate": 3.471914581466998e-06, "loss": 0.6718, "num_input_tokens_seen": 66050000, "step": 113765 }, { "epoch": 16.945189156985403, "grad_norm": 1.9706430435180664, "learning_rate": 3.4702627705503197e-06, "loss": 0.5206, "num_input_tokens_seen": 66052944, "step": 113770 }, { "epoch": 16.945933869526364, "grad_norm": 1.1465226411819458, "learning_rate": 3.468611323356677e-06, "loss": 0.4472, "num_input_tokens_seen": 66056048, "step": 113775 }, { "epoch": 16.94667858206732, "grad_norm": 2.0477826595306396, "learning_rate": 3.4669602399139607e-06, "loss": 0.6129, "num_input_tokens_seen": 66058992, "step": 113780 }, { "epoch": 16.94742329460828, "grad_norm": 1.898594617843628, "learning_rate": 3.4653095202500677e-06, "loss": 0.5131, "num_input_tokens_seen": 66061872, "step": 113785 }, { "epoch": 16.948168007149242, "grad_norm": 1.8461120128631592, "learning_rate": 3.4636591643928823e-06, "loss": 0.7347, "num_input_tokens_seen": 66065040, "step": 113790 }, { "epoch": 16.9489127196902, "grad_norm": 1.42673659324646, "learning_rate": 3.462009172370284e-06, "loss": 0.7954, "num_input_tokens_seen": 66067856, "step": 113795 }, { "epoch": 16.94965743223116, "grad_norm": 1.6687283515930176, "learning_rate": 3.4603595442101537e-06, "loss": 0.6548, "num_input_tokens_seen": 66070512, "step": 113800 }, { "epoch": 16.950402144772116, "grad_norm": 1.334857702255249, "learning_rate": 3.45871027994035e-06, "loss": 0.5943, "num_input_tokens_seen": 66073424, "step": 113805 }, { "epoch": 16.951146857313077, "grad_norm": 0.6603553295135498, "learning_rate": 3.45706137958875e-06, "loss": 0.516, "num_input_tokens_seen": 66076528, "step": 113810 }, { "epoch": 16.951891569854038, "grad_norm": 2.3349449634552, "learning_rate": 3.4554128431831976e-06, "loss": 0.5488, "num_input_tokens_seen": 66079760, "step": 113815 }, { "epoch": 16.952636282394995, "grad_norm": 1.6599501371383667, "learning_rate": 3.4537646707515527e-06, "loss": 0.5996, "num_input_tokens_seen": 66082896, "step": 113820 }, { "epoch": 16.953380994935955, "grad_norm": 3.21754789352417, "learning_rate": 3.45211686232165e-06, "loss": 0.7184, "num_input_tokens_seen": 66086000, "step": 113825 }, { "epoch": 16.954125707476916, "grad_norm": 2.4753000736236572, "learning_rate": 3.45046941792134e-06, "loss": 0.59, "num_input_tokens_seen": 66088976, "step": 113830 }, { "epoch": 16.954870420017873, "grad_norm": 1.2387725114822388, "learning_rate": 3.4488223375784447e-06, "loss": 0.7573, "num_input_tokens_seen": 66091856, "step": 113835 }, { "epoch": 16.955615132558833, "grad_norm": 1.4759693145751953, "learning_rate": 3.447175621320792e-06, "loss": 0.4637, "num_input_tokens_seen": 66094704, "step": 113840 }, { "epoch": 16.95635984509979, "grad_norm": 1.8722447156906128, "learning_rate": 3.445529269176198e-06, "loss": 0.4353, "num_input_tokens_seen": 66097840, "step": 113845 }, { "epoch": 16.95710455764075, "grad_norm": 1.6224058866500854, "learning_rate": 3.443883281172486e-06, "loss": 0.5215, "num_input_tokens_seen": 66100560, "step": 113850 }, { "epoch": 16.95784927018171, "grad_norm": 2.0049362182617188, "learning_rate": 3.442237657337455e-06, "loss": 0.5395, "num_input_tokens_seen": 66103568, "step": 113855 }, { "epoch": 16.958593982722668, "grad_norm": 1.9185936450958252, "learning_rate": 3.440592397698905e-06, "loss": 0.507, "num_input_tokens_seen": 66106672, "step": 113860 }, { "epoch": 16.95933869526363, "grad_norm": 2.0165908336639404, "learning_rate": 3.4389475022846395e-06, "loss": 0.5024, "num_input_tokens_seen": 66109904, "step": 113865 }, { "epoch": 16.96008340780459, "grad_norm": 2.1225507259368896, "learning_rate": 3.4373029711224356e-06, "loss": 0.628, "num_input_tokens_seen": 66112656, "step": 113870 }, { "epoch": 16.960828120345546, "grad_norm": 0.8626012206077576, "learning_rate": 3.435658804240088e-06, "loss": 0.5843, "num_input_tokens_seen": 66115504, "step": 113875 }, { "epoch": 16.961572832886507, "grad_norm": 0.923557698726654, "learning_rate": 3.4340150016653695e-06, "loss": 0.6025, "num_input_tokens_seen": 66118608, "step": 113880 }, { "epoch": 16.962317545427464, "grad_norm": 1.74355149269104, "learning_rate": 3.432371563426043e-06, "loss": 0.5926, "num_input_tokens_seen": 66121648, "step": 113885 }, { "epoch": 16.963062257968424, "grad_norm": 1.6586596965789795, "learning_rate": 3.4307284895498836e-06, "loss": 0.6272, "num_input_tokens_seen": 66124432, "step": 113890 }, { "epoch": 16.963806970509385, "grad_norm": 1.8467828035354614, "learning_rate": 3.429085780064639e-06, "loss": 0.6234, "num_input_tokens_seen": 66127184, "step": 113895 }, { "epoch": 16.964551683050342, "grad_norm": 2.4908924102783203, "learning_rate": 3.427443434998073e-06, "loss": 0.6925, "num_input_tokens_seen": 66130064, "step": 113900 }, { "epoch": 16.965296395591302, "grad_norm": 1.7723188400268555, "learning_rate": 3.4258014543779222e-06, "loss": 0.5507, "num_input_tokens_seen": 66132848, "step": 113905 }, { "epoch": 16.96604110813226, "grad_norm": 4.403363227844238, "learning_rate": 3.4241598382319303e-06, "loss": 0.6359, "num_input_tokens_seen": 66135856, "step": 113910 }, { "epoch": 16.96678582067322, "grad_norm": 1.094470500946045, "learning_rate": 3.422518586587831e-06, "loss": 0.5715, "num_input_tokens_seen": 66138608, "step": 113915 }, { "epoch": 16.96753053321418, "grad_norm": 1.3886481523513794, "learning_rate": 3.4208776994733405e-06, "loss": 0.5899, "num_input_tokens_seen": 66141232, "step": 113920 }, { "epoch": 16.968275245755137, "grad_norm": 2.823819875717163, "learning_rate": 3.4192371769161987e-06, "loss": 0.7419, "num_input_tokens_seen": 66144208, "step": 113925 }, { "epoch": 16.969019958296098, "grad_norm": 1.428063154220581, "learning_rate": 3.417597018944102e-06, "loss": 0.5454, "num_input_tokens_seen": 66146896, "step": 113930 }, { "epoch": 16.96976467083706, "grad_norm": 2.0828044414520264, "learning_rate": 3.415957225584776e-06, "loss": 0.7306, "num_input_tokens_seen": 66150000, "step": 113935 }, { "epoch": 16.970509383378015, "grad_norm": 0.8612834811210632, "learning_rate": 3.4143177968659098e-06, "loss": 0.5521, "num_input_tokens_seen": 66152880, "step": 113940 }, { "epoch": 16.971254095918976, "grad_norm": 2.056919574737549, "learning_rate": 3.412678732815211e-06, "loss": 0.5274, "num_input_tokens_seen": 66155504, "step": 113945 }, { "epoch": 16.971998808459933, "grad_norm": 1.4668900966644287, "learning_rate": 3.411040033460361e-06, "loss": 0.587, "num_input_tokens_seen": 66158896, "step": 113950 }, { "epoch": 16.972743521000893, "grad_norm": 1.3286432027816772, "learning_rate": 3.4094016988290512e-06, "loss": 0.5493, "num_input_tokens_seen": 66161968, "step": 113955 }, { "epoch": 16.973488233541854, "grad_norm": 1.7195321321487427, "learning_rate": 3.407763728948954e-06, "loss": 0.9883, "num_input_tokens_seen": 66164592, "step": 113960 }, { "epoch": 16.97423294608281, "grad_norm": 0.9233013987541199, "learning_rate": 3.4061261238477414e-06, "loss": 0.5909, "num_input_tokens_seen": 66167152, "step": 113965 }, { "epoch": 16.97497765862377, "grad_norm": 1.6014411449432373, "learning_rate": 3.4044888835530835e-06, "loss": 0.4959, "num_input_tokens_seen": 66169968, "step": 113970 }, { "epoch": 16.975722371164732, "grad_norm": 3.8265116214752197, "learning_rate": 3.4028520080926383e-06, "loss": 0.687, "num_input_tokens_seen": 66173136, "step": 113975 }, { "epoch": 16.97646708370569, "grad_norm": 2.199160099029541, "learning_rate": 3.401215497494059e-06, "loss": 0.7853, "num_input_tokens_seen": 66176368, "step": 113980 }, { "epoch": 16.97721179624665, "grad_norm": 1.4544339179992676, "learning_rate": 3.3995793517849846e-06, "loss": 0.5565, "num_input_tokens_seen": 66179344, "step": 113985 }, { "epoch": 16.977956508787607, "grad_norm": 1.87664794921875, "learning_rate": 3.3979435709930703e-06, "loss": 0.5052, "num_input_tokens_seen": 66182512, "step": 113990 }, { "epoch": 16.978701221328567, "grad_norm": 1.222568392753601, "learning_rate": 3.3963081551459442e-06, "loss": 0.8184, "num_input_tokens_seen": 66185296, "step": 113995 }, { "epoch": 16.979445933869528, "grad_norm": 2.4175477027893066, "learning_rate": 3.3946731042712286e-06, "loss": 0.5843, "num_input_tokens_seen": 66188240, "step": 114000 }, { "epoch": 16.980190646410485, "grad_norm": 1.6086238622665405, "learning_rate": 3.3930384183965573e-06, "loss": 0.6455, "num_input_tokens_seen": 66190960, "step": 114005 }, { "epoch": 16.980935358951445, "grad_norm": 1.1318306922912598, "learning_rate": 3.3914040975495387e-06, "loss": 0.672, "num_input_tokens_seen": 66193648, "step": 114010 }, { "epoch": 16.981680071492406, "grad_norm": 1.1230789422988892, "learning_rate": 3.3897701417577893e-06, "loss": 0.6667, "num_input_tokens_seen": 66196656, "step": 114015 }, { "epoch": 16.982424784033363, "grad_norm": 0.7847487926483154, "learning_rate": 3.388136551048904e-06, "loss": 0.5342, "num_input_tokens_seen": 66199568, "step": 114020 }, { "epoch": 16.983169496574323, "grad_norm": 1.1899306774139404, "learning_rate": 3.386503325450491e-06, "loss": 0.4621, "num_input_tokens_seen": 66202384, "step": 114025 }, { "epoch": 16.98391420911528, "grad_norm": 1.4568763971328735, "learning_rate": 3.3848704649901336e-06, "loss": 0.5503, "num_input_tokens_seen": 66205104, "step": 114030 }, { "epoch": 16.98465892165624, "grad_norm": 1.324676513671875, "learning_rate": 3.3832379696954243e-06, "loss": 0.485, "num_input_tokens_seen": 66207824, "step": 114035 }, { "epoch": 16.9854036341972, "grad_norm": 1.0019309520721436, "learning_rate": 3.3816058395939434e-06, "loss": 0.604, "num_input_tokens_seen": 66210512, "step": 114040 }, { "epoch": 16.986148346738158, "grad_norm": 2.462094783782959, "learning_rate": 3.3799740747132547e-06, "loss": 0.6486, "num_input_tokens_seen": 66213552, "step": 114045 }, { "epoch": 16.98689305927912, "grad_norm": 1.6891883611679077, "learning_rate": 3.378342675080934e-06, "loss": 0.5646, "num_input_tokens_seen": 66216304, "step": 114050 }, { "epoch": 16.987637771820076, "grad_norm": 1.316533088684082, "learning_rate": 3.376711640724531e-06, "loss": 0.5146, "num_input_tokens_seen": 66219312, "step": 114055 }, { "epoch": 16.988382484361036, "grad_norm": 2.7339882850646973, "learning_rate": 3.375080971671615e-06, "loss": 0.5432, "num_input_tokens_seen": 66222320, "step": 114060 }, { "epoch": 16.989127196901997, "grad_norm": 1.6556060314178467, "learning_rate": 3.3734506679497207e-06, "loss": 0.8303, "num_input_tokens_seen": 66225488, "step": 114065 }, { "epoch": 16.989871909442954, "grad_norm": 1.6136869192123413, "learning_rate": 3.3718207295864028e-06, "loss": 0.5342, "num_input_tokens_seen": 66228560, "step": 114070 }, { "epoch": 16.990616621983914, "grad_norm": 1.3499784469604492, "learning_rate": 3.3701911566091925e-06, "loss": 0.5704, "num_input_tokens_seen": 66232112, "step": 114075 }, { "epoch": 16.991361334524875, "grad_norm": 1.3333834409713745, "learning_rate": 3.368561949045615e-06, "loss": 0.5091, "num_input_tokens_seen": 66234896, "step": 114080 }, { "epoch": 16.992106047065832, "grad_norm": 0.937263548374176, "learning_rate": 3.3669331069232006e-06, "loss": 0.6217, "num_input_tokens_seen": 66237488, "step": 114085 }, { "epoch": 16.992850759606792, "grad_norm": 1.8993767499923706, "learning_rate": 3.3653046302694614e-06, "loss": 0.585, "num_input_tokens_seen": 66240496, "step": 114090 }, { "epoch": 16.99359547214775, "grad_norm": 1.8068811893463135, "learning_rate": 3.3636765191119165e-06, "loss": 0.6412, "num_input_tokens_seen": 66243728, "step": 114095 }, { "epoch": 16.99434018468871, "grad_norm": 1.45784330368042, "learning_rate": 3.3620487734780603e-06, "loss": 0.6081, "num_input_tokens_seen": 66246896, "step": 114100 }, { "epoch": 16.99508489722967, "grad_norm": 1.9920848608016968, "learning_rate": 3.3604213933954048e-06, "loss": 0.6054, "num_input_tokens_seen": 66250000, "step": 114105 }, { "epoch": 16.995829609770627, "grad_norm": 1.6985875368118286, "learning_rate": 3.358794378891436e-06, "loss": 0.5364, "num_input_tokens_seen": 66252816, "step": 114110 }, { "epoch": 16.996574322311588, "grad_norm": 4.595815181732178, "learning_rate": 3.3571677299936403e-06, "loss": 0.5194, "num_input_tokens_seen": 66255696, "step": 114115 }, { "epoch": 16.99731903485255, "grad_norm": 1.447113275527954, "learning_rate": 3.3555414467295017e-06, "loss": 0.5615, "num_input_tokens_seen": 66258736, "step": 114120 }, { "epoch": 16.998063747393505, "grad_norm": 1.778160810470581, "learning_rate": 3.3539155291264833e-06, "loss": 0.6678, "num_input_tokens_seen": 66261424, "step": 114125 }, { "epoch": 16.998808459934466, "grad_norm": 1.3775007724761963, "learning_rate": 3.352289977212067e-06, "loss": 0.6946, "num_input_tokens_seen": 66264176, "step": 114130 }, { "epoch": 16.999553172475423, "grad_norm": 2.907848358154297, "learning_rate": 3.3506647910137078e-06, "loss": 0.461, "num_input_tokens_seen": 66267120, "step": 114135 }, { "epoch": 17.0, "eval_loss": 0.657900333404541, "eval_runtime": 47.0134, "eval_samples_per_second": 63.471, "eval_steps_per_second": 15.868, "num_input_tokens_seen": 66268336, "step": 114138 }, { "epoch": 17.000297885016384, "grad_norm": 1.2809762954711914, "learning_rate": 3.3490399705588677e-06, "loss": 0.6195, "num_input_tokens_seen": 66269776, "step": 114140 }, { "epoch": 17.001042597557344, "grad_norm": 1.5946964025497437, "learning_rate": 3.3474155158749854e-06, "loss": 0.6539, "num_input_tokens_seen": 66272496, "step": 114145 }, { "epoch": 17.0017873100983, "grad_norm": 1.6904655694961548, "learning_rate": 3.345791426989517e-06, "loss": 0.5576, "num_input_tokens_seen": 66275248, "step": 114150 }, { "epoch": 17.00253202263926, "grad_norm": 2.269235849380493, "learning_rate": 3.3441677039298956e-06, "loss": 0.6213, "num_input_tokens_seen": 66277776, "step": 114155 }, { "epoch": 17.003276735180222, "grad_norm": 1.41157865524292, "learning_rate": 3.3425443467235443e-06, "loss": 0.6302, "num_input_tokens_seen": 66280656, "step": 114160 }, { "epoch": 17.00402144772118, "grad_norm": 0.580687403678894, "learning_rate": 3.3409213553979e-06, "loss": 0.5575, "num_input_tokens_seen": 66283888, "step": 114165 }, { "epoch": 17.00476616026214, "grad_norm": 1.4557321071624756, "learning_rate": 3.3392987299803753e-06, "loss": 0.548, "num_input_tokens_seen": 66286864, "step": 114170 }, { "epoch": 17.005510872803097, "grad_norm": 1.3385109901428223, "learning_rate": 3.33767647049838e-06, "loss": 0.8192, "num_input_tokens_seen": 66289808, "step": 114175 }, { "epoch": 17.006255585344057, "grad_norm": 1.7566256523132324, "learning_rate": 3.3360545769793277e-06, "loss": 0.6507, "num_input_tokens_seen": 66292848, "step": 114180 }, { "epoch": 17.007000297885018, "grad_norm": 1.212519884109497, "learning_rate": 3.3344330494506166e-06, "loss": 0.4254, "num_input_tokens_seen": 66295760, "step": 114185 }, { "epoch": 17.007745010425975, "grad_norm": 1.9352095127105713, "learning_rate": 3.3328118879396324e-06, "loss": 0.4595, "num_input_tokens_seen": 66298768, "step": 114190 }, { "epoch": 17.008489722966935, "grad_norm": 1.370811104774475, "learning_rate": 3.331191092473776e-06, "loss": 0.5615, "num_input_tokens_seen": 66301680, "step": 114195 }, { "epoch": 17.009234435507896, "grad_norm": 1.1555196046829224, "learning_rate": 3.3295706630804222e-06, "loss": 0.6623, "num_input_tokens_seen": 66304688, "step": 114200 }, { "epoch": 17.009979148048853, "grad_norm": 4.245675563812256, "learning_rate": 3.3279505997869442e-06, "loss": 0.6237, "num_input_tokens_seen": 66307536, "step": 114205 }, { "epoch": 17.010723860589813, "grad_norm": 1.803490161895752, "learning_rate": 3.3263309026207166e-06, "loss": 0.5354, "num_input_tokens_seen": 66310352, "step": 114210 }, { "epoch": 17.01146857313077, "grad_norm": 1.422687292098999, "learning_rate": 3.3247115716090987e-06, "loss": 0.5489, "num_input_tokens_seen": 66313360, "step": 114215 }, { "epoch": 17.01221328567173, "grad_norm": 1.4392250776290894, "learning_rate": 3.3230926067794516e-06, "loss": 0.4852, "num_input_tokens_seen": 66316176, "step": 114220 }, { "epoch": 17.01295799821269, "grad_norm": 3.71108078956604, "learning_rate": 3.3214740081591173e-06, "loss": 0.4385, "num_input_tokens_seen": 66318928, "step": 114225 }, { "epoch": 17.01370271075365, "grad_norm": 1.5962413549423218, "learning_rate": 3.3198557757754544e-06, "loss": 0.6453, "num_input_tokens_seen": 66321680, "step": 114230 }, { "epoch": 17.01444742329461, "grad_norm": 1.2651301622390747, "learning_rate": 3.3182379096557916e-06, "loss": 0.6762, "num_input_tokens_seen": 66324336, "step": 114235 }, { "epoch": 17.015192135835566, "grad_norm": 1.0681816339492798, "learning_rate": 3.3166204098274643e-06, "loss": 0.5604, "num_input_tokens_seen": 66327056, "step": 114240 }, { "epoch": 17.015936848376526, "grad_norm": 1.9521270990371704, "learning_rate": 3.3150032763177962e-06, "loss": 0.5321, "num_input_tokens_seen": 66330128, "step": 114245 }, { "epoch": 17.016681560917487, "grad_norm": 1.7692618370056152, "learning_rate": 3.3133865091541037e-06, "loss": 0.6397, "num_input_tokens_seen": 66333136, "step": 114250 }, { "epoch": 17.017426273458444, "grad_norm": 1.5481948852539062, "learning_rate": 3.31177010836371e-06, "loss": 0.7643, "num_input_tokens_seen": 66336016, "step": 114255 }, { "epoch": 17.018170985999404, "grad_norm": 1.1677937507629395, "learning_rate": 3.310154073973909e-06, "loss": 0.6338, "num_input_tokens_seen": 66339120, "step": 114260 }, { "epoch": 17.018915698540365, "grad_norm": 1.1454030275344849, "learning_rate": 3.3085384060120185e-06, "loss": 0.4081, "num_input_tokens_seen": 66341936, "step": 114265 }, { "epoch": 17.019660411081322, "grad_norm": 2.144307851791382, "learning_rate": 3.3069231045053216e-06, "loss": 0.6306, "num_input_tokens_seen": 66344944, "step": 114270 }, { "epoch": 17.020405123622282, "grad_norm": 1.2259128093719482, "learning_rate": 3.3053081694811137e-06, "loss": 0.5049, "num_input_tokens_seen": 66347696, "step": 114275 }, { "epoch": 17.02114983616324, "grad_norm": 1.5014368295669556, "learning_rate": 3.303693600966676e-06, "loss": 0.6595, "num_input_tokens_seen": 66350736, "step": 114280 }, { "epoch": 17.0218945487042, "grad_norm": 1.122100830078125, "learning_rate": 3.3020793989892774e-06, "loss": 0.4706, "num_input_tokens_seen": 66353776, "step": 114285 }, { "epoch": 17.02263926124516, "grad_norm": 0.9268113970756531, "learning_rate": 3.3004655635761994e-06, "loss": 0.7898, "num_input_tokens_seen": 66356592, "step": 114290 }, { "epoch": 17.023383973786117, "grad_norm": 2.5446527004241943, "learning_rate": 3.298852094754698e-06, "loss": 0.5832, "num_input_tokens_seen": 66359344, "step": 114295 }, { "epoch": 17.024128686327078, "grad_norm": 1.8477181196212769, "learning_rate": 3.29723899255204e-06, "loss": 0.6087, "num_input_tokens_seen": 66362096, "step": 114300 }, { "epoch": 17.02487339886804, "grad_norm": 1.3478182554244995, "learning_rate": 3.295626256995471e-06, "loss": 0.5615, "num_input_tokens_seen": 66364944, "step": 114305 }, { "epoch": 17.025618111408996, "grad_norm": 1.0659314393997192, "learning_rate": 3.294013888112235e-06, "loss": 0.4164, "num_input_tokens_seen": 66367696, "step": 114310 }, { "epoch": 17.026362823949956, "grad_norm": 1.2527930736541748, "learning_rate": 3.2924018859295746e-06, "loss": 0.7506, "num_input_tokens_seen": 66370832, "step": 114315 }, { "epoch": 17.027107536490913, "grad_norm": 0.7684668898582458, "learning_rate": 3.290790250474718e-06, "loss": 0.5442, "num_input_tokens_seen": 66373808, "step": 114320 }, { "epoch": 17.027852249031874, "grad_norm": 1.1173322200775146, "learning_rate": 3.2891789817748984e-06, "loss": 0.4828, "num_input_tokens_seen": 66376560, "step": 114325 }, { "epoch": 17.028596961572834, "grad_norm": 2.833477020263672, "learning_rate": 3.287568079857331e-06, "loss": 0.408, "num_input_tokens_seen": 66379312, "step": 114330 }, { "epoch": 17.02934167411379, "grad_norm": 1.1978247165679932, "learning_rate": 3.285957544749238e-06, "loss": 0.5787, "num_input_tokens_seen": 66382224, "step": 114335 }, { "epoch": 17.03008638665475, "grad_norm": 1.392646312713623, "learning_rate": 3.284347376477817e-06, "loss": 0.6, "num_input_tokens_seen": 66385200, "step": 114340 }, { "epoch": 17.030831099195712, "grad_norm": 0.9470461010932922, "learning_rate": 3.2827375750702825e-06, "loss": 0.5524, "num_input_tokens_seen": 66388272, "step": 114345 }, { "epoch": 17.03157581173667, "grad_norm": 2.374990940093994, "learning_rate": 3.2811281405538188e-06, "loss": 0.4778, "num_input_tokens_seen": 66391216, "step": 114350 }, { "epoch": 17.03232052427763, "grad_norm": 2.187018871307373, "learning_rate": 3.2795190729556254e-06, "loss": 0.5094, "num_input_tokens_seen": 66394320, "step": 114355 }, { "epoch": 17.033065236818587, "grad_norm": 1.8416991233825684, "learning_rate": 3.2779103723028807e-06, "loss": 0.4789, "num_input_tokens_seen": 66397264, "step": 114360 }, { "epoch": 17.033809949359547, "grad_norm": 0.9819027781486511, "learning_rate": 3.276302038622761e-06, "loss": 0.5828, "num_input_tokens_seen": 66400208, "step": 114365 }, { "epoch": 17.034554661900508, "grad_norm": 1.753832221031189, "learning_rate": 3.2746940719424414e-06, "loss": 0.6865, "num_input_tokens_seen": 66403056, "step": 114370 }, { "epoch": 17.035299374441465, "grad_norm": 1.5757054090499878, "learning_rate": 3.2730864722890886e-06, "loss": 0.5905, "num_input_tokens_seen": 66405936, "step": 114375 }, { "epoch": 17.036044086982425, "grad_norm": 1.6977741718292236, "learning_rate": 3.2714792396898534e-06, "loss": 0.4882, "num_input_tokens_seen": 66408912, "step": 114380 }, { "epoch": 17.036788799523382, "grad_norm": 1.7258743047714233, "learning_rate": 3.2698723741718894e-06, "loss": 0.614, "num_input_tokens_seen": 66411856, "step": 114385 }, { "epoch": 17.037533512064343, "grad_norm": 0.8944920897483826, "learning_rate": 3.2682658757623526e-06, "loss": 0.3689, "num_input_tokens_seen": 66414576, "step": 114390 }, { "epoch": 17.038278224605303, "grad_norm": 1.1267162561416626, "learning_rate": 3.2666597444883734e-06, "loss": 0.3477, "num_input_tokens_seen": 66417328, "step": 114395 }, { "epoch": 17.03902293714626, "grad_norm": 2.2106997966766357, "learning_rate": 3.265053980377086e-06, "loss": 0.7453, "num_input_tokens_seen": 66420048, "step": 114400 }, { "epoch": 17.03976764968722, "grad_norm": 1.4490506649017334, "learning_rate": 3.2634485834556276e-06, "loss": 0.5812, "num_input_tokens_seen": 66422672, "step": 114405 }, { "epoch": 17.04051236222818, "grad_norm": 1.0996441841125488, "learning_rate": 3.2618435537511066e-06, "loss": 0.6079, "num_input_tokens_seen": 66425520, "step": 114410 }, { "epoch": 17.04125707476914, "grad_norm": 0.8524723649024963, "learning_rate": 3.2602388912906482e-06, "loss": 0.519, "num_input_tokens_seen": 66428400, "step": 114415 }, { "epoch": 17.0420017873101, "grad_norm": 1.5285980701446533, "learning_rate": 3.2586345961013565e-06, "loss": 0.6147, "num_input_tokens_seen": 66431760, "step": 114420 }, { "epoch": 17.042746499851056, "grad_norm": 0.9203211665153503, "learning_rate": 3.2570306682103396e-06, "loss": 0.445, "num_input_tokens_seen": 66434992, "step": 114425 }, { "epoch": 17.043491212392016, "grad_norm": 1.581355094909668, "learning_rate": 3.2554271076446873e-06, "loss": 0.5263, "num_input_tokens_seen": 66437904, "step": 114430 }, { "epoch": 17.044235924932977, "grad_norm": 2.264364242553711, "learning_rate": 3.2538239144314974e-06, "loss": 0.5554, "num_input_tokens_seen": 66440688, "step": 114435 }, { "epoch": 17.044980637473934, "grad_norm": 1.422330379486084, "learning_rate": 3.252221088597854e-06, "loss": 0.4784, "num_input_tokens_seen": 66443696, "step": 114440 }, { "epoch": 17.045725350014894, "grad_norm": 1.1567484140396118, "learning_rate": 3.250618630170829e-06, "loss": 0.6584, "num_input_tokens_seen": 66446256, "step": 114445 }, { "epoch": 17.046470062555855, "grad_norm": 1.6375393867492676, "learning_rate": 3.2490165391774963e-06, "loss": 0.5189, "num_input_tokens_seen": 66449552, "step": 114450 }, { "epoch": 17.047214775096812, "grad_norm": 2.09552264213562, "learning_rate": 3.2474148156449195e-06, "loss": 0.6774, "num_input_tokens_seen": 66452688, "step": 114455 }, { "epoch": 17.047959487637772, "grad_norm": 1.8585033416748047, "learning_rate": 3.2458134596001636e-06, "loss": 0.4193, "num_input_tokens_seen": 66455696, "step": 114460 }, { "epoch": 17.04870420017873, "grad_norm": 2.220053195953369, "learning_rate": 3.2442124710702764e-06, "loss": 0.6772, "num_input_tokens_seen": 66458416, "step": 114465 }, { "epoch": 17.04944891271969, "grad_norm": 2.099754571914673, "learning_rate": 3.242611850082314e-06, "loss": 0.7525, "num_input_tokens_seen": 66461328, "step": 114470 }, { "epoch": 17.05019362526065, "grad_norm": 1.44818115234375, "learning_rate": 3.2410115966633044e-06, "loss": 0.6414, "num_input_tokens_seen": 66464112, "step": 114475 }, { "epoch": 17.050938337801608, "grad_norm": 1.4695624113082886, "learning_rate": 3.239411710840293e-06, "loss": 0.543, "num_input_tokens_seen": 66467088, "step": 114480 }, { "epoch": 17.051683050342568, "grad_norm": 1.5035772323608398, "learning_rate": 3.2378121926403077e-06, "loss": 0.661, "num_input_tokens_seen": 66470160, "step": 114485 }, { "epoch": 17.05242776288353, "grad_norm": 2.7440996170043945, "learning_rate": 3.236213042090358e-06, "loss": 0.7125, "num_input_tokens_seen": 66473104, "step": 114490 }, { "epoch": 17.053172475424486, "grad_norm": 1.386776089668274, "learning_rate": 3.234614259217478e-06, "loss": 0.5853, "num_input_tokens_seen": 66476208, "step": 114495 }, { "epoch": 17.053917187965446, "grad_norm": 0.9100648760795593, "learning_rate": 3.2330158440486672e-06, "loss": 0.4331, "num_input_tokens_seen": 66479344, "step": 114500 }, { "epoch": 17.054661900506403, "grad_norm": 1.6033685207366943, "learning_rate": 3.231417796610925e-06, "loss": 0.4093, "num_input_tokens_seen": 66482256, "step": 114505 }, { "epoch": 17.055406613047364, "grad_norm": 1.8537368774414062, "learning_rate": 3.229820116931259e-06, "loss": 0.6238, "num_input_tokens_seen": 66485616, "step": 114510 }, { "epoch": 17.056151325588324, "grad_norm": 1.9217461347579956, "learning_rate": 3.228222805036657e-06, "loss": 0.6629, "num_input_tokens_seen": 66488720, "step": 114515 }, { "epoch": 17.05689603812928, "grad_norm": 1.413139820098877, "learning_rate": 3.226625860954105e-06, "loss": 0.3437, "num_input_tokens_seen": 66491568, "step": 114520 }, { "epoch": 17.05764075067024, "grad_norm": 2.4454147815704346, "learning_rate": 3.225029284710571e-06, "loss": 0.7545, "num_input_tokens_seen": 66494416, "step": 114525 }, { "epoch": 17.058385463211202, "grad_norm": 1.9439705610275269, "learning_rate": 3.2234330763330432e-06, "loss": 0.5113, "num_input_tokens_seen": 66497520, "step": 114530 }, { "epoch": 17.05913017575216, "grad_norm": 2.6580564975738525, "learning_rate": 3.221837235848474e-06, "loss": 0.6466, "num_input_tokens_seen": 66500592, "step": 114535 }, { "epoch": 17.05987488829312, "grad_norm": 1.0705894231796265, "learning_rate": 3.220241763283838e-06, "loss": 0.7229, "num_input_tokens_seen": 66503760, "step": 114540 }, { "epoch": 17.060619600834077, "grad_norm": 1.1959410905838013, "learning_rate": 3.2186466586660746e-06, "loss": 0.4397, "num_input_tokens_seen": 66506704, "step": 114545 }, { "epoch": 17.061364313375037, "grad_norm": 1.6062198877334595, "learning_rate": 3.2170519220221435e-06, "loss": 0.7035, "num_input_tokens_seen": 66509744, "step": 114550 }, { "epoch": 17.062109025915998, "grad_norm": 1.6315897703170776, "learning_rate": 3.2154575533789753e-06, "loss": 0.6598, "num_input_tokens_seen": 66512944, "step": 114555 }, { "epoch": 17.062853738456955, "grad_norm": 0.9864280223846436, "learning_rate": 3.2138635527635186e-06, "loss": 0.5458, "num_input_tokens_seen": 66516080, "step": 114560 }, { "epoch": 17.063598450997915, "grad_norm": 2.6461265087127686, "learning_rate": 3.2122699202026927e-06, "loss": 0.6874, "num_input_tokens_seen": 66518896, "step": 114565 }, { "epoch": 17.064343163538872, "grad_norm": 1.1451088190078735, "learning_rate": 3.2106766557234243e-06, "loss": 0.631, "num_input_tokens_seen": 66521648, "step": 114570 }, { "epoch": 17.065087876079833, "grad_norm": 1.3056670427322388, "learning_rate": 3.209083759352627e-06, "loss": 0.5836, "num_input_tokens_seen": 66524528, "step": 114575 }, { "epoch": 17.065832588620793, "grad_norm": 1.229055404663086, "learning_rate": 3.2074912311172046e-06, "loss": 0.5918, "num_input_tokens_seen": 66527856, "step": 114580 }, { "epoch": 17.06657730116175, "grad_norm": 1.9411671161651611, "learning_rate": 3.2058990710440773e-06, "loss": 0.4964, "num_input_tokens_seen": 66530512, "step": 114585 }, { "epoch": 17.06732201370271, "grad_norm": 2.748403310775757, "learning_rate": 3.2043072791601293e-06, "loss": 0.6721, "num_input_tokens_seen": 66533360, "step": 114590 }, { "epoch": 17.06806672624367, "grad_norm": 1.279801607131958, "learning_rate": 3.202715855492261e-06, "loss": 0.5506, "num_input_tokens_seen": 66536208, "step": 114595 }, { "epoch": 17.06881143878463, "grad_norm": 1.8251906633377075, "learning_rate": 3.201124800067357e-06, "loss": 0.6308, "num_input_tokens_seen": 66538864, "step": 114600 }, { "epoch": 17.06955615132559, "grad_norm": 1.690599799156189, "learning_rate": 3.1995341129122864e-06, "loss": 0.6323, "num_input_tokens_seen": 66541584, "step": 114605 }, { "epoch": 17.070300863866546, "grad_norm": 1.7527694702148438, "learning_rate": 3.197943794053937e-06, "loss": 0.6461, "num_input_tokens_seen": 66544400, "step": 114610 }, { "epoch": 17.071045576407506, "grad_norm": 2.4665112495422363, "learning_rate": 3.196353843519162e-06, "loss": 0.6684, "num_input_tokens_seen": 66547056, "step": 114615 }, { "epoch": 17.071790288948467, "grad_norm": 1.185667634010315, "learning_rate": 3.1947642613348344e-06, "loss": 0.5977, "num_input_tokens_seen": 66550192, "step": 114620 }, { "epoch": 17.072535001489424, "grad_norm": 1.1476447582244873, "learning_rate": 3.193175047527797e-06, "loss": 0.5681, "num_input_tokens_seen": 66553264, "step": 114625 }, { "epoch": 17.073279714030384, "grad_norm": 1.4862593412399292, "learning_rate": 3.1915862021249105e-06, "loss": 0.4786, "num_input_tokens_seen": 66556336, "step": 114630 }, { "epoch": 17.074024426571345, "grad_norm": 0.9278555512428284, "learning_rate": 3.18999772515301e-06, "loss": 0.5147, "num_input_tokens_seen": 66559440, "step": 114635 }, { "epoch": 17.074769139112302, "grad_norm": 1.5268597602844238, "learning_rate": 3.1884096166389292e-06, "loss": 0.5696, "num_input_tokens_seen": 66562416, "step": 114640 }, { "epoch": 17.075513851653263, "grad_norm": 1.8972477912902832, "learning_rate": 3.1868218766095e-06, "loss": 0.5125, "num_input_tokens_seen": 66565136, "step": 114645 }, { "epoch": 17.07625856419422, "grad_norm": 1.8069026470184326, "learning_rate": 3.1852345050915415e-06, "loss": 0.5604, "num_input_tokens_seen": 66567984, "step": 114650 }, { "epoch": 17.07700327673518, "grad_norm": 3.039785385131836, "learning_rate": 3.1836475021118804e-06, "loss": 0.6809, "num_input_tokens_seen": 66570768, "step": 114655 }, { "epoch": 17.07774798927614, "grad_norm": 0.9632329940795898, "learning_rate": 3.1820608676973144e-06, "loss": 0.5785, "num_input_tokens_seen": 66573648, "step": 114660 }, { "epoch": 17.078492701817098, "grad_norm": 1.7011213302612305, "learning_rate": 3.180474601874661e-06, "loss": 0.6579, "num_input_tokens_seen": 66576752, "step": 114665 }, { "epoch": 17.079237414358058, "grad_norm": 2.564288854598999, "learning_rate": 3.1788887046707072e-06, "loss": 0.7456, "num_input_tokens_seen": 66579856, "step": 114670 }, { "epoch": 17.07998212689902, "grad_norm": 1.8968311548233032, "learning_rate": 3.177303176112256e-06, "loss": 0.6665, "num_input_tokens_seen": 66582608, "step": 114675 }, { "epoch": 17.080726839439976, "grad_norm": 1.6266274452209473, "learning_rate": 3.1757180162260897e-06, "loss": 0.6931, "num_input_tokens_seen": 66585840, "step": 114680 }, { "epoch": 17.081471551980936, "grad_norm": 1.2513506412506104, "learning_rate": 3.174133225038978e-06, "loss": 0.6315, "num_input_tokens_seen": 66588944, "step": 114685 }, { "epoch": 17.082216264521893, "grad_norm": 1.5754008293151855, "learning_rate": 3.17254880257771e-06, "loss": 0.6977, "num_input_tokens_seen": 66591696, "step": 114690 }, { "epoch": 17.082960977062854, "grad_norm": 2.003537654876709, "learning_rate": 3.1709647488690404e-06, "loss": 0.5942, "num_input_tokens_seen": 66595088, "step": 114695 }, { "epoch": 17.083705689603814, "grad_norm": 1.7042828798294067, "learning_rate": 3.1693810639397412e-06, "loss": 0.5628, "num_input_tokens_seen": 66597872, "step": 114700 }, { "epoch": 17.08445040214477, "grad_norm": 1.2891746759414673, "learning_rate": 3.1677977478165588e-06, "loss": 0.504, "num_input_tokens_seen": 66600752, "step": 114705 }, { "epoch": 17.08519511468573, "grad_norm": 3.5646018981933594, "learning_rate": 3.166214800526246e-06, "loss": 0.6585, "num_input_tokens_seen": 66603632, "step": 114710 }, { "epoch": 17.085939827226692, "grad_norm": 1.8006728887557983, "learning_rate": 3.1646322220955372e-06, "loss": 0.6922, "num_input_tokens_seen": 66606608, "step": 114715 }, { "epoch": 17.08668453976765, "grad_norm": 1.3964086771011353, "learning_rate": 3.16305001255118e-06, "loss": 0.7893, "num_input_tokens_seen": 66609488, "step": 114720 }, { "epoch": 17.08742925230861, "grad_norm": 1.5194916725158691, "learning_rate": 3.1614681719199015e-06, "loss": 0.6114, "num_input_tokens_seen": 66612560, "step": 114725 }, { "epoch": 17.088173964849567, "grad_norm": 3.3203351497650146, "learning_rate": 3.1598867002284148e-06, "loss": 0.7164, "num_input_tokens_seen": 66615440, "step": 114730 }, { "epoch": 17.088918677390527, "grad_norm": 1.4304567575454712, "learning_rate": 3.15830559750345e-06, "loss": 0.6854, "num_input_tokens_seen": 66618224, "step": 114735 }, { "epoch": 17.089663389931488, "grad_norm": 2.1950016021728516, "learning_rate": 3.1567248637717066e-06, "loss": 0.5339, "num_input_tokens_seen": 66621008, "step": 114740 }, { "epoch": 17.090408102472445, "grad_norm": 1.2600517272949219, "learning_rate": 3.1551444990599033e-06, "loss": 0.6236, "num_input_tokens_seen": 66623856, "step": 114745 }, { "epoch": 17.091152815013405, "grad_norm": 1.0563960075378418, "learning_rate": 3.1535645033947265e-06, "loss": 0.6802, "num_input_tokens_seen": 66626512, "step": 114750 }, { "epoch": 17.091897527554362, "grad_norm": 1.9879672527313232, "learning_rate": 3.15198487680288e-06, "loss": 0.5154, "num_input_tokens_seen": 66629456, "step": 114755 }, { "epoch": 17.092642240095323, "grad_norm": 1.2584720849990845, "learning_rate": 3.150405619311042e-06, "loss": 0.7851, "num_input_tokens_seen": 66632368, "step": 114760 }, { "epoch": 17.093386952636283, "grad_norm": 1.776206612586975, "learning_rate": 3.148826730945889e-06, "loss": 0.5948, "num_input_tokens_seen": 66635440, "step": 114765 }, { "epoch": 17.09413166517724, "grad_norm": 1.1455039978027344, "learning_rate": 3.147248211734105e-06, "loss": 0.5245, "num_input_tokens_seen": 66638480, "step": 114770 }, { "epoch": 17.0948763777182, "grad_norm": 2.3887290954589844, "learning_rate": 3.145670061702352e-06, "loss": 0.6304, "num_input_tokens_seen": 66641488, "step": 114775 }, { "epoch": 17.09562109025916, "grad_norm": 1.5442774295806885, "learning_rate": 3.144092280877292e-06, "loss": 0.6157, "num_input_tokens_seen": 66644432, "step": 114780 }, { "epoch": 17.09636580280012, "grad_norm": 1.3927667140960693, "learning_rate": 3.1425148692855734e-06, "loss": 0.5945, "num_input_tokens_seen": 66647248, "step": 114785 }, { "epoch": 17.09711051534108, "grad_norm": 2.2762866020202637, "learning_rate": 3.1409378269538574e-06, "loss": 0.5591, "num_input_tokens_seen": 66650064, "step": 114790 }, { "epoch": 17.097855227882036, "grad_norm": 1.092898964881897, "learning_rate": 3.1393611539087765e-06, "loss": 0.601, "num_input_tokens_seen": 66653040, "step": 114795 }, { "epoch": 17.098599940422996, "grad_norm": 1.6371053457260132, "learning_rate": 3.1377848501769724e-06, "loss": 0.5389, "num_input_tokens_seen": 66655760, "step": 114800 }, { "epoch": 17.099344652963957, "grad_norm": 1.1653789281845093, "learning_rate": 3.136208915785077e-06, "loss": 0.5919, "num_input_tokens_seen": 66658448, "step": 114805 }, { "epoch": 17.100089365504914, "grad_norm": 3.29843807220459, "learning_rate": 3.1346333507597027e-06, "loss": 0.8247, "num_input_tokens_seen": 66661456, "step": 114810 }, { "epoch": 17.100834078045875, "grad_norm": 1.0526760816574097, "learning_rate": 3.1330581551274827e-06, "loss": 0.465, "num_input_tokens_seen": 66664272, "step": 114815 }, { "epoch": 17.101578790586835, "grad_norm": 4.156631946563721, "learning_rate": 3.1314833289150138e-06, "loss": 0.6334, "num_input_tokens_seen": 66666992, "step": 114820 }, { "epoch": 17.102323503127792, "grad_norm": 2.9819118976593018, "learning_rate": 3.129908872148912e-06, "loss": 0.6261, "num_input_tokens_seen": 66669968, "step": 114825 }, { "epoch": 17.103068215668753, "grad_norm": 1.166656494140625, "learning_rate": 3.128334784855774e-06, "loss": 0.4707, "num_input_tokens_seen": 66672912, "step": 114830 }, { "epoch": 17.10381292820971, "grad_norm": 2.0341031551361084, "learning_rate": 3.126761067062184e-06, "loss": 0.728, "num_input_tokens_seen": 66675728, "step": 114835 }, { "epoch": 17.10455764075067, "grad_norm": 1.071842908859253, "learning_rate": 3.125187718794742e-06, "loss": 0.5935, "num_input_tokens_seen": 66678832, "step": 114840 }, { "epoch": 17.10530235329163, "grad_norm": 1.6218129396438599, "learning_rate": 3.1236147400800194e-06, "loss": 0.4611, "num_input_tokens_seen": 66681552, "step": 114845 }, { "epoch": 17.106047065832588, "grad_norm": 3.2639260292053223, "learning_rate": 3.1220421309445913e-06, "loss": 0.7085, "num_input_tokens_seen": 66684240, "step": 114850 }, { "epoch": 17.106791778373548, "grad_norm": 1.3997949361801147, "learning_rate": 3.1204698914150205e-06, "loss": 0.5002, "num_input_tokens_seen": 66686992, "step": 114855 }, { "epoch": 17.10753649091451, "grad_norm": 1.432758092880249, "learning_rate": 3.11889802151788e-06, "loss": 0.5703, "num_input_tokens_seen": 66690000, "step": 114860 }, { "epoch": 17.108281203455466, "grad_norm": 1.376970648765564, "learning_rate": 3.117326521279712e-06, "loss": 0.4552, "num_input_tokens_seen": 66692688, "step": 114865 }, { "epoch": 17.109025915996426, "grad_norm": 1.727982997894287, "learning_rate": 3.1157553907270766e-06, "loss": 0.6097, "num_input_tokens_seen": 66695632, "step": 114870 }, { "epoch": 17.109770628537383, "grad_norm": 1.8225626945495605, "learning_rate": 3.1141846298865074e-06, "loss": 0.5871, "num_input_tokens_seen": 66698416, "step": 114875 }, { "epoch": 17.110515341078344, "grad_norm": 1.8159911632537842, "learning_rate": 3.11261423878455e-06, "loss": 0.6519, "num_input_tokens_seen": 66701520, "step": 114880 }, { "epoch": 17.111260053619304, "grad_norm": 1.888450264930725, "learning_rate": 3.111044217447731e-06, "loss": 0.4619, "num_input_tokens_seen": 66704560, "step": 114885 }, { "epoch": 17.11200476616026, "grad_norm": 2.3996946811676025, "learning_rate": 3.1094745659025674e-06, "loss": 0.5458, "num_input_tokens_seen": 66707408, "step": 114890 }, { "epoch": 17.11274947870122, "grad_norm": 1.6150230169296265, "learning_rate": 3.1079052841755857e-06, "loss": 0.4281, "num_input_tokens_seen": 66710320, "step": 114895 }, { "epoch": 17.113494191242182, "grad_norm": 1.1261699199676514, "learning_rate": 3.1063363722932975e-06, "loss": 0.5957, "num_input_tokens_seen": 66713072, "step": 114900 }, { "epoch": 17.11423890378314, "grad_norm": 1.526688575744629, "learning_rate": 3.1047678302822016e-06, "loss": 0.6816, "num_input_tokens_seen": 66716080, "step": 114905 }, { "epoch": 17.1149836163241, "grad_norm": 1.688495397567749, "learning_rate": 3.1031996581687955e-06, "loss": 0.5532, "num_input_tokens_seen": 66718992, "step": 114910 }, { "epoch": 17.115728328865057, "grad_norm": 1.3087666034698486, "learning_rate": 3.101631855979581e-06, "loss": 0.7591, "num_input_tokens_seen": 66721936, "step": 114915 }, { "epoch": 17.116473041406017, "grad_norm": 1.249678611755371, "learning_rate": 3.100064423741042e-06, "loss": 0.5034, "num_input_tokens_seen": 66724496, "step": 114920 }, { "epoch": 17.117217753946978, "grad_norm": 1.3776638507843018, "learning_rate": 3.098497361479649e-06, "loss": 0.6265, "num_input_tokens_seen": 66727504, "step": 114925 }, { "epoch": 17.117962466487935, "grad_norm": 1.9575207233428955, "learning_rate": 3.0969306692218897e-06, "loss": 0.4743, "num_input_tokens_seen": 66730352, "step": 114930 }, { "epoch": 17.118707179028895, "grad_norm": 2.036785125732422, "learning_rate": 3.0953643469942173e-06, "loss": 0.4909, "num_input_tokens_seen": 66733648, "step": 114935 }, { "epoch": 17.119451891569852, "grad_norm": 2.1612067222595215, "learning_rate": 3.093798394823111e-06, "loss": 0.7218, "num_input_tokens_seen": 66736784, "step": 114940 }, { "epoch": 17.120196604110813, "grad_norm": 1.6016179323196411, "learning_rate": 3.0922328127350076e-06, "loss": 0.5584, "num_input_tokens_seen": 66739792, "step": 114945 }, { "epoch": 17.120941316651773, "grad_norm": 1.2320576906204224, "learning_rate": 3.090667600756372e-06, "loss": 0.5369, "num_input_tokens_seen": 66743088, "step": 114950 }, { "epoch": 17.12168602919273, "grad_norm": 1.032435417175293, "learning_rate": 3.089102758913634e-06, "loss": 0.6387, "num_input_tokens_seen": 66745840, "step": 114955 }, { "epoch": 17.12243074173369, "grad_norm": 1.9997704029083252, "learning_rate": 3.087538287233241e-06, "loss": 0.568, "num_input_tokens_seen": 66748624, "step": 114960 }, { "epoch": 17.12317545427465, "grad_norm": 1.6310256719589233, "learning_rate": 3.0859741857416193e-06, "loss": 0.7593, "num_input_tokens_seen": 66751536, "step": 114965 }, { "epoch": 17.12392016681561, "grad_norm": 1.4657618999481201, "learning_rate": 3.0844104544651893e-06, "loss": 0.7119, "num_input_tokens_seen": 66754576, "step": 114970 }, { "epoch": 17.12466487935657, "grad_norm": 1.2618509531021118, "learning_rate": 3.082847093430369e-06, "loss": 0.7786, "num_input_tokens_seen": 66757840, "step": 114975 }, { "epoch": 17.125409591897526, "grad_norm": 1.2246770858764648, "learning_rate": 3.0812841026635705e-06, "loss": 0.7041, "num_input_tokens_seen": 66760720, "step": 114980 }, { "epoch": 17.126154304438487, "grad_norm": 0.90424644947052, "learning_rate": 3.079721482191203e-06, "loss": 0.4339, "num_input_tokens_seen": 66763504, "step": 114985 }, { "epoch": 17.126899016979447, "grad_norm": 1.7758030891418457, "learning_rate": 3.0781592320396568e-06, "loss": 0.5304, "num_input_tokens_seen": 66766256, "step": 114990 }, { "epoch": 17.127643729520404, "grad_norm": 1.3175464868545532, "learning_rate": 3.076597352235333e-06, "loss": 0.6195, "num_input_tokens_seen": 66769136, "step": 114995 }, { "epoch": 17.128388442061365, "grad_norm": 1.690057635307312, "learning_rate": 3.075035842804619e-06, "loss": 0.3607, "num_input_tokens_seen": 66772112, "step": 115000 }, { "epoch": 17.129133154602325, "grad_norm": 1.947002649307251, "learning_rate": 3.073474703773885e-06, "loss": 0.6869, "num_input_tokens_seen": 66775024, "step": 115005 }, { "epoch": 17.129877867143282, "grad_norm": 2.8392856121063232, "learning_rate": 3.0719139351695125e-06, "loss": 0.7597, "num_input_tokens_seen": 66777840, "step": 115010 }, { "epoch": 17.130622579684243, "grad_norm": 1.5176373720169067, "learning_rate": 3.070353537017867e-06, "loss": 0.4905, "num_input_tokens_seen": 66781104, "step": 115015 }, { "epoch": 17.1313672922252, "grad_norm": 1.9335358142852783, "learning_rate": 3.0687935093453106e-06, "loss": 0.4789, "num_input_tokens_seen": 66783760, "step": 115020 }, { "epoch": 17.13211200476616, "grad_norm": 1.563883900642395, "learning_rate": 3.0672338521781975e-06, "loss": 0.4768, "num_input_tokens_seen": 66786544, "step": 115025 }, { "epoch": 17.13285671730712, "grad_norm": 2.8933067321777344, "learning_rate": 3.0656745655428783e-06, "loss": 0.7385, "num_input_tokens_seen": 66789360, "step": 115030 }, { "epoch": 17.133601429848078, "grad_norm": 1.399238109588623, "learning_rate": 3.0641156494656957e-06, "loss": 0.5701, "num_input_tokens_seen": 66792368, "step": 115035 }, { "epoch": 17.134346142389038, "grad_norm": 1.273916482925415, "learning_rate": 3.062557103972985e-06, "loss": 0.6037, "num_input_tokens_seen": 66795216, "step": 115040 }, { "epoch": 17.13509085493, "grad_norm": 1.8153061866760254, "learning_rate": 3.0609989290910775e-06, "loss": 0.7313, "num_input_tokens_seen": 66798160, "step": 115045 }, { "epoch": 17.135835567470956, "grad_norm": 2.047775983810425, "learning_rate": 3.059441124846288e-06, "loss": 0.591, "num_input_tokens_seen": 66800880, "step": 115050 }, { "epoch": 17.136580280011916, "grad_norm": 1.4341480731964111, "learning_rate": 3.0578836912649458e-06, "loss": 0.6046, "num_input_tokens_seen": 66803952, "step": 115055 }, { "epoch": 17.137324992552873, "grad_norm": 2.5887722969055176, "learning_rate": 3.0563266283733517e-06, "loss": 0.5844, "num_input_tokens_seen": 66806672, "step": 115060 }, { "epoch": 17.138069705093834, "grad_norm": 1.6940274238586426, "learning_rate": 3.054769936197824e-06, "loss": 0.5842, "num_input_tokens_seen": 66809648, "step": 115065 }, { "epoch": 17.138814417634794, "grad_norm": 1.606339931488037, "learning_rate": 3.0532136147646496e-06, "loss": 0.6944, "num_input_tokens_seen": 66812656, "step": 115070 }, { "epoch": 17.13955913017575, "grad_norm": 1.4031691551208496, "learning_rate": 3.05165766410013e-06, "loss": 0.4594, "num_input_tokens_seen": 66815280, "step": 115075 }, { "epoch": 17.140303842716712, "grad_norm": 1.9954107999801636, "learning_rate": 3.050102084230541e-06, "loss": 0.489, "num_input_tokens_seen": 66818384, "step": 115080 }, { "epoch": 17.14104855525767, "grad_norm": 1.3284116983413696, "learning_rate": 3.0485468751821735e-06, "loss": 0.8147, "num_input_tokens_seen": 66821584, "step": 115085 }, { "epoch": 17.14179326779863, "grad_norm": 2.2526793479919434, "learning_rate": 3.046992036981294e-06, "loss": 0.6638, "num_input_tokens_seen": 66824528, "step": 115090 }, { "epoch": 17.14253798033959, "grad_norm": 1.7677006721496582, "learning_rate": 3.0454375696541694e-06, "loss": 0.6035, "num_input_tokens_seen": 66827248, "step": 115095 }, { "epoch": 17.143282692880547, "grad_norm": 0.9730921983718872, "learning_rate": 3.0438834732270686e-06, "loss": 0.4979, "num_input_tokens_seen": 66830224, "step": 115100 }, { "epoch": 17.144027405421507, "grad_norm": 1.5764907598495483, "learning_rate": 3.0423297477262415e-06, "loss": 0.6129, "num_input_tokens_seen": 66833232, "step": 115105 }, { "epoch": 17.144772117962468, "grad_norm": 2.713634490966797, "learning_rate": 3.0407763931779354e-06, "loss": 0.7282, "num_input_tokens_seen": 66836432, "step": 115110 }, { "epoch": 17.145516830503425, "grad_norm": 2.260279417037964, "learning_rate": 3.039223409608391e-06, "loss": 0.567, "num_input_tokens_seen": 66839184, "step": 115115 }, { "epoch": 17.146261543044385, "grad_norm": 2.738679885864258, "learning_rate": 3.0376707970438513e-06, "loss": 0.4448, "num_input_tokens_seen": 66841968, "step": 115120 }, { "epoch": 17.147006255585342, "grad_norm": 1.0939834117889404, "learning_rate": 3.036118555510539e-06, "loss": 0.6052, "num_input_tokens_seen": 66845072, "step": 115125 }, { "epoch": 17.147750968126303, "grad_norm": 1.7944471836090088, "learning_rate": 3.0345666850346787e-06, "loss": 0.5873, "num_input_tokens_seen": 66847888, "step": 115130 }, { "epoch": 17.148495680667263, "grad_norm": 1.6810166835784912, "learning_rate": 3.033015185642493e-06, "loss": 0.5861, "num_input_tokens_seen": 66850768, "step": 115135 }, { "epoch": 17.14924039320822, "grad_norm": 1.4924571514129639, "learning_rate": 3.0314640573601864e-06, "loss": 0.5044, "num_input_tokens_seen": 66853968, "step": 115140 }, { "epoch": 17.14998510574918, "grad_norm": 1.3204482793807983, "learning_rate": 3.029913300213971e-06, "loss": 0.5486, "num_input_tokens_seen": 66856816, "step": 115145 }, { "epoch": 17.15072981829014, "grad_norm": 3.0035488605499268, "learning_rate": 3.0283629142300347e-06, "loss": 0.8173, "num_input_tokens_seen": 66859312, "step": 115150 }, { "epoch": 17.1514745308311, "grad_norm": 3.34277081489563, "learning_rate": 3.0268128994345807e-06, "loss": 0.7433, "num_input_tokens_seen": 66862352, "step": 115155 }, { "epoch": 17.15221924337206, "grad_norm": 1.1440380811691284, "learning_rate": 3.0252632558537913e-06, "loss": 0.879, "num_input_tokens_seen": 66865104, "step": 115160 }, { "epoch": 17.152963955913016, "grad_norm": 0.722080409526825, "learning_rate": 3.0237139835138402e-06, "loss": 0.4333, "num_input_tokens_seen": 66868208, "step": 115165 }, { "epoch": 17.153708668453977, "grad_norm": 1.5459920167922974, "learning_rate": 3.0221650824409114e-06, "loss": 0.6185, "num_input_tokens_seen": 66871248, "step": 115170 }, { "epoch": 17.154453380994937, "grad_norm": 2.416766881942749, "learning_rate": 3.0206165526611654e-06, "loss": 0.6161, "num_input_tokens_seen": 66874192, "step": 115175 }, { "epoch": 17.155198093535894, "grad_norm": 2.2379097938537598, "learning_rate": 3.0190683942007637e-06, "loss": 0.7173, "num_input_tokens_seen": 66877200, "step": 115180 }, { "epoch": 17.155942806076855, "grad_norm": 1.7875797748565674, "learning_rate": 3.017520607085858e-06, "loss": 0.4841, "num_input_tokens_seen": 66880176, "step": 115185 }, { "epoch": 17.156687518617815, "grad_norm": 1.701522946357727, "learning_rate": 3.0159731913426027e-06, "loss": 0.631, "num_input_tokens_seen": 66882960, "step": 115190 }, { "epoch": 17.157432231158772, "grad_norm": 1.4599425792694092, "learning_rate": 3.014426146997132e-06, "loss": 0.5852, "num_input_tokens_seen": 66885936, "step": 115195 }, { "epoch": 17.158176943699733, "grad_norm": 1.60698664188385, "learning_rate": 3.0128794740755916e-06, "loss": 0.503, "num_input_tokens_seen": 66888688, "step": 115200 }, { "epoch": 17.15892165624069, "grad_norm": 1.639499306678772, "learning_rate": 3.0113331726041055e-06, "loss": 0.7756, "num_input_tokens_seen": 66891792, "step": 115205 }, { "epoch": 17.15966636878165, "grad_norm": 0.8927173018455505, "learning_rate": 3.0097872426087914e-06, "loss": 0.5574, "num_input_tokens_seen": 66894832, "step": 115210 }, { "epoch": 17.16041108132261, "grad_norm": 0.862782895565033, "learning_rate": 3.0082416841157783e-06, "loss": 0.7061, "num_input_tokens_seen": 66897520, "step": 115215 }, { "epoch": 17.161155793863568, "grad_norm": 1.9002019166946411, "learning_rate": 3.006696497151165e-06, "loss": 0.4396, "num_input_tokens_seen": 66900240, "step": 115220 }, { "epoch": 17.16190050640453, "grad_norm": 1.9612352848052979, "learning_rate": 3.005151681741067e-06, "loss": 0.5829, "num_input_tokens_seen": 66903152, "step": 115225 }, { "epoch": 17.16264521894549, "grad_norm": 1.0751032829284668, "learning_rate": 3.0036072379115737e-06, "loss": 0.5895, "num_input_tokens_seen": 66906224, "step": 115230 }, { "epoch": 17.163389931486446, "grad_norm": 2.8843483924865723, "learning_rate": 3.0020631656887845e-06, "loss": 0.4688, "num_input_tokens_seen": 66908816, "step": 115235 }, { "epoch": 17.164134644027406, "grad_norm": 2.5057778358459473, "learning_rate": 3.000519465098772e-06, "loss": 0.6528, "num_input_tokens_seen": 66912048, "step": 115240 }, { "epoch": 17.164879356568363, "grad_norm": 3.5907671451568604, "learning_rate": 2.9989761361676306e-06, "loss": 0.7345, "num_input_tokens_seen": 66914864, "step": 115245 }, { "epoch": 17.165624069109324, "grad_norm": 0.7806528210639954, "learning_rate": 2.997433178921427e-06, "loss": 0.6705, "num_input_tokens_seen": 66917520, "step": 115250 }, { "epoch": 17.166368781650284, "grad_norm": 2.897624969482422, "learning_rate": 2.995890593386222e-06, "loss": 0.5076, "num_input_tokens_seen": 66920272, "step": 115255 }, { "epoch": 17.16711349419124, "grad_norm": 1.145643949508667, "learning_rate": 2.9943483795880854e-06, "loss": 0.5996, "num_input_tokens_seen": 66923056, "step": 115260 }, { "epoch": 17.167858206732202, "grad_norm": 2.5231032371520996, "learning_rate": 2.992806537553064e-06, "loss": 0.5548, "num_input_tokens_seen": 66926064, "step": 115265 }, { "epoch": 17.16860291927316, "grad_norm": 1.6814451217651367, "learning_rate": 2.9912650673072113e-06, "loss": 0.6027, "num_input_tokens_seen": 66930096, "step": 115270 }, { "epoch": 17.16934763181412, "grad_norm": 1.5671193599700928, "learning_rate": 2.989723968876565e-06, "loss": 0.5368, "num_input_tokens_seen": 66932752, "step": 115275 }, { "epoch": 17.17009234435508, "grad_norm": 1.3572566509246826, "learning_rate": 2.9881832422871654e-06, "loss": 0.6822, "num_input_tokens_seen": 66936144, "step": 115280 }, { "epoch": 17.170837056896037, "grad_norm": 1.4219045639038086, "learning_rate": 2.986642887565036e-06, "loss": 0.5881, "num_input_tokens_seen": 66939120, "step": 115285 }, { "epoch": 17.171581769436997, "grad_norm": 1.2093172073364258, "learning_rate": 2.9851029047362008e-06, "loss": 0.5441, "num_input_tokens_seen": 66942032, "step": 115290 }, { "epoch": 17.172326481977958, "grad_norm": 2.3430778980255127, "learning_rate": 2.98356329382668e-06, "loss": 0.5988, "num_input_tokens_seen": 66944880, "step": 115295 }, { "epoch": 17.173071194518915, "grad_norm": 1.3044639825820923, "learning_rate": 2.9820240548624814e-06, "loss": 0.5783, "num_input_tokens_seen": 66947760, "step": 115300 }, { "epoch": 17.173815907059875, "grad_norm": 2.1118643283843994, "learning_rate": 2.9804851878696054e-06, "loss": 0.6534, "num_input_tokens_seen": 66950608, "step": 115305 }, { "epoch": 17.174560619600832, "grad_norm": 2.8766982555389404, "learning_rate": 2.9789466928740515e-06, "loss": 0.7433, "num_input_tokens_seen": 66953424, "step": 115310 }, { "epoch": 17.175305332141793, "grad_norm": 1.267789602279663, "learning_rate": 2.9774085699018158e-06, "loss": 0.5953, "num_input_tokens_seen": 66956208, "step": 115315 }, { "epoch": 17.176050044682754, "grad_norm": 1.7223007678985596, "learning_rate": 2.9758708189788736e-06, "loss": 0.5156, "num_input_tokens_seen": 66958928, "step": 115320 }, { "epoch": 17.17679475722371, "grad_norm": 1.1439749002456665, "learning_rate": 2.9743334401312133e-06, "loss": 0.4869, "num_input_tokens_seen": 66961840, "step": 115325 }, { "epoch": 17.17753946976467, "grad_norm": 1.3395923376083374, "learning_rate": 2.9727964333848056e-06, "loss": 0.5316, "num_input_tokens_seen": 66964816, "step": 115330 }, { "epoch": 17.17828418230563, "grad_norm": 1.1197707653045654, "learning_rate": 2.9712597987656105e-06, "loss": 0.7927, "num_input_tokens_seen": 66967760, "step": 115335 }, { "epoch": 17.17902889484659, "grad_norm": 0.1532144844532013, "learning_rate": 2.9697235362995955e-06, "loss": 0.4776, "num_input_tokens_seen": 66970544, "step": 115340 }, { "epoch": 17.17977360738755, "grad_norm": 1.3083664178848267, "learning_rate": 2.9681876460127073e-06, "loss": 0.5814, "num_input_tokens_seen": 66973488, "step": 115345 }, { "epoch": 17.180518319928506, "grad_norm": 1.6457650661468506, "learning_rate": 2.9666521279309023e-06, "loss": 0.6218, "num_input_tokens_seen": 66976240, "step": 115350 }, { "epoch": 17.181263032469467, "grad_norm": 1.9376916885375977, "learning_rate": 2.965116982080107e-06, "loss": 0.6187, "num_input_tokens_seen": 66979088, "step": 115355 }, { "epoch": 17.182007745010427, "grad_norm": 2.8482325077056885, "learning_rate": 2.9635822084862737e-06, "loss": 0.8145, "num_input_tokens_seen": 66981488, "step": 115360 }, { "epoch": 17.182752457551384, "grad_norm": 3.742997407913208, "learning_rate": 2.9620478071753223e-06, "loss": 0.5841, "num_input_tokens_seen": 66984144, "step": 115365 }, { "epoch": 17.183497170092345, "grad_norm": 1.8017213344573975, "learning_rate": 2.9605137781731713e-06, "loss": 0.5743, "num_input_tokens_seen": 66986960, "step": 115370 }, { "epoch": 17.184241882633305, "grad_norm": 1.5220301151275635, "learning_rate": 2.9589801215057445e-06, "loss": 0.5855, "num_input_tokens_seen": 66989808, "step": 115375 }, { "epoch": 17.184986595174262, "grad_norm": 1.2937132120132446, "learning_rate": 2.9574468371989378e-06, "loss": 0.6888, "num_input_tokens_seen": 66992560, "step": 115380 }, { "epoch": 17.185731307715223, "grad_norm": 2.30222749710083, "learning_rate": 2.955913925278672e-06, "loss": 0.3881, "num_input_tokens_seen": 66995728, "step": 115385 }, { "epoch": 17.18647602025618, "grad_norm": 1.945478081703186, "learning_rate": 2.95438138577083e-06, "loss": 0.6371, "num_input_tokens_seen": 66998640, "step": 115390 }, { "epoch": 17.18722073279714, "grad_norm": 1.0981886386871338, "learning_rate": 2.952849218701312e-06, "loss": 0.6326, "num_input_tokens_seen": 67001264, "step": 115395 }, { "epoch": 17.1879654453381, "grad_norm": 2.4063220024108887, "learning_rate": 2.951317424095995e-06, "loss": 0.5538, "num_input_tokens_seen": 67003920, "step": 115400 }, { "epoch": 17.188710157879058, "grad_norm": 1.6380783319473267, "learning_rate": 2.9497860019807643e-06, "loss": 0.4682, "num_input_tokens_seen": 67006704, "step": 115405 }, { "epoch": 17.18945487042002, "grad_norm": 2.621884822845459, "learning_rate": 2.948254952381491e-06, "loss": 0.6734, "num_input_tokens_seen": 67009424, "step": 115410 }, { "epoch": 17.19019958296098, "grad_norm": 1.1527974605560303, "learning_rate": 2.946724275324031e-06, "loss": 0.6033, "num_input_tokens_seen": 67012208, "step": 115415 }, { "epoch": 17.190944295501936, "grad_norm": 1.6600216627120972, "learning_rate": 2.9451939708342564e-06, "loss": 0.6262, "num_input_tokens_seen": 67015088, "step": 115420 }, { "epoch": 17.191689008042896, "grad_norm": 1.8453556299209595, "learning_rate": 2.9436640389380073e-06, "loss": 0.5661, "num_input_tokens_seen": 67018064, "step": 115425 }, { "epoch": 17.192433720583853, "grad_norm": 1.8180360794067383, "learning_rate": 2.9421344796611435e-06, "loss": 0.551, "num_input_tokens_seen": 67021008, "step": 115430 }, { "epoch": 17.193178433124814, "grad_norm": 1.9743435382843018, "learning_rate": 2.9406052930295e-06, "loss": 0.5418, "num_input_tokens_seen": 67024080, "step": 115435 }, { "epoch": 17.193923145665774, "grad_norm": 1.227266550064087, "learning_rate": 2.9390764790689085e-06, "loss": 0.7052, "num_input_tokens_seen": 67027056, "step": 115440 }, { "epoch": 17.19466785820673, "grad_norm": 1.2949209213256836, "learning_rate": 2.9375480378051987e-06, "loss": 0.556, "num_input_tokens_seen": 67029936, "step": 115445 }, { "epoch": 17.195412570747692, "grad_norm": 2.054671287536621, "learning_rate": 2.9360199692641864e-06, "loss": 0.5707, "num_input_tokens_seen": 67032784, "step": 115450 }, { "epoch": 17.19615728328865, "grad_norm": 3.9316117763519287, "learning_rate": 2.9344922734716977e-06, "loss": 0.6652, "num_input_tokens_seen": 67035408, "step": 115455 }, { "epoch": 17.19690199582961, "grad_norm": 1.0208396911621094, "learning_rate": 2.932964950453529e-06, "loss": 0.6388, "num_input_tokens_seen": 67038352, "step": 115460 }, { "epoch": 17.19764670837057, "grad_norm": 2.24800705909729, "learning_rate": 2.9314380002354953e-06, "loss": 0.6654, "num_input_tokens_seen": 67041264, "step": 115465 }, { "epoch": 17.198391420911527, "grad_norm": 1.1774739027023315, "learning_rate": 2.9299114228433816e-06, "loss": 0.6413, "num_input_tokens_seen": 67044176, "step": 115470 }, { "epoch": 17.199136133452487, "grad_norm": 2.874220848083496, "learning_rate": 2.9283852183029898e-06, "loss": 0.6831, "num_input_tokens_seen": 67046768, "step": 115475 }, { "epoch": 17.199880845993448, "grad_norm": 1.1565879583358765, "learning_rate": 2.9268593866400907e-06, "loss": 0.4852, "num_input_tokens_seen": 67049648, "step": 115480 }, { "epoch": 17.200625558534405, "grad_norm": 1.3708134889602661, "learning_rate": 2.9253339278804748e-06, "loss": 0.6997, "num_input_tokens_seen": 67052720, "step": 115485 }, { "epoch": 17.201370271075366, "grad_norm": 0.8784262537956238, "learning_rate": 2.923808842049905e-06, "loss": 0.4652, "num_input_tokens_seen": 67055344, "step": 115490 }, { "epoch": 17.202114983616323, "grad_norm": 1.6445719003677368, "learning_rate": 2.922284129174141e-06, "loss": 0.4393, "num_input_tokens_seen": 67058160, "step": 115495 }, { "epoch": 17.202859696157283, "grad_norm": 2.587442398071289, "learning_rate": 2.920759789278957e-06, "loss": 0.4545, "num_input_tokens_seen": 67061072, "step": 115500 }, { "epoch": 17.203604408698244, "grad_norm": 1.0702919960021973, "learning_rate": 2.919235822390093e-06, "loss": 0.438, "num_input_tokens_seen": 67063952, "step": 115505 }, { "epoch": 17.2043491212392, "grad_norm": 1.481674313545227, "learning_rate": 2.9177122285332982e-06, "loss": 0.5728, "num_input_tokens_seen": 67066960, "step": 115510 }, { "epoch": 17.20509383378016, "grad_norm": 1.2124639749526978, "learning_rate": 2.9161890077343074e-06, "loss": 0.7149, "num_input_tokens_seen": 67069840, "step": 115515 }, { "epoch": 17.20583854632112, "grad_norm": 0.9347342252731323, "learning_rate": 2.914666160018864e-06, "loss": 0.5139, "num_input_tokens_seen": 67072528, "step": 115520 }, { "epoch": 17.20658325886208, "grad_norm": 1.056052803993225, "learning_rate": 2.9131436854126894e-06, "loss": 0.4652, "num_input_tokens_seen": 67075696, "step": 115525 }, { "epoch": 17.20732797140304, "grad_norm": 2.133986711502075, "learning_rate": 2.9116215839414986e-06, "loss": 0.5996, "num_input_tokens_seen": 67078352, "step": 115530 }, { "epoch": 17.208072683943996, "grad_norm": 1.2877614498138428, "learning_rate": 2.9100998556310153e-06, "loss": 0.5831, "num_input_tokens_seen": 67081424, "step": 115535 }, { "epoch": 17.208817396484957, "grad_norm": 1.247930884361267, "learning_rate": 2.9085785005069394e-06, "loss": 0.5624, "num_input_tokens_seen": 67084304, "step": 115540 }, { "epoch": 17.209562109025917, "grad_norm": 1.708410620689392, "learning_rate": 2.907057518594983e-06, "loss": 0.5571, "num_input_tokens_seen": 67087344, "step": 115545 }, { "epoch": 17.210306821566874, "grad_norm": 1.8062357902526855, "learning_rate": 2.9055369099208306e-06, "loss": 0.7164, "num_input_tokens_seen": 67090448, "step": 115550 }, { "epoch": 17.211051534107835, "grad_norm": 2.0532102584838867, "learning_rate": 2.904016674510179e-06, "loss": 0.5664, "num_input_tokens_seen": 67093392, "step": 115555 }, { "epoch": 17.211796246648795, "grad_norm": 3.388176441192627, "learning_rate": 2.9024968123887107e-06, "loss": 0.6767, "num_input_tokens_seen": 67096560, "step": 115560 }, { "epoch": 17.212540959189752, "grad_norm": 1.3705111742019653, "learning_rate": 2.900977323582099e-06, "loss": 0.6414, "num_input_tokens_seen": 67099376, "step": 115565 }, { "epoch": 17.213285671730713, "grad_norm": 1.6249034404754639, "learning_rate": 2.8994582081160155e-06, "loss": 0.6248, "num_input_tokens_seen": 67102096, "step": 115570 }, { "epoch": 17.21403038427167, "grad_norm": 1.4376883506774902, "learning_rate": 2.897939466016117e-06, "loss": 0.6038, "num_input_tokens_seen": 67104848, "step": 115575 }, { "epoch": 17.21477509681263, "grad_norm": 2.640850305557251, "learning_rate": 2.8964210973080745e-06, "loss": 0.5275, "num_input_tokens_seen": 67107824, "step": 115580 }, { "epoch": 17.21551980935359, "grad_norm": 1.6206414699554443, "learning_rate": 2.8949031020175264e-06, "loss": 0.5242, "num_input_tokens_seen": 67110640, "step": 115585 }, { "epoch": 17.216264521894548, "grad_norm": 1.4563803672790527, "learning_rate": 2.89338548017013e-06, "loss": 0.7025, "num_input_tokens_seen": 67113520, "step": 115590 }, { "epoch": 17.21700923443551, "grad_norm": 2.0010244846343994, "learning_rate": 2.8918682317915115e-06, "loss": 0.6315, "num_input_tokens_seen": 67116400, "step": 115595 }, { "epoch": 17.217753946976465, "grad_norm": 1.1806920766830444, "learning_rate": 2.890351356907314e-06, "loss": 0.4061, "num_input_tokens_seen": 67119440, "step": 115600 }, { "epoch": 17.218498659517426, "grad_norm": 0.9992042183876038, "learning_rate": 2.8888348555431625e-06, "loss": 0.6355, "num_input_tokens_seen": 67122512, "step": 115605 }, { "epoch": 17.219243372058386, "grad_norm": 1.7171213626861572, "learning_rate": 2.887318727724664e-06, "loss": 0.6687, "num_input_tokens_seen": 67125264, "step": 115610 }, { "epoch": 17.219988084599343, "grad_norm": 1.350462555885315, "learning_rate": 2.88580297347745e-06, "loss": 0.5237, "num_input_tokens_seen": 67128144, "step": 115615 }, { "epoch": 17.220732797140304, "grad_norm": 1.3352515697479248, "learning_rate": 2.884287592827112e-06, "loss": 0.6207, "num_input_tokens_seen": 67131248, "step": 115620 }, { "epoch": 17.221477509681264, "grad_norm": 1.354494333267212, "learning_rate": 2.882772585799262e-06, "loss": 0.5569, "num_input_tokens_seen": 67134256, "step": 115625 }, { "epoch": 17.22222222222222, "grad_norm": 1.3870844841003418, "learning_rate": 2.8812579524194916e-06, "loss": 0.7357, "num_input_tokens_seen": 67137264, "step": 115630 }, { "epoch": 17.222966934763182, "grad_norm": 1.1792534589767456, "learning_rate": 2.879743692713388e-06, "loss": 0.548, "num_input_tokens_seen": 67139984, "step": 115635 }, { "epoch": 17.22371164730414, "grad_norm": 1.7964959144592285, "learning_rate": 2.8782298067065256e-06, "loss": 0.7455, "num_input_tokens_seen": 67142736, "step": 115640 }, { "epoch": 17.2244563598451, "grad_norm": 1.113037347793579, "learning_rate": 2.8767162944244918e-06, "loss": 0.6151, "num_input_tokens_seen": 67145648, "step": 115645 }, { "epoch": 17.22520107238606, "grad_norm": 1.5302445888519287, "learning_rate": 2.875203155892853e-06, "loss": 0.6415, "num_input_tokens_seen": 67148496, "step": 115650 }, { "epoch": 17.225945784927017, "grad_norm": 1.075028419494629, "learning_rate": 2.8736903911371652e-06, "loss": 0.7039, "num_input_tokens_seen": 67151568, "step": 115655 }, { "epoch": 17.226690497467978, "grad_norm": 2.130093812942505, "learning_rate": 2.8721780001829956e-06, "loss": 0.5264, "num_input_tokens_seen": 67154352, "step": 115660 }, { "epoch": 17.227435210008938, "grad_norm": 2.352160692214966, "learning_rate": 2.870665983055881e-06, "loss": 0.6505, "num_input_tokens_seen": 67157264, "step": 115665 }, { "epoch": 17.228179922549895, "grad_norm": 1.9320837259292603, "learning_rate": 2.8691543397813824e-06, "loss": 0.3224, "num_input_tokens_seen": 67159824, "step": 115670 }, { "epoch": 17.228924635090856, "grad_norm": 2.371800661087036, "learning_rate": 2.8676430703850206e-06, "loss": 0.6308, "num_input_tokens_seen": 67162640, "step": 115675 }, { "epoch": 17.229669347631813, "grad_norm": 0.9048948287963867, "learning_rate": 2.8661321748923416e-06, "loss": 0.6848, "num_input_tokens_seen": 67165456, "step": 115680 }, { "epoch": 17.230414060172773, "grad_norm": 1.5815978050231934, "learning_rate": 2.8646216533288556e-06, "loss": 0.4847, "num_input_tokens_seen": 67168176, "step": 115685 }, { "epoch": 17.231158772713734, "grad_norm": 2.4088141918182373, "learning_rate": 2.863111505720098e-06, "loss": 0.6763, "num_input_tokens_seen": 67171184, "step": 115690 }, { "epoch": 17.23190348525469, "grad_norm": 1.6772295236587524, "learning_rate": 2.8616017320915704e-06, "loss": 0.4329, "num_input_tokens_seen": 67173808, "step": 115695 }, { "epoch": 17.23264819779565, "grad_norm": 0.9991193413734436, "learning_rate": 2.8600923324687807e-06, "loss": 0.5056, "num_input_tokens_seen": 67176656, "step": 115700 }, { "epoch": 17.23339291033661, "grad_norm": 1.6457159519195557, "learning_rate": 2.85858330687723e-06, "loss": 0.7398, "num_input_tokens_seen": 67179664, "step": 115705 }, { "epoch": 17.23413762287757, "grad_norm": 0.7433386445045471, "learning_rate": 2.8570746553424065e-06, "loss": 0.5505, "num_input_tokens_seen": 67182384, "step": 115710 }, { "epoch": 17.23488233541853, "grad_norm": 1.4239479303359985, "learning_rate": 2.8555663778898066e-06, "loss": 0.4814, "num_input_tokens_seen": 67185488, "step": 115715 }, { "epoch": 17.235627047959486, "grad_norm": 2.328331470489502, "learning_rate": 2.854058474544899e-06, "loss": 0.7775, "num_input_tokens_seen": 67188336, "step": 115720 }, { "epoch": 17.236371760500447, "grad_norm": 1.2362419366836548, "learning_rate": 2.852550945333174e-06, "loss": 0.5428, "num_input_tokens_seen": 67191376, "step": 115725 }, { "epoch": 17.237116473041407, "grad_norm": 2.4589345455169678, "learning_rate": 2.851043790280089e-06, "loss": 0.5736, "num_input_tokens_seen": 67194288, "step": 115730 }, { "epoch": 17.237861185582364, "grad_norm": 1.577614188194275, "learning_rate": 2.849537009411102e-06, "loss": 0.6271, "num_input_tokens_seen": 67197296, "step": 115735 }, { "epoch": 17.238605898123325, "grad_norm": 3.0889031887054443, "learning_rate": 2.8480306027516807e-06, "loss": 0.5871, "num_input_tokens_seen": 67200656, "step": 115740 }, { "epoch": 17.239350610664285, "grad_norm": 0.8470688462257385, "learning_rate": 2.8465245703272607e-06, "loss": 0.5413, "num_input_tokens_seen": 67203568, "step": 115745 }, { "epoch": 17.240095323205242, "grad_norm": 1.3215335607528687, "learning_rate": 2.8450189121632998e-06, "loss": 0.6205, "num_input_tokens_seen": 67206608, "step": 115750 }, { "epoch": 17.240840035746203, "grad_norm": 0.8535419702529907, "learning_rate": 2.8435136282852217e-06, "loss": 0.4378, "num_input_tokens_seen": 67209520, "step": 115755 }, { "epoch": 17.24158474828716, "grad_norm": 2.152285575866699, "learning_rate": 2.842008718718467e-06, "loss": 0.5528, "num_input_tokens_seen": 67212240, "step": 115760 }, { "epoch": 17.24232946082812, "grad_norm": 1.8921451568603516, "learning_rate": 2.840504183488457e-06, "loss": 0.5802, "num_input_tokens_seen": 67215440, "step": 115765 }, { "epoch": 17.24307417336908, "grad_norm": 0.9382721781730652, "learning_rate": 2.8390000226206025e-06, "loss": 0.4033, "num_input_tokens_seen": 67218512, "step": 115770 }, { "epoch": 17.243818885910038, "grad_norm": 1.3582642078399658, "learning_rate": 2.837496236140322e-06, "loss": 0.5656, "num_input_tokens_seen": 67221392, "step": 115775 }, { "epoch": 17.244563598451, "grad_norm": 1.520674467086792, "learning_rate": 2.835992824073011e-06, "loss": 0.6311, "num_input_tokens_seen": 67224336, "step": 115780 }, { "epoch": 17.245308310991955, "grad_norm": 1.0451042652130127, "learning_rate": 2.8344897864440805e-06, "loss": 0.5255, "num_input_tokens_seen": 67227280, "step": 115785 }, { "epoch": 17.246053023532916, "grad_norm": 1.8484679460525513, "learning_rate": 2.832987123278913e-06, "loss": 0.4628, "num_input_tokens_seen": 67230128, "step": 115790 }, { "epoch": 17.246797736073876, "grad_norm": 1.285872459411621, "learning_rate": 2.8314848346029017e-06, "loss": 0.3596, "num_input_tokens_seen": 67233072, "step": 115795 }, { "epoch": 17.247542448614833, "grad_norm": 1.7012215852737427, "learning_rate": 2.829982920441421e-06, "loss": 0.4958, "num_input_tokens_seen": 67235952, "step": 115800 }, { "epoch": 17.248287161155794, "grad_norm": 1.0533406734466553, "learning_rate": 2.8284813808198473e-06, "loss": 0.6378, "num_input_tokens_seen": 67238896, "step": 115805 }, { "epoch": 17.249031873696755, "grad_norm": 1.6887153387069702, "learning_rate": 2.82698021576355e-06, "loss": 0.4713, "num_input_tokens_seen": 67241392, "step": 115810 }, { "epoch": 17.24977658623771, "grad_norm": 3.4398696422576904, "learning_rate": 2.825479425297878e-06, "loss": 0.6307, "num_input_tokens_seen": 67244144, "step": 115815 }, { "epoch": 17.250521298778672, "grad_norm": 1.4386448860168457, "learning_rate": 2.823979009448202e-06, "loss": 0.5964, "num_input_tokens_seen": 67247216, "step": 115820 }, { "epoch": 17.25126601131963, "grad_norm": 1.475583553314209, "learning_rate": 2.8224789682398556e-06, "loss": 0.5888, "num_input_tokens_seen": 67250256, "step": 115825 }, { "epoch": 17.25201072386059, "grad_norm": 1.2137799263000488, "learning_rate": 2.8209793016981927e-06, "loss": 0.5336, "num_input_tokens_seen": 67253040, "step": 115830 }, { "epoch": 17.25275543640155, "grad_norm": 2.021514654159546, "learning_rate": 2.8194800098485407e-06, "loss": 0.5012, "num_input_tokens_seen": 67255952, "step": 115835 }, { "epoch": 17.253500148942507, "grad_norm": 1.640937328338623, "learning_rate": 2.817981092716232e-06, "loss": 0.5199, "num_input_tokens_seen": 67258640, "step": 115840 }, { "epoch": 17.254244861483468, "grad_norm": 1.2511776685714722, "learning_rate": 2.8164825503265825e-06, "loss": 0.579, "num_input_tokens_seen": 67261584, "step": 115845 }, { "epoch": 17.254989574024428, "grad_norm": 1.5308159589767456, "learning_rate": 2.8149843827049186e-06, "loss": 0.575, "num_input_tokens_seen": 67264528, "step": 115850 }, { "epoch": 17.255734286565385, "grad_norm": 1.9165456295013428, "learning_rate": 2.813486589876549e-06, "loss": 0.6226, "num_input_tokens_seen": 67267536, "step": 115855 }, { "epoch": 17.256478999106346, "grad_norm": 2.3659608364105225, "learning_rate": 2.8119891718667664e-06, "loss": 0.7032, "num_input_tokens_seen": 67270320, "step": 115860 }, { "epoch": 17.257223711647303, "grad_norm": 1.3073222637176514, "learning_rate": 2.8104921287008785e-06, "loss": 0.6443, "num_input_tokens_seen": 67273104, "step": 115865 }, { "epoch": 17.257968424188263, "grad_norm": 1.3921469449996948, "learning_rate": 2.8089954604041734e-06, "loss": 0.4351, "num_input_tokens_seen": 67276080, "step": 115870 }, { "epoch": 17.258713136729224, "grad_norm": 1.9348491430282593, "learning_rate": 2.807499167001937e-06, "loss": 0.8735, "num_input_tokens_seen": 67279152, "step": 115875 }, { "epoch": 17.25945784927018, "grad_norm": 3.414216995239258, "learning_rate": 2.8060032485194453e-06, "loss": 0.6709, "num_input_tokens_seen": 67282192, "step": 115880 }, { "epoch": 17.26020256181114, "grad_norm": 1.0623425245285034, "learning_rate": 2.8045077049819733e-06, "loss": 0.4415, "num_input_tokens_seen": 67284880, "step": 115885 }, { "epoch": 17.2609472743521, "grad_norm": 2.378139019012451, "learning_rate": 2.8030125364147868e-06, "loss": 0.7732, "num_input_tokens_seen": 67287728, "step": 115890 }, { "epoch": 17.26169198689306, "grad_norm": 0.7446974515914917, "learning_rate": 2.8015177428431433e-06, "loss": 0.4953, "num_input_tokens_seen": 67290576, "step": 115895 }, { "epoch": 17.26243669943402, "grad_norm": 2.8199121952056885, "learning_rate": 2.8000233242922973e-06, "loss": 0.63, "num_input_tokens_seen": 67293808, "step": 115900 }, { "epoch": 17.263181411974976, "grad_norm": 2.1778998374938965, "learning_rate": 2.7985292807874873e-06, "loss": 0.4434, "num_input_tokens_seen": 67296432, "step": 115905 }, { "epoch": 17.263926124515937, "grad_norm": 1.517390489578247, "learning_rate": 2.797035612353968e-06, "loss": 0.6503, "num_input_tokens_seen": 67299344, "step": 115910 }, { "epoch": 17.264670837056897, "grad_norm": 1.2513262033462524, "learning_rate": 2.7955423190169585e-06, "loss": 0.5269, "num_input_tokens_seen": 67302384, "step": 115915 }, { "epoch": 17.265415549597854, "grad_norm": 1.4332008361816406, "learning_rate": 2.794049400801699e-06, "loss": 0.4966, "num_input_tokens_seen": 67305552, "step": 115920 }, { "epoch": 17.266160262138815, "grad_norm": 3.4904770851135254, "learning_rate": 2.792556857733403e-06, "loss": 0.6195, "num_input_tokens_seen": 67308336, "step": 115925 }, { "epoch": 17.266904974679775, "grad_norm": 1.9812783002853394, "learning_rate": 2.7910646898372916e-06, "loss": 0.6168, "num_input_tokens_seen": 67311440, "step": 115930 }, { "epoch": 17.267649687220732, "grad_norm": 1.5440821647644043, "learning_rate": 2.7895728971385706e-06, "loss": 0.4234, "num_input_tokens_seen": 67314352, "step": 115935 }, { "epoch": 17.268394399761693, "grad_norm": 2.1084961891174316, "learning_rate": 2.7880814796624355e-06, "loss": 0.5995, "num_input_tokens_seen": 67317168, "step": 115940 }, { "epoch": 17.26913911230265, "grad_norm": 3.068626642227173, "learning_rate": 2.7865904374340947e-06, "loss": 0.3653, "num_input_tokens_seen": 67319920, "step": 115945 }, { "epoch": 17.26988382484361, "grad_norm": 2.0862061977386475, "learning_rate": 2.7850997704787244e-06, "loss": 0.782, "num_input_tokens_seen": 67322960, "step": 115950 }, { "epoch": 17.27062853738457, "grad_norm": 2.9406118392944336, "learning_rate": 2.783609478821525e-06, "loss": 0.5255, "num_input_tokens_seen": 67325776, "step": 115955 }, { "epoch": 17.271373249925528, "grad_norm": 2.530400037765503, "learning_rate": 2.782119562487662e-06, "loss": 0.7713, "num_input_tokens_seen": 67329008, "step": 115960 }, { "epoch": 17.27211796246649, "grad_norm": 1.9659295082092285, "learning_rate": 2.7806300215023063e-06, "loss": 0.6797, "num_input_tokens_seen": 67331760, "step": 115965 }, { "epoch": 17.272862675007445, "grad_norm": 1.4258917570114136, "learning_rate": 2.7791408558906245e-06, "loss": 0.4573, "num_input_tokens_seen": 67334864, "step": 115970 }, { "epoch": 17.273607387548406, "grad_norm": 2.0779871940612793, "learning_rate": 2.777652065677766e-06, "loss": 0.7966, "num_input_tokens_seen": 67337840, "step": 115975 }, { "epoch": 17.274352100089367, "grad_norm": 1.3366700410842896, "learning_rate": 2.7761636508888995e-06, "loss": 0.6834, "num_input_tokens_seen": 67340752, "step": 115980 }, { "epoch": 17.275096812630323, "grad_norm": 1.5527582168579102, "learning_rate": 2.774675611549152e-06, "loss": 0.4676, "num_input_tokens_seen": 67343856, "step": 115985 }, { "epoch": 17.275841525171284, "grad_norm": 1.819974660873413, "learning_rate": 2.773187947683678e-06, "loss": 0.6257, "num_input_tokens_seen": 67346704, "step": 115990 }, { "epoch": 17.276586237712245, "grad_norm": 1.5272818803787231, "learning_rate": 2.7717006593175997e-06, "loss": 0.4868, "num_input_tokens_seen": 67349360, "step": 115995 }, { "epoch": 17.2773309502532, "grad_norm": 0.7152654528617859, "learning_rate": 2.7702137464760497e-06, "loss": 0.4387, "num_input_tokens_seen": 67352752, "step": 116000 }, { "epoch": 17.278075662794162, "grad_norm": 1.3551350831985474, "learning_rate": 2.768727209184141e-06, "loss": 0.6412, "num_input_tokens_seen": 67355440, "step": 116005 }, { "epoch": 17.27882037533512, "grad_norm": 1.6470085382461548, "learning_rate": 2.767241047466998e-06, "loss": 0.5412, "num_input_tokens_seen": 67358256, "step": 116010 }, { "epoch": 17.27956508787608, "grad_norm": 1.1381813287734985, "learning_rate": 2.765755261349717e-06, "loss": 0.4838, "num_input_tokens_seen": 67361072, "step": 116015 }, { "epoch": 17.28030980041704, "grad_norm": 1.6493496894836426, "learning_rate": 2.764269850857401e-06, "loss": 0.6392, "num_input_tokens_seen": 67363888, "step": 116020 }, { "epoch": 17.281054512957997, "grad_norm": 2.7256693840026855, "learning_rate": 2.7627848160151513e-06, "loss": 0.6336, "num_input_tokens_seen": 67366768, "step": 116025 }, { "epoch": 17.281799225498958, "grad_norm": 2.7305006980895996, "learning_rate": 2.7613001568480514e-06, "loss": 0.5564, "num_input_tokens_seen": 67369872, "step": 116030 }, { "epoch": 17.282543938039918, "grad_norm": 1.7517635822296143, "learning_rate": 2.759815873381183e-06, "loss": 0.564, "num_input_tokens_seen": 67372944, "step": 116035 }, { "epoch": 17.283288650580875, "grad_norm": 1.782002329826355, "learning_rate": 2.7583319656396155e-06, "loss": 0.5985, "num_input_tokens_seen": 67376208, "step": 116040 }, { "epoch": 17.284033363121836, "grad_norm": 1.2644098997116089, "learning_rate": 2.756848433648429e-06, "loss": 0.6456, "num_input_tokens_seen": 67378832, "step": 116045 }, { "epoch": 17.284778075662793, "grad_norm": 1.9272968769073486, "learning_rate": 2.755365277432681e-06, "loss": 0.6994, "num_input_tokens_seen": 67381776, "step": 116050 }, { "epoch": 17.285522788203753, "grad_norm": 2.495039463043213, "learning_rate": 2.753882497017424e-06, "loss": 0.6524, "num_input_tokens_seen": 67384784, "step": 116055 }, { "epoch": 17.286267500744714, "grad_norm": 3.287262439727783, "learning_rate": 2.7524000924277178e-06, "loss": 0.4502, "num_input_tokens_seen": 67387824, "step": 116060 }, { "epoch": 17.28701221328567, "grad_norm": 0.9245439171791077, "learning_rate": 2.7509180636885927e-06, "loss": 0.5398, "num_input_tokens_seen": 67390832, "step": 116065 }, { "epoch": 17.28775692582663, "grad_norm": 1.6238511800765991, "learning_rate": 2.7494364108251016e-06, "loss": 0.5546, "num_input_tokens_seen": 67393808, "step": 116070 }, { "epoch": 17.288501638367592, "grad_norm": 1.3905869722366333, "learning_rate": 2.747955133862262e-06, "loss": 0.4999, "num_input_tokens_seen": 67396944, "step": 116075 }, { "epoch": 17.28924635090855, "grad_norm": 2.1275408267974854, "learning_rate": 2.746474232825111e-06, "loss": 0.5466, "num_input_tokens_seen": 67399696, "step": 116080 }, { "epoch": 17.28999106344951, "grad_norm": 3.053846836090088, "learning_rate": 2.744993707738655e-06, "loss": 0.6123, "num_input_tokens_seen": 67402288, "step": 116085 }, { "epoch": 17.290735775990466, "grad_norm": 1.5323197841644287, "learning_rate": 2.7435135586279165e-06, "loss": 0.5605, "num_input_tokens_seen": 67405136, "step": 116090 }, { "epoch": 17.291480488531427, "grad_norm": 0.4102765619754791, "learning_rate": 2.7420337855178944e-06, "loss": 0.3603, "num_input_tokens_seen": 67407920, "step": 116095 }, { "epoch": 17.292225201072387, "grad_norm": 1.910326600074768, "learning_rate": 2.7405543884335887e-06, "loss": 0.5376, "num_input_tokens_seen": 67410832, "step": 116100 }, { "epoch": 17.292969913613344, "grad_norm": 2.0403387546539307, "learning_rate": 2.739075367399996e-06, "loss": 0.5494, "num_input_tokens_seen": 67413616, "step": 116105 }, { "epoch": 17.293714626154305, "grad_norm": 1.6645334959030151, "learning_rate": 2.7375967224420928e-06, "loss": 0.4935, "num_input_tokens_seen": 67416528, "step": 116110 }, { "epoch": 17.294459338695262, "grad_norm": 2.027353048324585, "learning_rate": 2.736118453584871e-06, "loss": 0.457, "num_input_tokens_seen": 67419504, "step": 116115 }, { "epoch": 17.295204051236222, "grad_norm": 1.265906572341919, "learning_rate": 2.7346405608532965e-06, "loss": 0.6029, "num_input_tokens_seen": 67422384, "step": 116120 }, { "epoch": 17.295948763777183, "grad_norm": 1.6655380725860596, "learning_rate": 2.7331630442723466e-06, "loss": 0.7193, "num_input_tokens_seen": 67425488, "step": 116125 }, { "epoch": 17.29669347631814, "grad_norm": 1.0996670722961426, "learning_rate": 2.7316859038669736e-06, "loss": 0.5066, "num_input_tokens_seen": 67428592, "step": 116130 }, { "epoch": 17.2974381888591, "grad_norm": 0.9632664918899536, "learning_rate": 2.7302091396621294e-06, "loss": 0.5376, "num_input_tokens_seen": 67431312, "step": 116135 }, { "epoch": 17.29818290140006, "grad_norm": 2.2681725025177, "learning_rate": 2.7287327516827748e-06, "loss": 0.6299, "num_input_tokens_seen": 67434128, "step": 116140 }, { "epoch": 17.298927613941018, "grad_norm": 1.6315829753875732, "learning_rate": 2.7272567399538375e-06, "loss": 0.8005, "num_input_tokens_seen": 67436912, "step": 116145 }, { "epoch": 17.29967232648198, "grad_norm": 1.7369887828826904, "learning_rate": 2.725781104500269e-06, "loss": 0.5908, "num_input_tokens_seen": 67439760, "step": 116150 }, { "epoch": 17.300417039022935, "grad_norm": 1.3394386768341064, "learning_rate": 2.7243058453469835e-06, "loss": 0.7556, "num_input_tokens_seen": 67442672, "step": 116155 }, { "epoch": 17.301161751563896, "grad_norm": 1.6368027925491333, "learning_rate": 2.722830962518913e-06, "loss": 0.6363, "num_input_tokens_seen": 67445328, "step": 116160 }, { "epoch": 17.301906464104857, "grad_norm": 1.258332371711731, "learning_rate": 2.7213564560409743e-06, "loss": 0.7178, "num_input_tokens_seen": 67448176, "step": 116165 }, { "epoch": 17.302651176645814, "grad_norm": 2.366704225540161, "learning_rate": 2.7198823259380777e-06, "loss": 0.5966, "num_input_tokens_seen": 67450960, "step": 116170 }, { "epoch": 17.303395889186774, "grad_norm": 1.2495020627975464, "learning_rate": 2.7184085722351205e-06, "loss": 0.6275, "num_input_tokens_seen": 67454000, "step": 116175 }, { "epoch": 17.304140601727735, "grad_norm": 2.636687994003296, "learning_rate": 2.7169351949570017e-06, "loss": 0.6409, "num_input_tokens_seen": 67456976, "step": 116180 }, { "epoch": 17.30488531426869, "grad_norm": 1.9201674461364746, "learning_rate": 2.7154621941286206e-06, "loss": 0.7108, "num_input_tokens_seen": 67459856, "step": 116185 }, { "epoch": 17.305630026809652, "grad_norm": 2.172562837600708, "learning_rate": 2.7139895697748496e-06, "loss": 0.6278, "num_input_tokens_seen": 67462640, "step": 116190 }, { "epoch": 17.30637473935061, "grad_norm": 2.235976457595825, "learning_rate": 2.7125173219205824e-06, "loss": 0.6295, "num_input_tokens_seen": 67465840, "step": 116195 }, { "epoch": 17.30711945189157, "grad_norm": 1.3095636367797852, "learning_rate": 2.711045450590677e-06, "loss": 0.7843, "num_input_tokens_seen": 67468656, "step": 116200 }, { "epoch": 17.30786416443253, "grad_norm": 0.8538109064102173, "learning_rate": 2.7095739558100074e-06, "loss": 0.5184, "num_input_tokens_seen": 67471600, "step": 116205 }, { "epoch": 17.308608876973487, "grad_norm": 1.215710163116455, "learning_rate": 2.708102837603435e-06, "loss": 0.5728, "num_input_tokens_seen": 67474256, "step": 116210 }, { "epoch": 17.309353589514448, "grad_norm": 1.167962670326233, "learning_rate": 2.706632095995801e-06, "loss": 0.6187, "num_input_tokens_seen": 67477072, "step": 116215 }, { "epoch": 17.31009830205541, "grad_norm": 1.6467512845993042, "learning_rate": 2.7051617310119653e-06, "loss": 0.5531, "num_input_tokens_seen": 67479920, "step": 116220 }, { "epoch": 17.310843014596365, "grad_norm": 0.8801074028015137, "learning_rate": 2.7036917426767615e-06, "loss": 0.5023, "num_input_tokens_seen": 67482704, "step": 116225 }, { "epoch": 17.311587727137326, "grad_norm": 1.272439956665039, "learning_rate": 2.702222131015028e-06, "loss": 0.5464, "num_input_tokens_seen": 67485584, "step": 116230 }, { "epoch": 17.312332439678283, "grad_norm": 1.2096244096755981, "learning_rate": 2.700752896051581e-06, "loss": 0.735, "num_input_tokens_seen": 67488432, "step": 116235 }, { "epoch": 17.313077152219243, "grad_norm": 2.8183085918426514, "learning_rate": 2.699284037811256e-06, "loss": 0.7967, "num_input_tokens_seen": 67491248, "step": 116240 }, { "epoch": 17.313821864760204, "grad_norm": 0.9192330837249756, "learning_rate": 2.6978155563188583e-06, "loss": 0.3802, "num_input_tokens_seen": 67494256, "step": 116245 }, { "epoch": 17.31456657730116, "grad_norm": 1.7734463214874268, "learning_rate": 2.6963474515992044e-06, "loss": 0.7979, "num_input_tokens_seen": 67496880, "step": 116250 }, { "epoch": 17.31531128984212, "grad_norm": 2.87323260307312, "learning_rate": 2.6948797236770907e-06, "loss": 0.674, "num_input_tokens_seen": 67499760, "step": 116255 }, { "epoch": 17.316056002383082, "grad_norm": 0.8463721871376038, "learning_rate": 2.6934123725773088e-06, "loss": 0.4086, "num_input_tokens_seen": 67502544, "step": 116260 }, { "epoch": 17.31680071492404, "grad_norm": 1.0040557384490967, "learning_rate": 2.6919453983246577e-06, "loss": 0.784, "num_input_tokens_seen": 67505360, "step": 116265 }, { "epoch": 17.317545427465, "grad_norm": 1.226261854171753, "learning_rate": 2.690478800943913e-06, "loss": 0.3791, "num_input_tokens_seen": 67508336, "step": 116270 }, { "epoch": 17.318290140005956, "grad_norm": 1.2816708087921143, "learning_rate": 2.68901258045986e-06, "loss": 0.5435, "num_input_tokens_seen": 67511376, "step": 116275 }, { "epoch": 17.319034852546917, "grad_norm": 1.4293229579925537, "learning_rate": 2.6875467368972563e-06, "loss": 0.6971, "num_input_tokens_seen": 67514448, "step": 116280 }, { "epoch": 17.319779565087877, "grad_norm": 1.5184013843536377, "learning_rate": 2.6860812702808795e-06, "loss": 0.6776, "num_input_tokens_seen": 67517104, "step": 116285 }, { "epoch": 17.320524277628834, "grad_norm": 0.8584113717079163, "learning_rate": 2.6846161806354826e-06, "loss": 0.52, "num_input_tokens_seen": 67519856, "step": 116290 }, { "epoch": 17.321268990169795, "grad_norm": 1.4821416139602661, "learning_rate": 2.6831514679858115e-06, "loss": 0.5641, "num_input_tokens_seen": 67522832, "step": 116295 }, { "epoch": 17.322013702710752, "grad_norm": 1.480578064918518, "learning_rate": 2.6816871323566165e-06, "loss": 0.6702, "num_input_tokens_seen": 67525712, "step": 116300 }, { "epoch": 17.322758415251712, "grad_norm": 1.9995189905166626, "learning_rate": 2.6802231737726307e-06, "loss": 0.667, "num_input_tokens_seen": 67528528, "step": 116305 }, { "epoch": 17.323503127792673, "grad_norm": 3.4719765186309814, "learning_rate": 2.6787595922585924e-06, "loss": 0.5854, "num_input_tokens_seen": 67531312, "step": 116310 }, { "epoch": 17.32424784033363, "grad_norm": 1.4246772527694702, "learning_rate": 2.6772963878392177e-06, "loss": 0.6806, "num_input_tokens_seen": 67534032, "step": 116315 }, { "epoch": 17.32499255287459, "grad_norm": 1.7798820734024048, "learning_rate": 2.67583356053924e-06, "loss": 0.5628, "num_input_tokens_seen": 67536752, "step": 116320 }, { "epoch": 17.32573726541555, "grad_norm": 2.774224042892456, "learning_rate": 2.6743711103833614e-06, "loss": 0.4013, "num_input_tokens_seen": 67539792, "step": 116325 }, { "epoch": 17.326481977956508, "grad_norm": 2.0906567573547363, "learning_rate": 2.6729090373962957e-06, "loss": 0.6408, "num_input_tokens_seen": 67542704, "step": 116330 }, { "epoch": 17.32722669049747, "grad_norm": 2.371169090270996, "learning_rate": 2.67144734160274e-06, "loss": 0.597, "num_input_tokens_seen": 67545712, "step": 116335 }, { "epoch": 17.327971403038426, "grad_norm": 2.557701826095581, "learning_rate": 2.669986023027382e-06, "loss": 0.5833, "num_input_tokens_seen": 67548592, "step": 116340 }, { "epoch": 17.328716115579386, "grad_norm": 2.0695791244506836, "learning_rate": 2.6685250816949196e-06, "loss": 0.639, "num_input_tokens_seen": 67551632, "step": 116345 }, { "epoch": 17.329460828120347, "grad_norm": 1.9114772081375122, "learning_rate": 2.6670645176300246e-06, "loss": 0.576, "num_input_tokens_seen": 67554800, "step": 116350 }, { "epoch": 17.330205540661304, "grad_norm": 1.9541651010513306, "learning_rate": 2.6656043308573826e-06, "loss": 1.091, "num_input_tokens_seen": 67557552, "step": 116355 }, { "epoch": 17.330950253202264, "grad_norm": 1.4003558158874512, "learning_rate": 2.664144521401654e-06, "loss": 0.4743, "num_input_tokens_seen": 67560944, "step": 116360 }, { "epoch": 17.331694965743225, "grad_norm": 2.2899088859558105, "learning_rate": 2.6626850892875033e-06, "loss": 0.3666, "num_input_tokens_seen": 67563664, "step": 116365 }, { "epoch": 17.33243967828418, "grad_norm": 1.174144983291626, "learning_rate": 2.6612260345395797e-06, "loss": 0.6703, "num_input_tokens_seen": 67566320, "step": 116370 }, { "epoch": 17.333184390825142, "grad_norm": 2.7692856788635254, "learning_rate": 2.6597673571825436e-06, "loss": 0.6991, "num_input_tokens_seen": 67569136, "step": 116375 }, { "epoch": 17.3339291033661, "grad_norm": 2.547654628753662, "learning_rate": 2.658309057241032e-06, "loss": 0.6611, "num_input_tokens_seen": 67571792, "step": 116380 }, { "epoch": 17.33467381590706, "grad_norm": 1.1486653089523315, "learning_rate": 2.6568511347396795e-06, "loss": 0.5987, "num_input_tokens_seen": 67574832, "step": 116385 }, { "epoch": 17.33541852844802, "grad_norm": 1.865315556526184, "learning_rate": 2.6553935897031203e-06, "loss": 0.487, "num_input_tokens_seen": 67577904, "step": 116390 }, { "epoch": 17.336163240988977, "grad_norm": 1.186225175857544, "learning_rate": 2.6539364221559725e-06, "loss": 0.7809, "num_input_tokens_seen": 67580816, "step": 116395 }, { "epoch": 17.336907953529938, "grad_norm": 1.1942477226257324, "learning_rate": 2.652479632122862e-06, "loss": 0.6933, "num_input_tokens_seen": 67583760, "step": 116400 }, { "epoch": 17.3376526660709, "grad_norm": 1.416257381439209, "learning_rate": 2.65102321962839e-06, "loss": 0.5142, "num_input_tokens_seen": 67586960, "step": 116405 }, { "epoch": 17.338397378611855, "grad_norm": 1.5628968477249146, "learning_rate": 2.6495671846971716e-06, "loss": 0.684, "num_input_tokens_seen": 67589776, "step": 116410 }, { "epoch": 17.339142091152816, "grad_norm": 2.701737880706787, "learning_rate": 2.6481115273538e-06, "loss": 0.6765, "num_input_tokens_seen": 67592592, "step": 116415 }, { "epoch": 17.339886803693773, "grad_norm": 1.4045076370239258, "learning_rate": 2.6466562476228612e-06, "loss": 0.4425, "num_input_tokens_seen": 67595760, "step": 116420 }, { "epoch": 17.340631516234733, "grad_norm": 1.6092547178268433, "learning_rate": 2.645201345528953e-06, "loss": 0.4297, "num_input_tokens_seen": 67598864, "step": 116425 }, { "epoch": 17.341376228775694, "grad_norm": 1.6273328065872192, "learning_rate": 2.643746821096646e-06, "loss": 0.6376, "num_input_tokens_seen": 67601744, "step": 116430 }, { "epoch": 17.34212094131665, "grad_norm": 2.8986356258392334, "learning_rate": 2.6422926743505132e-06, "loss": 0.5966, "num_input_tokens_seen": 67604592, "step": 116435 }, { "epoch": 17.34286565385761, "grad_norm": 2.6087253093719482, "learning_rate": 2.6408389053151185e-06, "loss": 0.5421, "num_input_tokens_seen": 67607376, "step": 116440 }, { "epoch": 17.343610366398572, "grad_norm": 3.6215243339538574, "learning_rate": 2.6393855140150304e-06, "loss": 0.5514, "num_input_tokens_seen": 67609872, "step": 116445 }, { "epoch": 17.34435507893953, "grad_norm": 2.05358624458313, "learning_rate": 2.6379325004747937e-06, "loss": 0.5407, "num_input_tokens_seen": 67613456, "step": 116450 }, { "epoch": 17.34509979148049, "grad_norm": 1.656511664390564, "learning_rate": 2.636479864718966e-06, "loss": 0.7242, "num_input_tokens_seen": 67616336, "step": 116455 }, { "epoch": 17.345844504021446, "grad_norm": 1.6636992692947388, "learning_rate": 2.635027606772078e-06, "loss": 0.4545, "num_input_tokens_seen": 67618992, "step": 116460 }, { "epoch": 17.346589216562407, "grad_norm": 2.07121205329895, "learning_rate": 2.633575726658666e-06, "loss": 0.7876, "num_input_tokens_seen": 67622032, "step": 116465 }, { "epoch": 17.347333929103367, "grad_norm": 1.7017511129379272, "learning_rate": 2.632124224403262e-06, "loss": 0.5324, "num_input_tokens_seen": 67624784, "step": 116470 }, { "epoch": 17.348078641644324, "grad_norm": 2.8639416694641113, "learning_rate": 2.6306731000303842e-06, "loss": 0.532, "num_input_tokens_seen": 67627856, "step": 116475 }, { "epoch": 17.348823354185285, "grad_norm": 1.3340048789978027, "learning_rate": 2.6292223535645507e-06, "loss": 0.5464, "num_input_tokens_seen": 67630544, "step": 116480 }, { "epoch": 17.349568066726242, "grad_norm": 1.5469518899917603, "learning_rate": 2.627771985030264e-06, "loss": 0.5405, "num_input_tokens_seen": 67633360, "step": 116485 }, { "epoch": 17.350312779267203, "grad_norm": 2.767695426940918, "learning_rate": 2.6263219944520383e-06, "loss": 0.6137, "num_input_tokens_seen": 67636240, "step": 116490 }, { "epoch": 17.351057491808163, "grad_norm": 1.363134503364563, "learning_rate": 2.6248723818543625e-06, "loss": 0.4895, "num_input_tokens_seen": 67639376, "step": 116495 }, { "epoch": 17.35180220434912, "grad_norm": 1.3473517894744873, "learning_rate": 2.6234231472617276e-06, "loss": 0.8934, "num_input_tokens_seen": 67642352, "step": 116500 }, { "epoch": 17.35254691689008, "grad_norm": 2.302757978439331, "learning_rate": 2.6219742906986143e-06, "loss": 0.551, "num_input_tokens_seen": 67645136, "step": 116505 }, { "epoch": 17.35329162943104, "grad_norm": 1.4854776859283447, "learning_rate": 2.6205258121894976e-06, "loss": 0.4838, "num_input_tokens_seen": 67647888, "step": 116510 }, { "epoch": 17.354036341971998, "grad_norm": 1.2205770015716553, "learning_rate": 2.619077711758858e-06, "loss": 0.5304, "num_input_tokens_seen": 67651184, "step": 116515 }, { "epoch": 17.35478105451296, "grad_norm": 0.9648435115814209, "learning_rate": 2.6176299894311444e-06, "loss": 0.6119, "num_input_tokens_seen": 67654448, "step": 116520 }, { "epoch": 17.355525767053916, "grad_norm": 1.4263811111450195, "learning_rate": 2.616182645230833e-06, "loss": 0.459, "num_input_tokens_seen": 67657232, "step": 116525 }, { "epoch": 17.356270479594876, "grad_norm": 1.483860969543457, "learning_rate": 2.614735679182359e-06, "loss": 0.6034, "num_input_tokens_seen": 67659888, "step": 116530 }, { "epoch": 17.357015192135837, "grad_norm": 1.607183575630188, "learning_rate": 2.6132890913101783e-06, "loss": 0.5824, "num_input_tokens_seen": 67662544, "step": 116535 }, { "epoch": 17.357759904676794, "grad_norm": 2.0911338329315186, "learning_rate": 2.6118428816387265e-06, "loss": 0.5347, "num_input_tokens_seen": 67665552, "step": 116540 }, { "epoch": 17.358504617217754, "grad_norm": 1.5092592239379883, "learning_rate": 2.610397050192431e-06, "loss": 0.7691, "num_input_tokens_seen": 67668688, "step": 116545 }, { "epoch": 17.359249329758715, "grad_norm": 2.1407294273376465, "learning_rate": 2.6089515969957263e-06, "loss": 0.6012, "num_input_tokens_seen": 67671536, "step": 116550 }, { "epoch": 17.35999404229967, "grad_norm": 3.0655081272125244, "learning_rate": 2.607506522073025e-06, "loss": 0.553, "num_input_tokens_seen": 67674288, "step": 116555 }, { "epoch": 17.360738754840632, "grad_norm": 3.0206985473632812, "learning_rate": 2.606061825448744e-06, "loss": 0.6643, "num_input_tokens_seen": 67677008, "step": 116560 }, { "epoch": 17.36148346738159, "grad_norm": 3.0847277641296387, "learning_rate": 2.6046175071472835e-06, "loss": 0.579, "num_input_tokens_seen": 67679664, "step": 116565 }, { "epoch": 17.36222817992255, "grad_norm": 2.105170726776123, "learning_rate": 2.603173567193054e-06, "loss": 0.588, "num_input_tokens_seen": 67682544, "step": 116570 }, { "epoch": 17.36297289246351, "grad_norm": 1.8155275583267212, "learning_rate": 2.601730005610442e-06, "loss": 0.4796, "num_input_tokens_seen": 67685616, "step": 116575 }, { "epoch": 17.363717605004467, "grad_norm": 1.9452881813049316, "learning_rate": 2.6002868224238334e-06, "loss": 0.6568, "num_input_tokens_seen": 67688688, "step": 116580 }, { "epoch": 17.364462317545428, "grad_norm": 2.4985580444335938, "learning_rate": 2.5988440176576172e-06, "loss": 0.6298, "num_input_tokens_seen": 67691472, "step": 116585 }, { "epoch": 17.36520703008639, "grad_norm": 1.7571053504943848, "learning_rate": 2.5974015913361597e-06, "loss": 0.7489, "num_input_tokens_seen": 67694352, "step": 116590 }, { "epoch": 17.365951742627345, "grad_norm": 1.5438053607940674, "learning_rate": 2.5959595434838363e-06, "loss": 0.6257, "num_input_tokens_seen": 67697200, "step": 116595 }, { "epoch": 17.366696455168306, "grad_norm": 0.7945802807807922, "learning_rate": 2.594517874125005e-06, "loss": 0.6802, "num_input_tokens_seen": 67699952, "step": 116600 }, { "epoch": 17.367441167709263, "grad_norm": 1.7573599815368652, "learning_rate": 2.5930765832840238e-06, "loss": 0.621, "num_input_tokens_seen": 67702928, "step": 116605 }, { "epoch": 17.368185880250223, "grad_norm": 1.9922144412994385, "learning_rate": 2.5916356709852373e-06, "loss": 0.555, "num_input_tokens_seen": 67705872, "step": 116610 }, { "epoch": 17.368930592791184, "grad_norm": 0.9800112843513489, "learning_rate": 2.5901951372529933e-06, "loss": 0.4869, "num_input_tokens_seen": 67708528, "step": 116615 }, { "epoch": 17.36967530533214, "grad_norm": 3.595702648162842, "learning_rate": 2.5887549821116297e-06, "loss": 0.7023, "num_input_tokens_seen": 67711568, "step": 116620 }, { "epoch": 17.3704200178731, "grad_norm": 1.98032808303833, "learning_rate": 2.5873152055854694e-06, "loss": 0.6002, "num_input_tokens_seen": 67714480, "step": 116625 }, { "epoch": 17.37116473041406, "grad_norm": 1.3702242374420166, "learning_rate": 2.5858758076988425e-06, "loss": 0.653, "num_input_tokens_seen": 67717296, "step": 116630 }, { "epoch": 17.37190944295502, "grad_norm": 1.246930718421936, "learning_rate": 2.5844367884760577e-06, "loss": 0.5212, "num_input_tokens_seen": 67720144, "step": 116635 }, { "epoch": 17.37265415549598, "grad_norm": 0.9762150049209595, "learning_rate": 2.5829981479414346e-06, "loss": 0.5656, "num_input_tokens_seen": 67723056, "step": 116640 }, { "epoch": 17.373398868036936, "grad_norm": 1.9955952167510986, "learning_rate": 2.58155988611927e-06, "loss": 0.6752, "num_input_tokens_seen": 67725936, "step": 116645 }, { "epoch": 17.374143580577897, "grad_norm": 1.1848300695419312, "learning_rate": 2.580122003033872e-06, "loss": 0.6595, "num_input_tokens_seen": 67728880, "step": 116650 }, { "epoch": 17.374888293118858, "grad_norm": 1.505780816078186, "learning_rate": 2.578684498709524e-06, "loss": 0.5719, "num_input_tokens_seen": 67731856, "step": 116655 }, { "epoch": 17.375633005659815, "grad_norm": 1.5504380464553833, "learning_rate": 2.5772473731705106e-06, "loss": 0.6829, "num_input_tokens_seen": 67734448, "step": 116660 }, { "epoch": 17.376377718200775, "grad_norm": 1.2710227966308594, "learning_rate": 2.5758106264411193e-06, "loss": 0.4497, "num_input_tokens_seen": 67737360, "step": 116665 }, { "epoch": 17.377122430741732, "grad_norm": 1.7195714712142944, "learning_rate": 2.574374258545609e-06, "loss": 0.7009, "num_input_tokens_seen": 67740432, "step": 116670 }, { "epoch": 17.377867143282693, "grad_norm": 0.8604565858840942, "learning_rate": 2.5729382695082572e-06, "loss": 0.4704, "num_input_tokens_seen": 67743344, "step": 116675 }, { "epoch": 17.378611855823653, "grad_norm": 1.701527714729309, "learning_rate": 2.5715026593533172e-06, "loss": 0.598, "num_input_tokens_seen": 67746128, "step": 116680 }, { "epoch": 17.37935656836461, "grad_norm": 1.1891478300094604, "learning_rate": 2.5700674281050496e-06, "loss": 0.5932, "num_input_tokens_seen": 67749200, "step": 116685 }, { "epoch": 17.38010128090557, "grad_norm": 1.7010059356689453, "learning_rate": 2.568632575787694e-06, "loss": 0.458, "num_input_tokens_seen": 67751920, "step": 116690 }, { "epoch": 17.38084599344653, "grad_norm": 1.1585553884506226, "learning_rate": 2.567198102425494e-06, "loss": 0.5825, "num_input_tokens_seen": 67754864, "step": 116695 }, { "epoch": 17.381590705987488, "grad_norm": 1.7651597261428833, "learning_rate": 2.5657640080426815e-06, "loss": 0.5713, "num_input_tokens_seen": 67757744, "step": 116700 }, { "epoch": 17.38233541852845, "grad_norm": 3.0407752990722656, "learning_rate": 2.5643302926634804e-06, "loss": 0.7631, "num_input_tokens_seen": 67760400, "step": 116705 }, { "epoch": 17.383080131069406, "grad_norm": 2.357652425765991, "learning_rate": 2.562896956312122e-06, "loss": 0.6584, "num_input_tokens_seen": 67763280, "step": 116710 }, { "epoch": 17.383824843610366, "grad_norm": 1.1849344968795776, "learning_rate": 2.5614639990128113e-06, "loss": 0.5025, "num_input_tokens_seen": 67766064, "step": 116715 }, { "epoch": 17.384569556151327, "grad_norm": 1.1533807516098022, "learning_rate": 2.560031420789763e-06, "loss": 0.6996, "num_input_tokens_seen": 67768752, "step": 116720 }, { "epoch": 17.385314268692284, "grad_norm": 1.750995397567749, "learning_rate": 2.558599221667174e-06, "loss": 0.5986, "num_input_tokens_seen": 67771728, "step": 116725 }, { "epoch": 17.386058981233244, "grad_norm": 1.6601312160491943, "learning_rate": 2.557167401669247e-06, "loss": 0.693, "num_input_tokens_seen": 67774864, "step": 116730 }, { "epoch": 17.386803693774205, "grad_norm": 0.8466936945915222, "learning_rate": 2.555735960820169e-06, "loss": 0.4674, "num_input_tokens_seen": 67777968, "step": 116735 }, { "epoch": 17.38754840631516, "grad_norm": 1.4947090148925781, "learning_rate": 2.554304899144111e-06, "loss": 0.7384, "num_input_tokens_seen": 67781232, "step": 116740 }, { "epoch": 17.388293118856122, "grad_norm": 3.2092764377593994, "learning_rate": 2.552874216665269e-06, "loss": 0.6457, "num_input_tokens_seen": 67784112, "step": 116745 }, { "epoch": 17.38903783139708, "grad_norm": 2.315803289413452, "learning_rate": 2.5514439134077945e-06, "loss": 0.6789, "num_input_tokens_seen": 67787120, "step": 116750 }, { "epoch": 17.38978254393804, "grad_norm": 1.276746392250061, "learning_rate": 2.5500139893958663e-06, "loss": 0.5802, "num_input_tokens_seen": 67790032, "step": 116755 }, { "epoch": 17.390527256479, "grad_norm": 1.9573884010314941, "learning_rate": 2.5485844446536316e-06, "loss": 0.5927, "num_input_tokens_seen": 67793264, "step": 116760 }, { "epoch": 17.391271969019957, "grad_norm": 2.3747830390930176, "learning_rate": 2.5471552792052463e-06, "loss": 0.6134, "num_input_tokens_seen": 67796016, "step": 116765 }, { "epoch": 17.392016681560918, "grad_norm": 1.7617475986480713, "learning_rate": 2.545726493074849e-06, "loss": 0.5554, "num_input_tokens_seen": 67798896, "step": 116770 }, { "epoch": 17.39276139410188, "grad_norm": 3.9982903003692627, "learning_rate": 2.544298086286584e-06, "loss": 0.7305, "num_input_tokens_seen": 67801680, "step": 116775 }, { "epoch": 17.393506106642835, "grad_norm": 1.136736273765564, "learning_rate": 2.54287005886458e-06, "loss": 0.5719, "num_input_tokens_seen": 67804592, "step": 116780 }, { "epoch": 17.394250819183796, "grad_norm": 2.0717413425445557, "learning_rate": 2.541442410832959e-06, "loss": 0.5193, "num_input_tokens_seen": 67807632, "step": 116785 }, { "epoch": 17.394995531724753, "grad_norm": 3.304609537124634, "learning_rate": 2.540015142215846e-06, "loss": 0.4143, "num_input_tokens_seen": 67810512, "step": 116790 }, { "epoch": 17.395740244265713, "grad_norm": 0.8697295188903809, "learning_rate": 2.5385882530373438e-06, "loss": 0.4712, "num_input_tokens_seen": 67813168, "step": 116795 }, { "epoch": 17.396484956806674, "grad_norm": 2.549126386642456, "learning_rate": 2.53716174332157e-06, "loss": 0.6861, "num_input_tokens_seen": 67815856, "step": 116800 }, { "epoch": 17.39722966934763, "grad_norm": 2.7560107707977295, "learning_rate": 2.535735613092613e-06, "loss": 0.8631, "num_input_tokens_seen": 67818576, "step": 116805 }, { "epoch": 17.39797438188859, "grad_norm": 2.407625198364258, "learning_rate": 2.5343098623745787e-06, "loss": 0.4536, "num_input_tokens_seen": 67821200, "step": 116810 }, { "epoch": 17.39871909442955, "grad_norm": 1.238795280456543, "learning_rate": 2.532884491191542e-06, "loss": 0.6181, "num_input_tokens_seen": 67823984, "step": 116815 }, { "epoch": 17.39946380697051, "grad_norm": 2.2653961181640625, "learning_rate": 2.5314594995675845e-06, "loss": 0.5998, "num_input_tokens_seen": 67826704, "step": 116820 }, { "epoch": 17.40020851951147, "grad_norm": 1.0091606378555298, "learning_rate": 2.530034887526789e-06, "loss": 0.5304, "num_input_tokens_seen": 67829712, "step": 116825 }, { "epoch": 17.400953232052427, "grad_norm": 1.5768989324569702, "learning_rate": 2.5286106550932164e-06, "loss": 0.5294, "num_input_tokens_seen": 67832528, "step": 116830 }, { "epoch": 17.401697944593387, "grad_norm": 1.2859281301498413, "learning_rate": 2.527186802290926e-06, "loss": 0.4954, "num_input_tokens_seen": 67835440, "step": 116835 }, { "epoch": 17.402442657134348, "grad_norm": 1.2381930351257324, "learning_rate": 2.525763329143971e-06, "loss": 0.6002, "num_input_tokens_seen": 67838768, "step": 116840 }, { "epoch": 17.403187369675305, "grad_norm": 1.933698296546936, "learning_rate": 2.5243402356764063e-06, "loss": 0.6317, "num_input_tokens_seen": 67841776, "step": 116845 }, { "epoch": 17.403932082216265, "grad_norm": 1.1451733112335205, "learning_rate": 2.522917521912266e-06, "loss": 0.3811, "num_input_tokens_seen": 67844368, "step": 116850 }, { "epoch": 17.404676794757222, "grad_norm": 1.1740601062774658, "learning_rate": 2.5214951878755944e-06, "loss": 0.561, "num_input_tokens_seen": 67847376, "step": 116855 }, { "epoch": 17.405421507298183, "grad_norm": 1.755906581878662, "learning_rate": 2.520073233590414e-06, "loss": 0.5031, "num_input_tokens_seen": 67850096, "step": 116860 }, { "epoch": 17.406166219839143, "grad_norm": 3.023996591567993, "learning_rate": 2.5186516590807453e-06, "loss": 0.68, "num_input_tokens_seen": 67852720, "step": 116865 }, { "epoch": 17.4069109323801, "grad_norm": 1.7246953248977661, "learning_rate": 2.5172304643706123e-06, "loss": 0.566, "num_input_tokens_seen": 67855376, "step": 116870 }, { "epoch": 17.40765564492106, "grad_norm": 0.9283047914505005, "learning_rate": 2.515809649484016e-06, "loss": 0.4655, "num_input_tokens_seen": 67858320, "step": 116875 }, { "epoch": 17.40840035746202, "grad_norm": 1.6639121770858765, "learning_rate": 2.5143892144449676e-06, "loss": 0.4915, "num_input_tokens_seen": 67861168, "step": 116880 }, { "epoch": 17.409145070002978, "grad_norm": 1.4789848327636719, "learning_rate": 2.512969159277459e-06, "loss": 0.5159, "num_input_tokens_seen": 67863920, "step": 116885 }, { "epoch": 17.40988978254394, "grad_norm": 2.680811882019043, "learning_rate": 2.511549484005485e-06, "loss": 0.6455, "num_input_tokens_seen": 67866608, "step": 116890 }, { "epoch": 17.410634495084896, "grad_norm": 2.707038640975952, "learning_rate": 2.5101301886530177e-06, "loss": 0.6127, "num_input_tokens_seen": 67869616, "step": 116895 }, { "epoch": 17.411379207625856, "grad_norm": 1.2243804931640625, "learning_rate": 2.508711273244052e-06, "loss": 0.3185, "num_input_tokens_seen": 67872464, "step": 116900 }, { "epoch": 17.412123920166817, "grad_norm": 1.5271434783935547, "learning_rate": 2.5072927378025467e-06, "loss": 0.5465, "num_input_tokens_seen": 67875088, "step": 116905 }, { "epoch": 17.412868632707774, "grad_norm": 1.9663515090942383, "learning_rate": 2.505874582352466e-06, "loss": 0.5224, "num_input_tokens_seen": 67878320, "step": 116910 }, { "epoch": 17.413613345248734, "grad_norm": 2.2119739055633545, "learning_rate": 2.5044568069177794e-06, "loss": 0.542, "num_input_tokens_seen": 67881488, "step": 116915 }, { "epoch": 17.414358057789695, "grad_norm": 1.2943915128707886, "learning_rate": 2.503039411522423e-06, "loss": 0.6495, "num_input_tokens_seen": 67884400, "step": 116920 }, { "epoch": 17.415102770330652, "grad_norm": 2.9177770614624023, "learning_rate": 2.501622396190359e-06, "loss": 0.6839, "num_input_tokens_seen": 67887504, "step": 116925 }, { "epoch": 17.415847482871612, "grad_norm": 1.469003677368164, "learning_rate": 2.500205760945512e-06, "loss": 0.5335, "num_input_tokens_seen": 67890512, "step": 116930 }, { "epoch": 17.41659219541257, "grad_norm": 1.9307891130447388, "learning_rate": 2.4987895058118244e-06, "loss": 0.7002, "num_input_tokens_seen": 67893712, "step": 116935 }, { "epoch": 17.41733690795353, "grad_norm": 1.055801510810852, "learning_rate": 2.4973736308132214e-06, "loss": 0.5786, "num_input_tokens_seen": 67896176, "step": 116940 }, { "epoch": 17.41808162049449, "grad_norm": 1.6809695959091187, "learning_rate": 2.4959581359736137e-06, "loss": 0.6909, "num_input_tokens_seen": 67899248, "step": 116945 }, { "epoch": 17.418826333035447, "grad_norm": 3.4905292987823486, "learning_rate": 2.494543021316928e-06, "loss": 0.5981, "num_input_tokens_seen": 67902032, "step": 116950 }, { "epoch": 17.419571045576408, "grad_norm": 1.5058408975601196, "learning_rate": 2.4931282868670634e-06, "loss": 0.5373, "num_input_tokens_seen": 67905168, "step": 116955 }, { "epoch": 17.42031575811737, "grad_norm": 2.6167285442352295, "learning_rate": 2.4917139326479213e-06, "loss": 0.5556, "num_input_tokens_seen": 67907856, "step": 116960 }, { "epoch": 17.421060470658325, "grad_norm": 1.9457265138626099, "learning_rate": 2.4902999586833897e-06, "loss": 0.5276, "num_input_tokens_seen": 67910704, "step": 116965 }, { "epoch": 17.421805183199286, "grad_norm": 1.8458868265151978, "learning_rate": 2.48888636499737e-06, "loss": 0.7878, "num_input_tokens_seen": 67913328, "step": 116970 }, { "epoch": 17.422549895740243, "grad_norm": 1.382908821105957, "learning_rate": 2.4874731516137283e-06, "loss": 0.5893, "num_input_tokens_seen": 67916144, "step": 116975 }, { "epoch": 17.423294608281203, "grad_norm": 0.4234912395477295, "learning_rate": 2.486060318556352e-06, "loss": 0.5894, "num_input_tokens_seen": 67918832, "step": 116980 }, { "epoch": 17.424039320822164, "grad_norm": 1.6370320320129395, "learning_rate": 2.4846478658491075e-06, "loss": 0.4706, "num_input_tokens_seen": 67921872, "step": 116985 }, { "epoch": 17.42478403336312, "grad_norm": 1.388196587562561, "learning_rate": 2.483235793515845e-06, "loss": 0.5883, "num_input_tokens_seen": 67924688, "step": 116990 }, { "epoch": 17.42552874590408, "grad_norm": 1.5419412851333618, "learning_rate": 2.4818241015804376e-06, "loss": 0.6195, "num_input_tokens_seen": 67927472, "step": 116995 }, { "epoch": 17.42627345844504, "grad_norm": 1.2735368013381958, "learning_rate": 2.480412790066719e-06, "loss": 0.5702, "num_input_tokens_seen": 67930512, "step": 117000 }, { "epoch": 17.427018170986, "grad_norm": 1.8692858219146729, "learning_rate": 2.479001858998542e-06, "loss": 0.6948, "num_input_tokens_seen": 67933392, "step": 117005 }, { "epoch": 17.42776288352696, "grad_norm": 2.512951612472534, "learning_rate": 2.4775913083997348e-06, "loss": 0.753, "num_input_tokens_seen": 67936400, "step": 117010 }, { "epoch": 17.428507596067917, "grad_norm": 2.830962657928467, "learning_rate": 2.476181138294137e-06, "loss": 0.5096, "num_input_tokens_seen": 67939600, "step": 117015 }, { "epoch": 17.429252308608877, "grad_norm": 1.5290905237197876, "learning_rate": 2.474771348705568e-06, "loss": 0.5095, "num_input_tokens_seen": 67942480, "step": 117020 }, { "epoch": 17.429997021149838, "grad_norm": 1.1046802997589111, "learning_rate": 2.4733619396578422e-06, "loss": 0.4469, "num_input_tokens_seen": 67945328, "step": 117025 }, { "epoch": 17.430741733690795, "grad_norm": 1.6112797260284424, "learning_rate": 2.4719529111747715e-06, "loss": 0.5261, "num_input_tokens_seen": 67948080, "step": 117030 }, { "epoch": 17.431486446231755, "grad_norm": 1.6170793771743774, "learning_rate": 2.470544263280153e-06, "loss": 0.6003, "num_input_tokens_seen": 67950672, "step": 117035 }, { "epoch": 17.432231158772712, "grad_norm": 1.369805932044983, "learning_rate": 2.4691359959977985e-06, "loss": 0.5613, "num_input_tokens_seen": 67953648, "step": 117040 }, { "epoch": 17.432975871313673, "grad_norm": 1.8683900833129883, "learning_rate": 2.467728109351486e-06, "loss": 0.5854, "num_input_tokens_seen": 67956592, "step": 117045 }, { "epoch": 17.433720583854633, "grad_norm": 1.4620352983474731, "learning_rate": 2.466320603365013e-06, "loss": 0.615, "num_input_tokens_seen": 67959504, "step": 117050 }, { "epoch": 17.43446529639559, "grad_norm": 0.9222475290298462, "learning_rate": 2.4649134780621446e-06, "loss": 0.5324, "num_input_tokens_seen": 67962320, "step": 117055 }, { "epoch": 17.43521000893655, "grad_norm": 1.6765531301498413, "learning_rate": 2.463506733466667e-06, "loss": 0.6414, "num_input_tokens_seen": 67965040, "step": 117060 }, { "epoch": 17.43595472147751, "grad_norm": 2.20241641998291, "learning_rate": 2.4621003696023354e-06, "loss": 0.6719, "num_input_tokens_seen": 67967760, "step": 117065 }, { "epoch": 17.43669943401847, "grad_norm": 1.8523061275482178, "learning_rate": 2.4606943864929064e-06, "loss": 0.544, "num_input_tokens_seen": 67970736, "step": 117070 }, { "epoch": 17.43744414655943, "grad_norm": 1.4682871103286743, "learning_rate": 2.459288784162142e-06, "loss": 0.5464, "num_input_tokens_seen": 67973712, "step": 117075 }, { "epoch": 17.438188859100386, "grad_norm": 1.5744322538375854, "learning_rate": 2.457883562633781e-06, "loss": 0.5716, "num_input_tokens_seen": 67976368, "step": 117080 }, { "epoch": 17.438933571641346, "grad_norm": 1.5198733806610107, "learning_rate": 2.456478721931571e-06, "loss": 0.4176, "num_input_tokens_seen": 67978928, "step": 117085 }, { "epoch": 17.439678284182307, "grad_norm": 2.610384702682495, "learning_rate": 2.4550742620792404e-06, "loss": 0.5496, "num_input_tokens_seen": 67981584, "step": 117090 }, { "epoch": 17.440422996723264, "grad_norm": 2.024528741836548, "learning_rate": 2.4536701831005177e-06, "loss": 0.6809, "num_input_tokens_seen": 67984464, "step": 117095 }, { "epoch": 17.441167709264224, "grad_norm": 1.2724121809005737, "learning_rate": 2.4522664850191223e-06, "loss": 0.5684, "num_input_tokens_seen": 67987344, "step": 117100 }, { "epoch": 17.441912421805185, "grad_norm": 1.9764909744262695, "learning_rate": 2.4508631678587635e-06, "loss": 0.8684, "num_input_tokens_seen": 67990448, "step": 117105 }, { "epoch": 17.442657134346142, "grad_norm": 2.597287178039551, "learning_rate": 2.4494602316431554e-06, "loss": 0.5952, "num_input_tokens_seen": 67993360, "step": 117110 }, { "epoch": 17.443401846887102, "grad_norm": 1.9147788286209106, "learning_rate": 2.4480576763959956e-06, "loss": 0.4529, "num_input_tokens_seen": 67996048, "step": 117115 }, { "epoch": 17.44414655942806, "grad_norm": 1.7103317975997925, "learning_rate": 2.4466555021409848e-06, "loss": 0.4411, "num_input_tokens_seen": 67998992, "step": 117120 }, { "epoch": 17.44489127196902, "grad_norm": 0.9960169196128845, "learning_rate": 2.445253708901801e-06, "loss": 0.6082, "num_input_tokens_seen": 68002192, "step": 117125 }, { "epoch": 17.44563598450998, "grad_norm": 1.839087963104248, "learning_rate": 2.443852296702137e-06, "loss": 0.7288, "num_input_tokens_seen": 68005072, "step": 117130 }, { "epoch": 17.446380697050937, "grad_norm": 0.8147164583206177, "learning_rate": 2.4424512655656597e-06, "loss": 0.6571, "num_input_tokens_seen": 68007984, "step": 117135 }, { "epoch": 17.447125409591898, "grad_norm": 1.030275821685791, "learning_rate": 2.4410506155160467e-06, "loss": 0.5469, "num_input_tokens_seen": 68010832, "step": 117140 }, { "epoch": 17.447870122132855, "grad_norm": 2.4396092891693115, "learning_rate": 2.439650346576955e-06, "loss": 0.794, "num_input_tokens_seen": 68013520, "step": 117145 }, { "epoch": 17.448614834673815, "grad_norm": 1.221897006034851, "learning_rate": 2.438250458772037e-06, "loss": 0.5151, "num_input_tokens_seen": 68016304, "step": 117150 }, { "epoch": 17.449359547214776, "grad_norm": 1.210024356842041, "learning_rate": 2.436850952124953e-06, "loss": 0.7032, "num_input_tokens_seen": 68019376, "step": 117155 }, { "epoch": 17.450104259755733, "grad_norm": 1.2164427042007446, "learning_rate": 2.435451826659338e-06, "loss": 0.5152, "num_input_tokens_seen": 68022192, "step": 117160 }, { "epoch": 17.450848972296694, "grad_norm": 1.6901062726974487, "learning_rate": 2.4340530823988292e-06, "loss": 0.6044, "num_input_tokens_seen": 68025264, "step": 117165 }, { "epoch": 17.451593684837654, "grad_norm": 1.9974406957626343, "learning_rate": 2.4326547193670556e-06, "loss": 0.5906, "num_input_tokens_seen": 68027984, "step": 117170 }, { "epoch": 17.45233839737861, "grad_norm": 1.5424320697784424, "learning_rate": 2.4312567375876503e-06, "loss": 0.7105, "num_input_tokens_seen": 68030672, "step": 117175 }, { "epoch": 17.45308310991957, "grad_norm": 2.229140281677246, "learning_rate": 2.4298591370842227e-06, "loss": 0.7251, "num_input_tokens_seen": 68033392, "step": 117180 }, { "epoch": 17.45382782246053, "grad_norm": 0.9998250603675842, "learning_rate": 2.4284619178803814e-06, "loss": 0.6927, "num_input_tokens_seen": 68036272, "step": 117185 }, { "epoch": 17.45457253500149, "grad_norm": 1.508479118347168, "learning_rate": 2.427065079999741e-06, "loss": 0.4922, "num_input_tokens_seen": 68038960, "step": 117190 }, { "epoch": 17.45531724754245, "grad_norm": 2.5416619777679443, "learning_rate": 2.4256686234658877e-06, "loss": 0.6752, "num_input_tokens_seen": 68041904, "step": 117195 }, { "epoch": 17.456061960083407, "grad_norm": 1.2234814167022705, "learning_rate": 2.4242725483024252e-06, "loss": 0.4654, "num_input_tokens_seen": 68044752, "step": 117200 }, { "epoch": 17.456806672624367, "grad_norm": 1.9067925214767456, "learning_rate": 2.4228768545329267e-06, "loss": 0.5542, "num_input_tokens_seen": 68047440, "step": 117205 }, { "epoch": 17.457551385165328, "grad_norm": 1.65292227268219, "learning_rate": 2.421481542180984e-06, "loss": 0.5372, "num_input_tokens_seen": 68050448, "step": 117210 }, { "epoch": 17.458296097706285, "grad_norm": 2.2209632396698, "learning_rate": 2.4200866112701643e-06, "loss": 0.6133, "num_input_tokens_seen": 68053328, "step": 117215 }, { "epoch": 17.459040810247245, "grad_norm": 1.6036516427993774, "learning_rate": 2.4186920618240294e-06, "loss": 0.6242, "num_input_tokens_seen": 68056048, "step": 117220 }, { "epoch": 17.459785522788202, "grad_norm": 1.771240234375, "learning_rate": 2.417297893866138e-06, "loss": 0.5502, "num_input_tokens_seen": 68058640, "step": 117225 }, { "epoch": 17.460530235329163, "grad_norm": 1.2467029094696045, "learning_rate": 2.41590410742005e-06, "loss": 0.4135, "num_input_tokens_seen": 68061584, "step": 117230 }, { "epoch": 17.461274947870123, "grad_norm": 1.513379454612732, "learning_rate": 2.4145107025093095e-06, "loss": 0.4784, "num_input_tokens_seen": 68064432, "step": 117235 }, { "epoch": 17.46201966041108, "grad_norm": 1.5006850957870483, "learning_rate": 2.4131176791574504e-06, "loss": 0.5625, "num_input_tokens_seen": 68067312, "step": 117240 }, { "epoch": 17.46276437295204, "grad_norm": 3.1533002853393555, "learning_rate": 2.4117250373880184e-06, "loss": 0.6717, "num_input_tokens_seen": 68070064, "step": 117245 }, { "epoch": 17.463509085493, "grad_norm": 1.742094874382019, "learning_rate": 2.4103327772245275e-06, "loss": 0.635, "num_input_tokens_seen": 68073136, "step": 117250 }, { "epoch": 17.46425379803396, "grad_norm": 1.2853261232376099, "learning_rate": 2.408940898690512e-06, "loss": 0.5766, "num_input_tokens_seen": 68076304, "step": 117255 }, { "epoch": 17.46499851057492, "grad_norm": 1.4001997709274292, "learning_rate": 2.407549401809478e-06, "loss": 0.4766, "num_input_tokens_seen": 68079184, "step": 117260 }, { "epoch": 17.465743223115876, "grad_norm": 2.0899367332458496, "learning_rate": 2.4061582866049285e-06, "loss": 0.5137, "num_input_tokens_seen": 68082064, "step": 117265 }, { "epoch": 17.466487935656836, "grad_norm": 1.6689313650131226, "learning_rate": 2.4047675531003787e-06, "loss": 0.5968, "num_input_tokens_seen": 68085232, "step": 117270 }, { "epoch": 17.467232648197797, "grad_norm": 0.871898889541626, "learning_rate": 2.4033772013193123e-06, "loss": 0.4396, "num_input_tokens_seen": 68088176, "step": 117275 }, { "epoch": 17.467977360738754, "grad_norm": 0.8032339811325073, "learning_rate": 2.4019872312852244e-06, "loss": 0.5501, "num_input_tokens_seen": 68091504, "step": 117280 }, { "epoch": 17.468722073279714, "grad_norm": 1.1674609184265137, "learning_rate": 2.400597643021596e-06, "loss": 0.4669, "num_input_tokens_seen": 68094224, "step": 117285 }, { "epoch": 17.469466785820675, "grad_norm": 3.0240132808685303, "learning_rate": 2.3992084365519004e-06, "loss": 0.5835, "num_input_tokens_seen": 68097456, "step": 117290 }, { "epoch": 17.470211498361632, "grad_norm": 2.0261359214782715, "learning_rate": 2.3978196118996043e-06, "loss": 0.6249, "num_input_tokens_seen": 68100336, "step": 117295 }, { "epoch": 17.470956210902592, "grad_norm": 1.1434729099273682, "learning_rate": 2.3964311690881786e-06, "loss": 0.5748, "num_input_tokens_seen": 68103184, "step": 117300 }, { "epoch": 17.47170092344355, "grad_norm": 2.363279342651367, "learning_rate": 2.3950431081410734e-06, "loss": 0.8361, "num_input_tokens_seen": 68105872, "step": 117305 }, { "epoch": 17.47244563598451, "grad_norm": 1.820940375328064, "learning_rate": 2.393655429081737e-06, "loss": 0.4823, "num_input_tokens_seen": 68108656, "step": 117310 }, { "epoch": 17.47319034852547, "grad_norm": 1.8685611486434937, "learning_rate": 2.3922681319336197e-06, "loss": 0.5601, "num_input_tokens_seen": 68111376, "step": 117315 }, { "epoch": 17.473935061066427, "grad_norm": 2.3059749603271484, "learning_rate": 2.3908812167201532e-06, "loss": 0.7912, "num_input_tokens_seen": 68114384, "step": 117320 }, { "epoch": 17.474679773607388, "grad_norm": 2.339747428894043, "learning_rate": 2.3894946834647713e-06, "loss": 0.5815, "num_input_tokens_seen": 68117072, "step": 117325 }, { "epoch": 17.47542448614835, "grad_norm": 0.9926052093505859, "learning_rate": 2.3881085321908912e-06, "loss": 0.4725, "num_input_tokens_seen": 68120080, "step": 117330 }, { "epoch": 17.476169198689306, "grad_norm": 0.9132068157196045, "learning_rate": 2.3867227629219417e-06, "loss": 0.5484, "num_input_tokens_seen": 68122992, "step": 117335 }, { "epoch": 17.476913911230266, "grad_norm": 2.1265690326690674, "learning_rate": 2.385337375681329e-06, "loss": 0.6351, "num_input_tokens_seen": 68125680, "step": 117340 }, { "epoch": 17.477658623771223, "grad_norm": 1.4625556468963623, "learning_rate": 2.383952370492451e-06, "loss": 0.7022, "num_input_tokens_seen": 68128944, "step": 117345 }, { "epoch": 17.478403336312184, "grad_norm": 0.8570802211761475, "learning_rate": 2.3825677473787168e-06, "loss": 0.6137, "num_input_tokens_seen": 68131824, "step": 117350 }, { "epoch": 17.479148048853144, "grad_norm": 2.144716501235962, "learning_rate": 2.381183506363513e-06, "loss": 0.6571, "num_input_tokens_seen": 68134832, "step": 117355 }, { "epoch": 17.4798927613941, "grad_norm": 1.3767093420028687, "learning_rate": 2.3797996474702267e-06, "loss": 0.5783, "num_input_tokens_seen": 68137648, "step": 117360 }, { "epoch": 17.48063747393506, "grad_norm": 1.622134804725647, "learning_rate": 2.378416170722228e-06, "loss": 0.4602, "num_input_tokens_seen": 68140368, "step": 117365 }, { "epoch": 17.48138218647602, "grad_norm": 2.281170606613159, "learning_rate": 2.3770330761429012e-06, "loss": 0.6249, "num_input_tokens_seen": 68143440, "step": 117370 }, { "epoch": 17.48212689901698, "grad_norm": 0.962095320224762, "learning_rate": 2.375650363755605e-06, "loss": 0.4483, "num_input_tokens_seen": 68146064, "step": 117375 }, { "epoch": 17.48287161155794, "grad_norm": 2.759263277053833, "learning_rate": 2.3742680335837042e-06, "loss": 0.49, "num_input_tokens_seen": 68148944, "step": 117380 }, { "epoch": 17.483616324098897, "grad_norm": 1.2757221460342407, "learning_rate": 2.3728860856505526e-06, "loss": 0.4462, "num_input_tokens_seen": 68151984, "step": 117385 }, { "epoch": 17.484361036639857, "grad_norm": 2.026979923248291, "learning_rate": 2.371504519979484e-06, "loss": 0.4611, "num_input_tokens_seen": 68155120, "step": 117390 }, { "epoch": 17.485105749180818, "grad_norm": 1.8853925466537476, "learning_rate": 2.3701233365938547e-06, "loss": 0.5971, "num_input_tokens_seen": 68157904, "step": 117395 }, { "epoch": 17.485850461721775, "grad_norm": 1.758467197418213, "learning_rate": 2.368742535516988e-06, "loss": 0.5258, "num_input_tokens_seen": 68160528, "step": 117400 }, { "epoch": 17.486595174262735, "grad_norm": 2.475043535232544, "learning_rate": 2.3673621167722202e-06, "loss": 0.7014, "num_input_tokens_seen": 68163152, "step": 117405 }, { "epoch": 17.487339886803692, "grad_norm": 1.7266008853912354, "learning_rate": 2.3659820803828586e-06, "loss": 0.5829, "num_input_tokens_seen": 68166064, "step": 117410 }, { "epoch": 17.488084599344653, "grad_norm": 3.783301830291748, "learning_rate": 2.364602426372234e-06, "loss": 0.507, "num_input_tokens_seen": 68168848, "step": 117415 }, { "epoch": 17.488829311885613, "grad_norm": 1.8201329708099365, "learning_rate": 2.3632231547636443e-06, "loss": 0.6396, "num_input_tokens_seen": 68172112, "step": 117420 }, { "epoch": 17.48957402442657, "grad_norm": 2.1642112731933594, "learning_rate": 2.3618442655803903e-06, "loss": 0.4761, "num_input_tokens_seen": 68174768, "step": 117425 }, { "epoch": 17.49031873696753, "grad_norm": 1.5378586053848267, "learning_rate": 2.36046575884577e-06, "loss": 0.5246, "num_input_tokens_seen": 68177680, "step": 117430 }, { "epoch": 17.49106344950849, "grad_norm": 1.7792164087295532, "learning_rate": 2.359087634583068e-06, "loss": 0.5436, "num_input_tokens_seen": 68180656, "step": 117435 }, { "epoch": 17.49180816204945, "grad_norm": 1.335218906402588, "learning_rate": 2.357709892815574e-06, "loss": 0.632, "num_input_tokens_seen": 68183504, "step": 117440 }, { "epoch": 17.49255287459041, "grad_norm": 1.749932050704956, "learning_rate": 2.356332533566552e-06, "loss": 0.755, "num_input_tokens_seen": 68186256, "step": 117445 }, { "epoch": 17.493297587131366, "grad_norm": 2.3751769065856934, "learning_rate": 2.354955556859284e-06, "loss": 0.6157, "num_input_tokens_seen": 68189200, "step": 117450 }, { "epoch": 17.494042299672326, "grad_norm": 1.83805513381958, "learning_rate": 2.3535789627170205e-06, "loss": 0.714, "num_input_tokens_seen": 68192304, "step": 117455 }, { "epoch": 17.494787012213287, "grad_norm": 1.8315867185592651, "learning_rate": 2.3522027511630297e-06, "loss": 0.8042, "num_input_tokens_seen": 68195440, "step": 117460 }, { "epoch": 17.495531724754244, "grad_norm": 3.084721565246582, "learning_rate": 2.350826922220553e-06, "loss": 0.7173, "num_input_tokens_seen": 68198192, "step": 117465 }, { "epoch": 17.496276437295204, "grad_norm": 1.751663088798523, "learning_rate": 2.349451475912834e-06, "loss": 0.5793, "num_input_tokens_seen": 68200944, "step": 117470 }, { "epoch": 17.497021149836165, "grad_norm": 1.160858392715454, "learning_rate": 2.3480764122631143e-06, "loss": 0.5901, "num_input_tokens_seen": 68203856, "step": 117475 }, { "epoch": 17.497765862377122, "grad_norm": 1.387843370437622, "learning_rate": 2.3467017312946175e-06, "loss": 0.5963, "num_input_tokens_seen": 68206608, "step": 117480 }, { "epoch": 17.498510574918082, "grad_norm": 1.786587119102478, "learning_rate": 2.345327433030575e-06, "loss": 0.556, "num_input_tokens_seen": 68209456, "step": 117485 }, { "epoch": 17.49925528745904, "grad_norm": 1.5547641515731812, "learning_rate": 2.343953517494202e-06, "loss": 0.4742, "num_input_tokens_seen": 68212400, "step": 117490 }, { "epoch": 17.5, "grad_norm": 1.9861977100372314, "learning_rate": 2.3425799847087066e-06, "loss": 0.6649, "num_input_tokens_seen": 68215440, "step": 117495 }, { "epoch": 17.50074471254096, "grad_norm": 1.442975401878357, "learning_rate": 2.341206834697288e-06, "loss": 0.7233, "num_input_tokens_seen": 68218288, "step": 117500 }, { "epoch": 17.501489425081918, "grad_norm": 1.164049506187439, "learning_rate": 2.339834067483157e-06, "loss": 0.5516, "num_input_tokens_seen": 68221520, "step": 117505 }, { "epoch": 17.502234137622878, "grad_norm": 1.2471683025360107, "learning_rate": 2.338461683089499e-06, "loss": 0.7483, "num_input_tokens_seen": 68224432, "step": 117510 }, { "epoch": 17.502978850163835, "grad_norm": 2.1232433319091797, "learning_rate": 2.337089681539495e-06, "loss": 0.5982, "num_input_tokens_seen": 68227408, "step": 117515 }, { "epoch": 17.503723562704796, "grad_norm": 2.1387126445770264, "learning_rate": 2.335718062856329e-06, "loss": 0.5791, "num_input_tokens_seen": 68230288, "step": 117520 }, { "epoch": 17.504468275245756, "grad_norm": 2.046194076538086, "learning_rate": 2.334346827063169e-06, "loss": 0.5928, "num_input_tokens_seen": 68233168, "step": 117525 }, { "epoch": 17.505212987786713, "grad_norm": 1.5376665592193604, "learning_rate": 2.332975974183185e-06, "loss": 0.5452, "num_input_tokens_seen": 68236176, "step": 117530 }, { "epoch": 17.505957700327674, "grad_norm": 1.8255066871643066, "learning_rate": 2.331605504239534e-06, "loss": 0.4071, "num_input_tokens_seen": 68238992, "step": 117535 }, { "epoch": 17.506702412868634, "grad_norm": 1.807043433189392, "learning_rate": 2.330235417255369e-06, "loss": 0.5547, "num_input_tokens_seen": 68242032, "step": 117540 }, { "epoch": 17.50744712540959, "grad_norm": 1.8907592296600342, "learning_rate": 2.3288657132538387e-06, "loss": 0.6636, "num_input_tokens_seen": 68245168, "step": 117545 }, { "epoch": 17.50819183795055, "grad_norm": 1.5219264030456543, "learning_rate": 2.32749639225808e-06, "loss": 0.4889, "num_input_tokens_seen": 68248048, "step": 117550 }, { "epoch": 17.50893655049151, "grad_norm": 1.3280569314956665, "learning_rate": 2.3261274542912213e-06, "loss": 0.4661, "num_input_tokens_seen": 68251088, "step": 117555 }, { "epoch": 17.50968126303247, "grad_norm": 1.3137545585632324, "learning_rate": 2.3247588993764e-06, "loss": 0.4685, "num_input_tokens_seen": 68253872, "step": 117560 }, { "epoch": 17.51042597557343, "grad_norm": 1.437943458557129, "learning_rate": 2.3233907275367283e-06, "loss": 0.5781, "num_input_tokens_seen": 68256944, "step": 117565 }, { "epoch": 17.511170688114387, "grad_norm": 3.3163905143737793, "learning_rate": 2.3220229387953207e-06, "loss": 0.6467, "num_input_tokens_seen": 68260112, "step": 117570 }, { "epoch": 17.511915400655347, "grad_norm": 1.7534395456314087, "learning_rate": 2.3206555331752922e-06, "loss": 0.6645, "num_input_tokens_seen": 68262704, "step": 117575 }, { "epoch": 17.512660113196308, "grad_norm": 1.4606961011886597, "learning_rate": 2.3192885106997327e-06, "loss": 0.5656, "num_input_tokens_seen": 68265552, "step": 117580 }, { "epoch": 17.513404825737265, "grad_norm": 3.784083604812622, "learning_rate": 2.3179218713917456e-06, "loss": 0.5485, "num_input_tokens_seen": 68268432, "step": 117585 }, { "epoch": 17.514149538278225, "grad_norm": 1.3971951007843018, "learning_rate": 2.3165556152744154e-06, "loss": 0.5425, "num_input_tokens_seen": 68271376, "step": 117590 }, { "epoch": 17.514894250819182, "grad_norm": 1.3713281154632568, "learning_rate": 2.3151897423708214e-06, "loss": 0.4917, "num_input_tokens_seen": 68274192, "step": 117595 }, { "epoch": 17.515638963360143, "grad_norm": 1.4353315830230713, "learning_rate": 2.3138242527040416e-06, "loss": 0.585, "num_input_tokens_seen": 68277328, "step": 117600 }, { "epoch": 17.516383675901103, "grad_norm": 1.807138442993164, "learning_rate": 2.3124591462971414e-06, "loss": 0.5832, "num_input_tokens_seen": 68280048, "step": 117605 }, { "epoch": 17.51712838844206, "grad_norm": 2.0856235027313232, "learning_rate": 2.3110944231731907e-06, "loss": 0.5908, "num_input_tokens_seen": 68282896, "step": 117610 }, { "epoch": 17.51787310098302, "grad_norm": 1.798574686050415, "learning_rate": 2.309730083355238e-06, "loss": 0.4868, "num_input_tokens_seen": 68285488, "step": 117615 }, { "epoch": 17.51861781352398, "grad_norm": 1.6599372625350952, "learning_rate": 2.3083661268663376e-06, "loss": 0.6257, "num_input_tokens_seen": 68288240, "step": 117620 }, { "epoch": 17.51936252606494, "grad_norm": 1.0904381275177002, "learning_rate": 2.3070025537295257e-06, "loss": 0.659, "num_input_tokens_seen": 68290992, "step": 117625 }, { "epoch": 17.5201072386059, "grad_norm": 1.159186601638794, "learning_rate": 2.3056393639678374e-06, "loss": 0.516, "num_input_tokens_seen": 68293872, "step": 117630 }, { "epoch": 17.520851951146856, "grad_norm": 2.6756954193115234, "learning_rate": 2.3042765576043124e-06, "loss": 0.4983, "num_input_tokens_seen": 68296816, "step": 117635 }, { "epoch": 17.521596663687816, "grad_norm": 1.6912745237350464, "learning_rate": 2.3029141346619624e-06, "loss": 0.5525, "num_input_tokens_seen": 68299792, "step": 117640 }, { "epoch": 17.522341376228777, "grad_norm": 3.3295278549194336, "learning_rate": 2.301552095163814e-06, "loss": 0.5861, "num_input_tokens_seen": 68302736, "step": 117645 }, { "epoch": 17.523086088769734, "grad_norm": 1.192016363143921, "learning_rate": 2.3001904391328683e-06, "loss": 0.5861, "num_input_tokens_seen": 68305680, "step": 117650 }, { "epoch": 17.523830801310694, "grad_norm": 2.1891467571258545, "learning_rate": 2.2988291665921396e-06, "loss": 0.4609, "num_input_tokens_seen": 68308496, "step": 117655 }, { "epoch": 17.52457551385165, "grad_norm": 1.5147521495819092, "learning_rate": 2.2974682775646157e-06, "loss": 0.6972, "num_input_tokens_seen": 68311376, "step": 117660 }, { "epoch": 17.525320226392612, "grad_norm": 2.057713270187378, "learning_rate": 2.2961077720732943e-06, "loss": 0.6907, "num_input_tokens_seen": 68314096, "step": 117665 }, { "epoch": 17.526064938933573, "grad_norm": 0.4683469533920288, "learning_rate": 2.2947476501411573e-06, "loss": 0.5264, "num_input_tokens_seen": 68317328, "step": 117670 }, { "epoch": 17.52680965147453, "grad_norm": 1.953361988067627, "learning_rate": 2.293387911791178e-06, "loss": 0.6702, "num_input_tokens_seen": 68320208, "step": 117675 }, { "epoch": 17.52755436401549, "grad_norm": 2.1064586639404297, "learning_rate": 2.2920285570463386e-06, "loss": 0.795, "num_input_tokens_seen": 68323504, "step": 117680 }, { "epoch": 17.52829907655645, "grad_norm": 2.004889965057373, "learning_rate": 2.2906695859295946e-06, "loss": 0.6579, "num_input_tokens_seen": 68326576, "step": 117685 }, { "epoch": 17.529043789097408, "grad_norm": 1.6265987157821655, "learning_rate": 2.2893109984639086e-06, "loss": 0.6131, "num_input_tokens_seen": 68329232, "step": 117690 }, { "epoch": 17.529788501638368, "grad_norm": 1.7292921543121338, "learning_rate": 2.287952794672227e-06, "loss": 0.5236, "num_input_tokens_seen": 68332176, "step": 117695 }, { "epoch": 17.530533214179325, "grad_norm": 2.2843878269195557, "learning_rate": 2.2865949745775024e-06, "loss": 0.731, "num_input_tokens_seen": 68335472, "step": 117700 }, { "epoch": 17.531277926720286, "grad_norm": 1.9685587882995605, "learning_rate": 2.2852375382026727e-06, "loss": 0.7352, "num_input_tokens_seen": 68338384, "step": 117705 }, { "epoch": 17.532022639261246, "grad_norm": 1.1661983728408813, "learning_rate": 2.2838804855706606e-06, "loss": 0.6734, "num_input_tokens_seen": 68341328, "step": 117710 }, { "epoch": 17.532767351802203, "grad_norm": 1.5409667491912842, "learning_rate": 2.282523816704407e-06, "loss": 0.6929, "num_input_tokens_seen": 68344176, "step": 117715 }, { "epoch": 17.533512064343164, "grad_norm": 1.473361611366272, "learning_rate": 2.2811675316268212e-06, "loss": 0.6225, "num_input_tokens_seen": 68346832, "step": 117720 }, { "epoch": 17.534256776884124, "grad_norm": 1.59916353225708, "learning_rate": 2.279811630360823e-06, "loss": 0.6292, "num_input_tokens_seen": 68349936, "step": 117725 }, { "epoch": 17.53500148942508, "grad_norm": 2.095696210861206, "learning_rate": 2.2784561129293086e-06, "loss": 0.5584, "num_input_tokens_seen": 68353168, "step": 117730 }, { "epoch": 17.53574620196604, "grad_norm": 1.078209638595581, "learning_rate": 2.2771009793551927e-06, "loss": 0.5606, "num_input_tokens_seen": 68355856, "step": 117735 }, { "epoch": 17.536490914507, "grad_norm": 1.4613556861877441, "learning_rate": 2.275746229661355e-06, "loss": 0.6452, "num_input_tokens_seen": 68359120, "step": 117740 }, { "epoch": 17.53723562704796, "grad_norm": 1.7820677757263184, "learning_rate": 2.2743918638706952e-06, "loss": 0.5623, "num_input_tokens_seen": 68362352, "step": 117745 }, { "epoch": 17.53798033958892, "grad_norm": 1.2146167755126953, "learning_rate": 2.2730378820060847e-06, "loss": 0.6369, "num_input_tokens_seen": 68365296, "step": 117750 }, { "epoch": 17.538725052129877, "grad_norm": 2.164074659347534, "learning_rate": 2.271684284090403e-06, "loss": 0.4451, "num_input_tokens_seen": 68368240, "step": 117755 }, { "epoch": 17.539469764670837, "grad_norm": 1.3266065120697021, "learning_rate": 2.270331070146514e-06, "loss": 0.5492, "num_input_tokens_seen": 68371152, "step": 117760 }, { "epoch": 17.540214477211798, "grad_norm": 1.3838160037994385, "learning_rate": 2.2689782401972783e-06, "loss": 0.7076, "num_input_tokens_seen": 68373936, "step": 117765 }, { "epoch": 17.540959189752755, "grad_norm": 1.6796544790267944, "learning_rate": 2.2676257942655544e-06, "loss": 0.6764, "num_input_tokens_seen": 68376848, "step": 117770 }, { "epoch": 17.541703902293715, "grad_norm": 1.032670259475708, "learning_rate": 2.266273732374183e-06, "loss": 0.4393, "num_input_tokens_seen": 68379664, "step": 117775 }, { "epoch": 17.542448614834672, "grad_norm": 2.047762632369995, "learning_rate": 2.2649220545460176e-06, "loss": 0.5651, "num_input_tokens_seen": 68382480, "step": 117780 }, { "epoch": 17.543193327375633, "grad_norm": 0.9765878319740295, "learning_rate": 2.2635707608038877e-06, "loss": 0.529, "num_input_tokens_seen": 68385136, "step": 117785 }, { "epoch": 17.543938039916593, "grad_norm": 1.9827483892440796, "learning_rate": 2.262219851170616e-06, "loss": 0.5144, "num_input_tokens_seen": 68387856, "step": 117790 }, { "epoch": 17.54468275245755, "grad_norm": 1.0608470439910889, "learning_rate": 2.260869325669035e-06, "loss": 0.634, "num_input_tokens_seen": 68390736, "step": 117795 }, { "epoch": 17.54542746499851, "grad_norm": 1.3781042098999023, "learning_rate": 2.259519184321951e-06, "loss": 0.5747, "num_input_tokens_seen": 68393680, "step": 117800 }, { "epoch": 17.54617217753947, "grad_norm": 1.2243696451187134, "learning_rate": 2.2581694271521817e-06, "loss": 0.5276, "num_input_tokens_seen": 68396592, "step": 117805 }, { "epoch": 17.54691689008043, "grad_norm": 1.469428539276123, "learning_rate": 2.2568200541825236e-06, "loss": 0.4827, "num_input_tokens_seen": 68399472, "step": 117810 }, { "epoch": 17.54766160262139, "grad_norm": 2.344998836517334, "learning_rate": 2.2554710654357796e-06, "loss": 0.6871, "num_input_tokens_seen": 68402832, "step": 117815 }, { "epoch": 17.548406315162346, "grad_norm": 2.727937936782837, "learning_rate": 2.254122460934735e-06, "loss": 0.5952, "num_input_tokens_seen": 68405648, "step": 117820 }, { "epoch": 17.549151027703306, "grad_norm": 1.0810202360153198, "learning_rate": 2.2527742407021742e-06, "loss": 0.6141, "num_input_tokens_seen": 68408528, "step": 117825 }, { "epoch": 17.549895740244267, "grad_norm": 2.1076698303222656, "learning_rate": 2.251426404760873e-06, "loss": 0.5757, "num_input_tokens_seen": 68411440, "step": 117830 }, { "epoch": 17.550640452785224, "grad_norm": 3.8773553371429443, "learning_rate": 2.250078953133597e-06, "loss": 0.4794, "num_input_tokens_seen": 68414480, "step": 117835 }, { "epoch": 17.551385165326185, "grad_norm": 1.5868120193481445, "learning_rate": 2.2487318858431193e-06, "loss": 0.6245, "num_input_tokens_seen": 68417264, "step": 117840 }, { "epoch": 17.552129877867145, "grad_norm": 2.341120481491089, "learning_rate": 2.247385202912189e-06, "loss": 0.7608, "num_input_tokens_seen": 68420272, "step": 117845 }, { "epoch": 17.552874590408102, "grad_norm": 2.299196720123291, "learning_rate": 2.246038904363565e-06, "loss": 0.5271, "num_input_tokens_seen": 68423312, "step": 117850 }, { "epoch": 17.553619302949063, "grad_norm": 1.6010396480560303, "learning_rate": 2.2446929902199847e-06, "loss": 0.6544, "num_input_tokens_seen": 68426352, "step": 117855 }, { "epoch": 17.55436401549002, "grad_norm": 1.7233854532241821, "learning_rate": 2.2433474605041917e-06, "loss": 0.7859, "num_input_tokens_seen": 68429456, "step": 117860 }, { "epoch": 17.55510872803098, "grad_norm": 1.1691783666610718, "learning_rate": 2.242002315238917e-06, "loss": 0.6419, "num_input_tokens_seen": 68431952, "step": 117865 }, { "epoch": 17.55585344057194, "grad_norm": 1.1125036478042603, "learning_rate": 2.240657554446876e-06, "loss": 0.6819, "num_input_tokens_seen": 68435120, "step": 117870 }, { "epoch": 17.556598153112898, "grad_norm": 1.839290976524353, "learning_rate": 2.239313178150798e-06, "loss": 0.7439, "num_input_tokens_seen": 68438064, "step": 117875 }, { "epoch": 17.557342865653858, "grad_norm": 1.2996344566345215, "learning_rate": 2.2379691863733927e-06, "loss": 0.5404, "num_input_tokens_seen": 68440880, "step": 117880 }, { "epoch": 17.558087578194815, "grad_norm": 2.366457939147949, "learning_rate": 2.236625579137358e-06, "loss": 0.5413, "num_input_tokens_seen": 68444080, "step": 117885 }, { "epoch": 17.558832290735776, "grad_norm": 1.0462483167648315, "learning_rate": 2.235282356465404e-06, "loss": 0.5701, "num_input_tokens_seen": 68447088, "step": 117890 }, { "epoch": 17.559577003276736, "grad_norm": 1.8185865879058838, "learning_rate": 2.233939518380218e-06, "loss": 0.6768, "num_input_tokens_seen": 68449936, "step": 117895 }, { "epoch": 17.560321715817693, "grad_norm": 1.1222779750823975, "learning_rate": 2.232597064904479e-06, "loss": 0.5396, "num_input_tokens_seen": 68453168, "step": 117900 }, { "epoch": 17.561066428358654, "grad_norm": 3.7984812259674072, "learning_rate": 2.2312549960608804e-06, "loss": 0.4829, "num_input_tokens_seen": 68456016, "step": 117905 }, { "epoch": 17.561811140899614, "grad_norm": 3.7866644859313965, "learning_rate": 2.2299133118720837e-06, "loss": 0.6025, "num_input_tokens_seen": 68459088, "step": 117910 }, { "epoch": 17.56255585344057, "grad_norm": 1.2390522956848145, "learning_rate": 2.228572012360758e-06, "loss": 0.5848, "num_input_tokens_seen": 68462000, "step": 117915 }, { "epoch": 17.56330056598153, "grad_norm": 1.7996504306793213, "learning_rate": 2.2272310975495673e-06, "loss": 0.696, "num_input_tokens_seen": 68465040, "step": 117920 }, { "epoch": 17.56404527852249, "grad_norm": 1.5154696702957153, "learning_rate": 2.2258905674611556e-06, "loss": 0.7784, "num_input_tokens_seen": 68468240, "step": 117925 }, { "epoch": 17.56478999106345, "grad_norm": 1.1290541887283325, "learning_rate": 2.224550422118185e-06, "loss": 0.4182, "num_input_tokens_seen": 68471120, "step": 117930 }, { "epoch": 17.56553470360441, "grad_norm": 1.287269115447998, "learning_rate": 2.223210661543279e-06, "loss": 0.4903, "num_input_tokens_seen": 68474064, "step": 117935 }, { "epoch": 17.566279416145367, "grad_norm": 1.1861766576766968, "learning_rate": 2.221871285759086e-06, "loss": 0.5196, "num_input_tokens_seen": 68476848, "step": 117940 }, { "epoch": 17.567024128686327, "grad_norm": 2.369027614593506, "learning_rate": 2.2205322947882245e-06, "loss": 0.7159, "num_input_tokens_seen": 68479920, "step": 117945 }, { "epoch": 17.567768841227288, "grad_norm": 3.0612118244171143, "learning_rate": 2.2191936886533206e-06, "loss": 0.5519, "num_input_tokens_seen": 68482512, "step": 117950 }, { "epoch": 17.568513553768245, "grad_norm": 1.8398083448410034, "learning_rate": 2.2178554673769863e-06, "loss": 0.9228, "num_input_tokens_seen": 68485392, "step": 117955 }, { "epoch": 17.569258266309205, "grad_norm": 1.4240754842758179, "learning_rate": 2.216517630981821e-06, "loss": 0.4824, "num_input_tokens_seen": 68488240, "step": 117960 }, { "epoch": 17.570002978850162, "grad_norm": 1.6939860582351685, "learning_rate": 2.215180179490442e-06, "loss": 0.6158, "num_input_tokens_seen": 68491120, "step": 117965 }, { "epoch": 17.570747691391123, "grad_norm": 1.376229166984558, "learning_rate": 2.2138431129254318e-06, "loss": 0.6747, "num_input_tokens_seen": 68493968, "step": 117970 }, { "epoch": 17.571492403932083, "grad_norm": 1.2763339281082153, "learning_rate": 2.2125064313093857e-06, "loss": 0.6919, "num_input_tokens_seen": 68496848, "step": 117975 }, { "epoch": 17.57223711647304, "grad_norm": 2.214496612548828, "learning_rate": 2.2111701346648806e-06, "loss": 0.5777, "num_input_tokens_seen": 68499664, "step": 117980 }, { "epoch": 17.572981829014, "grad_norm": 1.6299763917922974, "learning_rate": 2.209834223014498e-06, "loss": 0.646, "num_input_tokens_seen": 68502608, "step": 117985 }, { "epoch": 17.57372654155496, "grad_norm": 2.21925687789917, "learning_rate": 2.2084986963808036e-06, "loss": 0.7852, "num_input_tokens_seen": 68505296, "step": 117990 }, { "epoch": 17.57447125409592, "grad_norm": 2.281548500061035, "learning_rate": 2.2071635547863565e-06, "loss": 0.7736, "num_input_tokens_seen": 68508336, "step": 117995 }, { "epoch": 17.57521596663688, "grad_norm": 1.9646767377853394, "learning_rate": 2.2058287982537175e-06, "loss": 0.6167, "num_input_tokens_seen": 68512208, "step": 118000 }, { "epoch": 17.575960679177836, "grad_norm": 3.2378897666931152, "learning_rate": 2.2044944268054315e-06, "loss": 0.7746, "num_input_tokens_seen": 68515216, "step": 118005 }, { "epoch": 17.576705391718797, "grad_norm": 4.062417030334473, "learning_rate": 2.203160440464047e-06, "loss": 0.6367, "num_input_tokens_seen": 68518224, "step": 118010 }, { "epoch": 17.577450104259757, "grad_norm": 1.026458978652954, "learning_rate": 2.2018268392520998e-06, "loss": 0.6518, "num_input_tokens_seen": 68521200, "step": 118015 }, { "epoch": 17.578194816800714, "grad_norm": 1.350339412689209, "learning_rate": 2.2004936231921153e-06, "loss": 0.5841, "num_input_tokens_seen": 68524272, "step": 118020 }, { "epoch": 17.578939529341675, "grad_norm": 1.6401848793029785, "learning_rate": 2.1991607923066176e-06, "loss": 0.4411, "num_input_tokens_seen": 68527120, "step": 118025 }, { "epoch": 17.57968424188263, "grad_norm": 1.0480468273162842, "learning_rate": 2.197828346618122e-06, "loss": 0.4489, "num_input_tokens_seen": 68529808, "step": 118030 }, { "epoch": 17.580428954423592, "grad_norm": 1.465110421180725, "learning_rate": 2.196496286149144e-06, "loss": 0.4773, "num_input_tokens_seen": 68532912, "step": 118035 }, { "epoch": 17.581173666964553, "grad_norm": 1.211132526397705, "learning_rate": 2.195164610922182e-06, "loss": 0.4492, "num_input_tokens_seen": 68535696, "step": 118040 }, { "epoch": 17.58191837950551, "grad_norm": 2.4532809257507324, "learning_rate": 2.1938333209597373e-06, "loss": 0.6433, "num_input_tokens_seen": 68538416, "step": 118045 }, { "epoch": 17.58266309204647, "grad_norm": 1.4750112295150757, "learning_rate": 2.1925024162842978e-06, "loss": 0.6596, "num_input_tokens_seen": 68541264, "step": 118050 }, { "epoch": 17.58340780458743, "grad_norm": 1.9753379821777344, "learning_rate": 2.1911718969183535e-06, "loss": 0.7212, "num_input_tokens_seen": 68544272, "step": 118055 }, { "epoch": 17.584152517128388, "grad_norm": 1.3924381732940674, "learning_rate": 2.189841762884376e-06, "loss": 0.6155, "num_input_tokens_seen": 68547376, "step": 118060 }, { "epoch": 17.584897229669348, "grad_norm": 2.064037799835205, "learning_rate": 2.188512014204841e-06, "loss": 0.6685, "num_input_tokens_seen": 68550928, "step": 118065 }, { "epoch": 17.585641942210305, "grad_norm": 2.567136287689209, "learning_rate": 2.1871826509022086e-06, "loss": 0.789, "num_input_tokens_seen": 68553680, "step": 118070 }, { "epoch": 17.586386654751266, "grad_norm": 1.9473124742507935, "learning_rate": 2.1858536729989388e-06, "loss": 0.5542, "num_input_tokens_seen": 68556592, "step": 118075 }, { "epoch": 17.587131367292226, "grad_norm": 1.541759729385376, "learning_rate": 2.1845250805174854e-06, "loss": 0.6566, "num_input_tokens_seen": 68559568, "step": 118080 }, { "epoch": 17.587876079833183, "grad_norm": 3.3851852416992188, "learning_rate": 2.183196873480295e-06, "loss": 0.5666, "num_input_tokens_seen": 68562096, "step": 118085 }, { "epoch": 17.588620792374144, "grad_norm": 1.7408936023712158, "learning_rate": 2.1818690519098018e-06, "loss": 0.6748, "num_input_tokens_seen": 68565104, "step": 118090 }, { "epoch": 17.589365504915104, "grad_norm": 2.2170114517211914, "learning_rate": 2.1805416158284355e-06, "loss": 0.531, "num_input_tokens_seen": 68567920, "step": 118095 }, { "epoch": 17.59011021745606, "grad_norm": 2.4397919178009033, "learning_rate": 2.1792145652586305e-06, "loss": 0.6909, "num_input_tokens_seen": 68570800, "step": 118100 }, { "epoch": 17.590854929997022, "grad_norm": 1.3706791400909424, "learning_rate": 2.177887900222797e-06, "loss": 0.4807, "num_input_tokens_seen": 68573616, "step": 118105 }, { "epoch": 17.59159964253798, "grad_norm": 2.3640849590301514, "learning_rate": 2.176561620743356e-06, "loss": 0.5957, "num_input_tokens_seen": 68576528, "step": 118110 }, { "epoch": 17.59234435507894, "grad_norm": 1.6741065979003906, "learning_rate": 2.1752357268427086e-06, "loss": 0.579, "num_input_tokens_seen": 68579408, "step": 118115 }, { "epoch": 17.5930890676199, "grad_norm": 1.6989989280700684, "learning_rate": 2.173910218543254e-06, "loss": 0.4881, "num_input_tokens_seen": 68582128, "step": 118120 }, { "epoch": 17.593833780160857, "grad_norm": 1.9317072629928589, "learning_rate": 2.172585095867391e-06, "loss": 0.6336, "num_input_tokens_seen": 68584752, "step": 118125 }, { "epoch": 17.594578492701817, "grad_norm": 1.4532793760299683, "learning_rate": 2.1712603588374956e-06, "loss": 0.5057, "num_input_tokens_seen": 68587600, "step": 118130 }, { "epoch": 17.595323205242778, "grad_norm": 1.4142017364501953, "learning_rate": 2.169936007475959e-06, "loss": 0.4932, "num_input_tokens_seen": 68590800, "step": 118135 }, { "epoch": 17.596067917783735, "grad_norm": 1.128416895866394, "learning_rate": 2.1686120418051457e-06, "loss": 0.5867, "num_input_tokens_seen": 68593712, "step": 118140 }, { "epoch": 17.596812630324695, "grad_norm": 1.3333200216293335, "learning_rate": 2.167288461847433e-06, "loss": 0.4812, "num_input_tokens_seen": 68596624, "step": 118145 }, { "epoch": 17.597557342865652, "grad_norm": 1.4420136213302612, "learning_rate": 2.165965267625175e-06, "loss": 0.5304, "num_input_tokens_seen": 68599728, "step": 118150 }, { "epoch": 17.598302055406613, "grad_norm": 3.7586026191711426, "learning_rate": 2.164642459160726e-06, "loss": 0.5299, "num_input_tokens_seen": 68602448, "step": 118155 }, { "epoch": 17.599046767947573, "grad_norm": 1.529872179031372, "learning_rate": 2.163320036476435e-06, "loss": 0.4929, "num_input_tokens_seen": 68604976, "step": 118160 }, { "epoch": 17.59979148048853, "grad_norm": 1.908765196800232, "learning_rate": 2.1619979995946365e-06, "loss": 0.621, "num_input_tokens_seen": 68607824, "step": 118165 }, { "epoch": 17.60053619302949, "grad_norm": 1.4089562892913818, "learning_rate": 2.160676348537674e-06, "loss": 0.4596, "num_input_tokens_seen": 68610512, "step": 118170 }, { "epoch": 17.601280905570448, "grad_norm": 1.4028394222259521, "learning_rate": 2.159355083327866e-06, "loss": 0.4468, "num_input_tokens_seen": 68613328, "step": 118175 }, { "epoch": 17.60202561811141, "grad_norm": 2.2785677909851074, "learning_rate": 2.158034203987547e-06, "loss": 0.6069, "num_input_tokens_seen": 68616240, "step": 118180 }, { "epoch": 17.60277033065237, "grad_norm": 1.226428747177124, "learning_rate": 2.1567137105390183e-06, "loss": 0.596, "num_input_tokens_seen": 68619088, "step": 118185 }, { "epoch": 17.603515043193326, "grad_norm": 1.6722493171691895, "learning_rate": 2.155393603004602e-06, "loss": 0.3334, "num_input_tokens_seen": 68621936, "step": 118190 }, { "epoch": 17.604259755734287, "grad_norm": 1.5066275596618652, "learning_rate": 2.15407388140659e-06, "loss": 0.5313, "num_input_tokens_seen": 68624656, "step": 118195 }, { "epoch": 17.605004468275247, "grad_norm": 2.7316551208496094, "learning_rate": 2.1527545457672743e-06, "loss": 0.4913, "num_input_tokens_seen": 68627536, "step": 118200 }, { "epoch": 17.605749180816204, "grad_norm": 1.4673434495925903, "learning_rate": 2.1514355961089583e-06, "loss": 0.6333, "num_input_tokens_seen": 68630320, "step": 118205 }, { "epoch": 17.606493893357165, "grad_norm": 1.1920301914215088, "learning_rate": 2.1501170324539134e-06, "loss": 0.5709, "num_input_tokens_seen": 68633168, "step": 118210 }, { "epoch": 17.60723860589812, "grad_norm": 1.3071883916854858, "learning_rate": 2.1487988548244133e-06, "loss": 0.59, "num_input_tokens_seen": 68636176, "step": 118215 }, { "epoch": 17.607983318439082, "grad_norm": 0.743960440158844, "learning_rate": 2.1474810632427344e-06, "loss": 0.4494, "num_input_tokens_seen": 68639056, "step": 118220 }, { "epoch": 17.608728030980043, "grad_norm": 1.7759606838226318, "learning_rate": 2.1461636577311373e-06, "loss": 0.5083, "num_input_tokens_seen": 68641904, "step": 118225 }, { "epoch": 17.609472743521, "grad_norm": 1.4612696170806885, "learning_rate": 2.1448466383118786e-06, "loss": 0.6871, "num_input_tokens_seen": 68644656, "step": 118230 }, { "epoch": 17.61021745606196, "grad_norm": 2.7669475078582764, "learning_rate": 2.143530005007202e-06, "loss": 0.74, "num_input_tokens_seen": 68647728, "step": 118235 }, { "epoch": 17.61096216860292, "grad_norm": 3.4555444717407227, "learning_rate": 2.142213757839362e-06, "loss": 0.5747, "num_input_tokens_seen": 68650352, "step": 118240 }, { "epoch": 17.611706881143878, "grad_norm": 0.9457644820213318, "learning_rate": 2.140897896830582e-06, "loss": 0.5652, "num_input_tokens_seen": 68653424, "step": 118245 }, { "epoch": 17.61245159368484, "grad_norm": 1.610818862915039, "learning_rate": 2.139582422003106e-06, "loss": 0.5971, "num_input_tokens_seen": 68656176, "step": 118250 }, { "epoch": 17.613196306225795, "grad_norm": 1.727418065071106, "learning_rate": 2.138267333379146e-06, "loss": 0.6655, "num_input_tokens_seen": 68659120, "step": 118255 }, { "epoch": 17.613941018766756, "grad_norm": 1.4275438785552979, "learning_rate": 2.1369526309809263e-06, "loss": 0.2584, "num_input_tokens_seen": 68662352, "step": 118260 }, { "epoch": 17.614685731307716, "grad_norm": 1.7868112325668335, "learning_rate": 2.1356383148306515e-06, "loss": 0.6917, "num_input_tokens_seen": 68665872, "step": 118265 }, { "epoch": 17.615430443848673, "grad_norm": 1.6458503007888794, "learning_rate": 2.134324384950531e-06, "loss": 0.5597, "num_input_tokens_seen": 68668688, "step": 118270 }, { "epoch": 17.616175156389634, "grad_norm": 2.029080629348755, "learning_rate": 2.1330108413627647e-06, "loss": 0.8484, "num_input_tokens_seen": 68671760, "step": 118275 }, { "epoch": 17.616919868930594, "grad_norm": 2.0449185371398926, "learning_rate": 2.131697684089537e-06, "loss": 0.5233, "num_input_tokens_seen": 68674416, "step": 118280 }, { "epoch": 17.61766458147155, "grad_norm": 2.6903302669525146, "learning_rate": 2.1303849131530357e-06, "loss": 0.5947, "num_input_tokens_seen": 68677424, "step": 118285 }, { "epoch": 17.618409294012512, "grad_norm": 0.9071025848388672, "learning_rate": 2.129072528575432e-06, "loss": 0.497, "num_input_tokens_seen": 68680368, "step": 118290 }, { "epoch": 17.61915400655347, "grad_norm": 1.6436376571655273, "learning_rate": 2.1277605303789057e-06, "loss": 0.6236, "num_input_tokens_seen": 68683344, "step": 118295 }, { "epoch": 17.61989871909443, "grad_norm": 2.4400815963745117, "learning_rate": 2.126448918585619e-06, "loss": 0.6376, "num_input_tokens_seen": 68686256, "step": 118300 }, { "epoch": 17.62064343163539, "grad_norm": 1.4455853700637817, "learning_rate": 2.12513769321773e-06, "loss": 0.5893, "num_input_tokens_seen": 68689008, "step": 118305 }, { "epoch": 17.621388144176347, "grad_norm": 1.258898377418518, "learning_rate": 2.123826854297395e-06, "loss": 0.7086, "num_input_tokens_seen": 68691760, "step": 118310 }, { "epoch": 17.622132856717307, "grad_norm": 1.6579188108444214, "learning_rate": 2.1225164018467468e-06, "loss": 0.7206, "num_input_tokens_seen": 68694736, "step": 118315 }, { "epoch": 17.622877569258268, "grad_norm": 1.3723803758621216, "learning_rate": 2.1212063358879374e-06, "loss": 0.5332, "num_input_tokens_seen": 68697968, "step": 118320 }, { "epoch": 17.623622281799225, "grad_norm": 2.8355982303619385, "learning_rate": 2.1198966564430935e-06, "loss": 0.8236, "num_input_tokens_seen": 68702000, "step": 118325 }, { "epoch": 17.624366994340185, "grad_norm": 1.3986438512802124, "learning_rate": 2.1185873635343413e-06, "loss": 0.4176, "num_input_tokens_seen": 68704784, "step": 118330 }, { "epoch": 17.625111706881142, "grad_norm": 1.290244460105896, "learning_rate": 2.1172784571837965e-06, "loss": 0.6009, "num_input_tokens_seen": 68707536, "step": 118335 }, { "epoch": 17.625856419422103, "grad_norm": 1.5821977853775024, "learning_rate": 2.115969937413581e-06, "loss": 0.5761, "num_input_tokens_seen": 68710544, "step": 118340 }, { "epoch": 17.626601131963064, "grad_norm": 1.5376367568969727, "learning_rate": 2.1146618042457935e-06, "loss": 0.5684, "num_input_tokens_seen": 68713424, "step": 118345 }, { "epoch": 17.62734584450402, "grad_norm": 1.1428096294403076, "learning_rate": 2.113354057702535e-06, "loss": 0.4047, "num_input_tokens_seen": 68716432, "step": 118350 }, { "epoch": 17.62809055704498, "grad_norm": 1.9464424848556519, "learning_rate": 2.1120466978059e-06, "loss": 0.5401, "num_input_tokens_seen": 68719408, "step": 118355 }, { "epoch": 17.62883526958594, "grad_norm": 1.6874445676803589, "learning_rate": 2.1107397245779705e-06, "loss": 0.4701, "num_input_tokens_seen": 68722224, "step": 118360 }, { "epoch": 17.6295799821269, "grad_norm": 1.3411874771118164, "learning_rate": 2.109433138040834e-06, "loss": 0.5623, "num_input_tokens_seen": 68724912, "step": 118365 }, { "epoch": 17.63032469466786, "grad_norm": 1.5133181810379028, "learning_rate": 2.1081269382165536e-06, "loss": 0.4011, "num_input_tokens_seen": 68728048, "step": 118370 }, { "epoch": 17.631069407208816, "grad_norm": 1.631435751914978, "learning_rate": 2.1068211251272063e-06, "loss": 0.491, "num_input_tokens_seen": 68730768, "step": 118375 }, { "epoch": 17.631814119749777, "grad_norm": 2.48567271232605, "learning_rate": 2.105515698794844e-06, "loss": 0.6445, "num_input_tokens_seen": 68733328, "step": 118380 }, { "epoch": 17.632558832290737, "grad_norm": 1.8083899021148682, "learning_rate": 2.104210659241532e-06, "loss": 0.7254, "num_input_tokens_seen": 68736624, "step": 118385 }, { "epoch": 17.633303544831694, "grad_norm": 1.5460904836654663, "learning_rate": 2.102906006489308e-06, "loss": 0.65, "num_input_tokens_seen": 68739600, "step": 118390 }, { "epoch": 17.634048257372655, "grad_norm": 3.7609784603118896, "learning_rate": 2.1016017405602135e-06, "loss": 0.5273, "num_input_tokens_seen": 68742416, "step": 118395 }, { "epoch": 17.63479296991361, "grad_norm": 0.9192675352096558, "learning_rate": 2.100297861476286e-06, "loss": 0.5908, "num_input_tokens_seen": 68745328, "step": 118400 }, { "epoch": 17.635537682454572, "grad_norm": 3.053499460220337, "learning_rate": 2.0989943692595495e-06, "loss": 0.5515, "num_input_tokens_seen": 68748240, "step": 118405 }, { "epoch": 17.636282394995533, "grad_norm": 1.1503410339355469, "learning_rate": 2.0976912639320336e-06, "loss": 0.5048, "num_input_tokens_seen": 68751184, "step": 118410 }, { "epoch": 17.63702710753649, "grad_norm": 1.7955907583236694, "learning_rate": 2.096388545515743e-06, "loss": 0.6278, "num_input_tokens_seen": 68754096, "step": 118415 }, { "epoch": 17.63777182007745, "grad_norm": 3.2436773777008057, "learning_rate": 2.095086214032693e-06, "loss": 0.647, "num_input_tokens_seen": 68757040, "step": 118420 }, { "epoch": 17.63851653261841, "grad_norm": 1.3316203355789185, "learning_rate": 2.0937842695048754e-06, "loss": 0.56, "num_input_tokens_seen": 68759888, "step": 118425 }, { "epoch": 17.639261245159368, "grad_norm": 1.5749679803848267, "learning_rate": 2.0924827119542965e-06, "loss": 0.6099, "num_input_tokens_seen": 68762800, "step": 118430 }, { "epoch": 17.64000595770033, "grad_norm": 1.7114681005477905, "learning_rate": 2.0911815414029423e-06, "loss": 0.6628, "num_input_tokens_seen": 68765776, "step": 118435 }, { "epoch": 17.640750670241285, "grad_norm": 3.124931812286377, "learning_rate": 2.089880757872786e-06, "loss": 0.5463, "num_input_tokens_seen": 68768592, "step": 118440 }, { "epoch": 17.641495382782246, "grad_norm": 2.091052293777466, "learning_rate": 2.088580361385814e-06, "loss": 0.4996, "num_input_tokens_seen": 68771760, "step": 118445 }, { "epoch": 17.642240095323206, "grad_norm": 1.1949830055236816, "learning_rate": 2.0872803519639856e-06, "loss": 0.3723, "num_input_tokens_seen": 68774576, "step": 118450 }, { "epoch": 17.642984807864163, "grad_norm": 3.368276596069336, "learning_rate": 2.085980729629275e-06, "loss": 0.6668, "num_input_tokens_seen": 68777552, "step": 118455 }, { "epoch": 17.643729520405124, "grad_norm": 1.0123990774154663, "learning_rate": 2.084681494403623e-06, "loss": 0.6687, "num_input_tokens_seen": 68780592, "step": 118460 }, { "epoch": 17.644474232946084, "grad_norm": 1.4655115604400635, "learning_rate": 2.0833826463089957e-06, "loss": 0.6046, "num_input_tokens_seen": 68783536, "step": 118465 }, { "epoch": 17.64521894548704, "grad_norm": 1.0299941301345825, "learning_rate": 2.0820841853673252e-06, "loss": 0.5653, "num_input_tokens_seen": 68786480, "step": 118470 }, { "epoch": 17.645963658028002, "grad_norm": 1.446485161781311, "learning_rate": 2.080786111600544e-06, "loss": 0.7757, "num_input_tokens_seen": 68789424, "step": 118475 }, { "epoch": 17.64670837056896, "grad_norm": 1.397936224937439, "learning_rate": 2.079488425030593e-06, "loss": 0.4954, "num_input_tokens_seen": 68792144, "step": 118480 }, { "epoch": 17.64745308310992, "grad_norm": 2.5393786430358887, "learning_rate": 2.0781911256793905e-06, "loss": 0.6285, "num_input_tokens_seen": 68795088, "step": 118485 }, { "epoch": 17.64819779565088, "grad_norm": 1.7501637935638428, "learning_rate": 2.076894213568853e-06, "loss": 0.6623, "num_input_tokens_seen": 68798160, "step": 118490 }, { "epoch": 17.648942508191837, "grad_norm": 1.164446234703064, "learning_rate": 2.0755976887208815e-06, "loss": 0.5656, "num_input_tokens_seen": 68801232, "step": 118495 }, { "epoch": 17.649687220732797, "grad_norm": 1.477830410003662, "learning_rate": 2.0743015511573952e-06, "loss": 0.4242, "num_input_tokens_seen": 68804176, "step": 118500 }, { "epoch": 17.650431933273758, "grad_norm": 1.8299816846847534, "learning_rate": 2.0730058009002793e-06, "loss": 0.3934, "num_input_tokens_seen": 68806928, "step": 118505 }, { "epoch": 17.651176645814715, "grad_norm": 1.1720508337020874, "learning_rate": 2.0717104379714304e-06, "loss": 0.6824, "num_input_tokens_seen": 68809616, "step": 118510 }, { "epoch": 17.651921358355676, "grad_norm": 1.9548882246017456, "learning_rate": 2.070415462392733e-06, "loss": 0.4946, "num_input_tokens_seen": 68812496, "step": 118515 }, { "epoch": 17.652666070896633, "grad_norm": 0.9267721772193909, "learning_rate": 2.0691208741860562e-06, "loss": 0.4459, "num_input_tokens_seen": 68815312, "step": 118520 }, { "epoch": 17.653410783437593, "grad_norm": 2.014960765838623, "learning_rate": 2.06782667337328e-06, "loss": 0.5667, "num_input_tokens_seen": 68818384, "step": 118525 }, { "epoch": 17.654155495978554, "grad_norm": 1.5157628059387207, "learning_rate": 2.0665328599762613e-06, "loss": 0.6143, "num_input_tokens_seen": 68820976, "step": 118530 }, { "epoch": 17.65490020851951, "grad_norm": 1.7715046405792236, "learning_rate": 2.0652394340168664e-06, "loss": 0.6299, "num_input_tokens_seen": 68823792, "step": 118535 }, { "epoch": 17.65564492106047, "grad_norm": 1.9609925746917725, "learning_rate": 2.0639463955169414e-06, "loss": 0.5655, "num_input_tokens_seen": 68826896, "step": 118540 }, { "epoch": 17.656389633601428, "grad_norm": 1.4350981712341309, "learning_rate": 2.0626537444983274e-06, "loss": 0.5003, "num_input_tokens_seen": 68829808, "step": 118545 }, { "epoch": 17.65713434614239, "grad_norm": 1.3135308027267456, "learning_rate": 2.061361480982868e-06, "loss": 0.4174, "num_input_tokens_seen": 68832624, "step": 118550 }, { "epoch": 17.65787905868335, "grad_norm": 1.2406456470489502, "learning_rate": 2.060069604992393e-06, "loss": 0.5406, "num_input_tokens_seen": 68835376, "step": 118555 }, { "epoch": 17.658623771224306, "grad_norm": 1.6982195377349854, "learning_rate": 2.058778116548729e-06, "loss": 0.6111, "num_input_tokens_seen": 68838800, "step": 118560 }, { "epoch": 17.659368483765267, "grad_norm": 1.3601349592208862, "learning_rate": 2.057487015673684e-06, "loss": 0.5954, "num_input_tokens_seen": 68841520, "step": 118565 }, { "epoch": 17.660113196306227, "grad_norm": 1.6978133916854858, "learning_rate": 2.0561963023890853e-06, "loss": 0.5237, "num_input_tokens_seen": 68844432, "step": 118570 }, { "epoch": 17.660857908847184, "grad_norm": 1.394002079963684, "learning_rate": 2.0549059767167255e-06, "loss": 0.6372, "num_input_tokens_seen": 68847472, "step": 118575 }, { "epoch": 17.661602621388145, "grad_norm": 3.4473376274108887, "learning_rate": 2.053616038678416e-06, "loss": 0.591, "num_input_tokens_seen": 68850672, "step": 118580 }, { "epoch": 17.6623473339291, "grad_norm": 1.533170461654663, "learning_rate": 2.0523264882959357e-06, "loss": 0.4758, "num_input_tokens_seen": 68853584, "step": 118585 }, { "epoch": 17.663092046470062, "grad_norm": 1.1281991004943848, "learning_rate": 2.051037325591079e-06, "loss": 0.4937, "num_input_tokens_seen": 68856528, "step": 118590 }, { "epoch": 17.663836759011023, "grad_norm": 1.7134839296340942, "learning_rate": 2.0497485505856256e-06, "loss": 0.4768, "num_input_tokens_seen": 68859504, "step": 118595 }, { "epoch": 17.66458147155198, "grad_norm": 1.3825713396072388, "learning_rate": 2.0484601633013383e-06, "loss": 0.4292, "num_input_tokens_seen": 68862416, "step": 118600 }, { "epoch": 17.66532618409294, "grad_norm": 2.6200971603393555, "learning_rate": 2.0471721637599944e-06, "loss": 0.4707, "num_input_tokens_seen": 68865520, "step": 118605 }, { "epoch": 17.6660708966339, "grad_norm": 1.9163832664489746, "learning_rate": 2.0458845519833487e-06, "loss": 0.5822, "num_input_tokens_seen": 68868144, "step": 118610 }, { "epoch": 17.666815609174858, "grad_norm": 4.777465343475342, "learning_rate": 2.044597327993153e-06, "loss": 0.6378, "num_input_tokens_seen": 68870896, "step": 118615 }, { "epoch": 17.66756032171582, "grad_norm": 1.4302139282226562, "learning_rate": 2.0433104918111514e-06, "loss": 0.601, "num_input_tokens_seen": 68873520, "step": 118620 }, { "epoch": 17.668305034256775, "grad_norm": 1.2508035898208618, "learning_rate": 2.0420240434590925e-06, "loss": 0.5902, "num_input_tokens_seen": 68876304, "step": 118625 }, { "epoch": 17.669049746797736, "grad_norm": 1.3767277002334595, "learning_rate": 2.0407379829587013e-06, "loss": 0.613, "num_input_tokens_seen": 68879088, "step": 118630 }, { "epoch": 17.669794459338696, "grad_norm": 1.2463749647140503, "learning_rate": 2.039452310331705e-06, "loss": 0.4415, "num_input_tokens_seen": 68881872, "step": 118635 }, { "epoch": 17.670539171879653, "grad_norm": 1.844256043434143, "learning_rate": 2.0381670255998297e-06, "loss": 0.5811, "num_input_tokens_seen": 68884784, "step": 118640 }, { "epoch": 17.671283884420614, "grad_norm": 2.019352436065674, "learning_rate": 2.0368821287847785e-06, "loss": 0.7259, "num_input_tokens_seen": 68887728, "step": 118645 }, { "epoch": 17.672028596961574, "grad_norm": 1.5452522039413452, "learning_rate": 2.035597619908272e-06, "loss": 0.5463, "num_input_tokens_seen": 68890768, "step": 118650 }, { "epoch": 17.67277330950253, "grad_norm": 1.7808398008346558, "learning_rate": 2.0343134989919995e-06, "loss": 0.5844, "num_input_tokens_seen": 68893648, "step": 118655 }, { "epoch": 17.673518022043492, "grad_norm": 1.417295217514038, "learning_rate": 2.033029766057662e-06, "loss": 0.5421, "num_input_tokens_seen": 68896528, "step": 118660 }, { "epoch": 17.67426273458445, "grad_norm": 3.0940396785736084, "learning_rate": 2.03174642112694e-06, "loss": 0.5342, "num_input_tokens_seen": 68899152, "step": 118665 }, { "epoch": 17.67500744712541, "grad_norm": 2.2609124183654785, "learning_rate": 2.0304634642215215e-06, "loss": 0.6886, "num_input_tokens_seen": 68902416, "step": 118670 }, { "epoch": 17.67575215966637, "grad_norm": 1.4404823780059814, "learning_rate": 2.029180895363081e-06, "loss": 0.5882, "num_input_tokens_seen": 68905328, "step": 118675 }, { "epoch": 17.676496872207327, "grad_norm": 1.9343713521957397, "learning_rate": 2.0278987145732786e-06, "loss": 0.5487, "num_input_tokens_seen": 68907920, "step": 118680 }, { "epoch": 17.677241584748288, "grad_norm": 0.9977290630340576, "learning_rate": 2.0266169218737836e-06, "loss": 0.505, "num_input_tokens_seen": 68910800, "step": 118685 }, { "epoch": 17.677986297289245, "grad_norm": 1.2255463600158691, "learning_rate": 2.0253355172862394e-06, "loss": 0.7165, "num_input_tokens_seen": 68913616, "step": 118690 }, { "epoch": 17.678731009830205, "grad_norm": 1.2646071910858154, "learning_rate": 2.0240545008323064e-06, "loss": 0.5572, "num_input_tokens_seen": 68916336, "step": 118695 }, { "epoch": 17.679475722371166, "grad_norm": 2.4163849353790283, "learning_rate": 2.0227738725336176e-06, "loss": 0.4675, "num_input_tokens_seen": 68919120, "step": 118700 }, { "epoch": 17.680220434912123, "grad_norm": 1.054383635520935, "learning_rate": 2.0214936324118137e-06, "loss": 0.5528, "num_input_tokens_seen": 68921968, "step": 118705 }, { "epoch": 17.680965147453083, "grad_norm": 1.5914905071258545, "learning_rate": 2.0202137804885196e-06, "loss": 0.4948, "num_input_tokens_seen": 68924848, "step": 118710 }, { "epoch": 17.681709859994044, "grad_norm": 1.388344168663025, "learning_rate": 2.018934316785359e-06, "loss": 0.5143, "num_input_tokens_seen": 68927696, "step": 118715 }, { "epoch": 17.682454572535, "grad_norm": 1.8837308883666992, "learning_rate": 2.017655241323946e-06, "loss": 0.7663, "num_input_tokens_seen": 68930352, "step": 118720 }, { "epoch": 17.68319928507596, "grad_norm": 1.0582592487335205, "learning_rate": 2.016376554125887e-06, "loss": 0.4695, "num_input_tokens_seen": 68933424, "step": 118725 }, { "epoch": 17.683943997616918, "grad_norm": 0.5943391919136047, "learning_rate": 2.0150982552127913e-06, "loss": 0.4503, "num_input_tokens_seen": 68936336, "step": 118730 }, { "epoch": 17.68468871015788, "grad_norm": 1.4402920007705688, "learning_rate": 2.0138203446062433e-06, "loss": 0.4609, "num_input_tokens_seen": 68939216, "step": 118735 }, { "epoch": 17.68543342269884, "grad_norm": 1.7552207708358765, "learning_rate": 2.0125428223278453e-06, "loss": 0.5219, "num_input_tokens_seen": 68942512, "step": 118740 }, { "epoch": 17.686178135239796, "grad_norm": 2.774670124053955, "learning_rate": 2.011265688399172e-06, "loss": 0.725, "num_input_tokens_seen": 68945200, "step": 118745 }, { "epoch": 17.686922847780757, "grad_norm": 2.9810941219329834, "learning_rate": 2.009988942841798e-06, "loss": 0.6313, "num_input_tokens_seen": 68948272, "step": 118750 }, { "epoch": 17.687667560321717, "grad_norm": 1.7635122537612915, "learning_rate": 2.008712585677297e-06, "loss": 0.6734, "num_input_tokens_seen": 68951472, "step": 118755 }, { "epoch": 17.688412272862674, "grad_norm": 0.9733136892318726, "learning_rate": 2.007436616927225e-06, "loss": 0.5499, "num_input_tokens_seen": 68954384, "step": 118760 }, { "epoch": 17.689156985403635, "grad_norm": 1.243347406387329, "learning_rate": 2.006161036613147e-06, "loss": 0.527, "num_input_tokens_seen": 68957040, "step": 118765 }, { "epoch": 17.68990169794459, "grad_norm": 2.312147378921509, "learning_rate": 2.0048858447566045e-06, "loss": 0.6271, "num_input_tokens_seen": 68959664, "step": 118770 }, { "epoch": 17.690646410485552, "grad_norm": 1.5610241889953613, "learning_rate": 2.003611041379147e-06, "loss": 0.5459, "num_input_tokens_seen": 68962576, "step": 118775 }, { "epoch": 17.691391123026513, "grad_norm": 1.2558926343917847, "learning_rate": 2.0023366265023074e-06, "loss": 0.5656, "num_input_tokens_seen": 68965712, "step": 118780 }, { "epoch": 17.69213583556747, "grad_norm": 2.0627601146698, "learning_rate": 2.0010626001476184e-06, "loss": 0.6228, "num_input_tokens_seen": 68968304, "step": 118785 }, { "epoch": 17.69288054810843, "grad_norm": 2.58681058883667, "learning_rate": 1.999788962336599e-06, "loss": 0.7274, "num_input_tokens_seen": 68971344, "step": 118790 }, { "epoch": 17.69362526064939, "grad_norm": 1.504555106163025, "learning_rate": 1.9985157130907707e-06, "loss": 0.7304, "num_input_tokens_seen": 68974032, "step": 118795 }, { "epoch": 17.694369973190348, "grad_norm": 1.274438500404358, "learning_rate": 1.997242852431644e-06, "loss": 0.4048, "num_input_tokens_seen": 68977040, "step": 118800 }, { "epoch": 17.69511468573131, "grad_norm": 1.1528319120407104, "learning_rate": 1.9959703803807156e-06, "loss": 0.5113, "num_input_tokens_seen": 68979792, "step": 118805 }, { "epoch": 17.695859398272265, "grad_norm": 1.5031810998916626, "learning_rate": 1.994698296959491e-06, "loss": 0.5106, "num_input_tokens_seen": 68982544, "step": 118810 }, { "epoch": 17.696604110813226, "grad_norm": 0.8973279595375061, "learning_rate": 1.9934266021894575e-06, "loss": 0.4827, "num_input_tokens_seen": 68985360, "step": 118815 }, { "epoch": 17.697348823354186, "grad_norm": 3.1397554874420166, "learning_rate": 1.9921552960920994e-06, "loss": 0.5569, "num_input_tokens_seen": 68988208, "step": 118820 }, { "epoch": 17.698093535895143, "grad_norm": 1.6516239643096924, "learning_rate": 1.990884378688887e-06, "loss": 0.4851, "num_input_tokens_seen": 68991280, "step": 118825 }, { "epoch": 17.698838248436104, "grad_norm": 1.1554595232009888, "learning_rate": 1.989613850001304e-06, "loss": 0.7146, "num_input_tokens_seen": 68994288, "step": 118830 }, { "epoch": 17.699582960977065, "grad_norm": 2.1406030654907227, "learning_rate": 1.988343710050808e-06, "loss": 0.7947, "num_input_tokens_seen": 68997328, "step": 118835 }, { "epoch": 17.70032767351802, "grad_norm": 1.3598730564117432, "learning_rate": 1.987073958858851e-06, "loss": 0.676, "num_input_tokens_seen": 69000144, "step": 118840 }, { "epoch": 17.701072386058982, "grad_norm": 1.673466444015503, "learning_rate": 1.985804596446897e-06, "loss": 0.395, "num_input_tokens_seen": 69003152, "step": 118845 }, { "epoch": 17.70181709859994, "grad_norm": 1.3730965852737427, "learning_rate": 1.984535622836378e-06, "loss": 0.5814, "num_input_tokens_seen": 69005904, "step": 118850 }, { "epoch": 17.7025618111409, "grad_norm": 1.5610811710357666, "learning_rate": 1.983267038048742e-06, "loss": 0.6766, "num_input_tokens_seen": 69008784, "step": 118855 }, { "epoch": 17.70330652368186, "grad_norm": 1.7735050916671753, "learning_rate": 1.981998842105412e-06, "loss": 0.3585, "num_input_tokens_seen": 69011376, "step": 118860 }, { "epoch": 17.704051236222817, "grad_norm": 2.431361675262451, "learning_rate": 1.980731035027822e-06, "loss": 0.4831, "num_input_tokens_seen": 69014512, "step": 118865 }, { "epoch": 17.704795948763778, "grad_norm": 1.3652609586715698, "learning_rate": 1.979463616837385e-06, "loss": 0.7485, "num_input_tokens_seen": 69017648, "step": 118870 }, { "epoch": 17.705540661304738, "grad_norm": 2.0591273307800293, "learning_rate": 1.9781965875555087e-06, "loss": 0.5932, "num_input_tokens_seen": 69020400, "step": 118875 }, { "epoch": 17.706285373845695, "grad_norm": 1.3015230894088745, "learning_rate": 1.976929947203607e-06, "loss": 0.5044, "num_input_tokens_seen": 69023216, "step": 118880 }, { "epoch": 17.707030086386656, "grad_norm": 2.940255641937256, "learning_rate": 1.9756636958030733e-06, "loss": 0.6879, "num_input_tokens_seen": 69026192, "step": 118885 }, { "epoch": 17.707774798927613, "grad_norm": 1.500019907951355, "learning_rate": 1.9743978333753023e-06, "loss": 0.5664, "num_input_tokens_seen": 69028944, "step": 118890 }, { "epoch": 17.708519511468573, "grad_norm": 1.2564008235931396, "learning_rate": 1.9731323599416736e-06, "loss": 0.5912, "num_input_tokens_seen": 69031632, "step": 118895 }, { "epoch": 17.709264224009534, "grad_norm": 1.037462592124939, "learning_rate": 1.9718672755235728e-06, "loss": 0.6683, "num_input_tokens_seen": 69034288, "step": 118900 }, { "epoch": 17.71000893655049, "grad_norm": 1.2272034883499146, "learning_rate": 1.9706025801423666e-06, "loss": 0.6094, "num_input_tokens_seen": 69037136, "step": 118905 }, { "epoch": 17.71075364909145, "grad_norm": 3.005211353302002, "learning_rate": 1.969338273819429e-06, "loss": 0.6501, "num_input_tokens_seen": 69040048, "step": 118910 }, { "epoch": 17.711498361632408, "grad_norm": 1.2644031047821045, "learning_rate": 1.9680743565761107e-06, "loss": 0.7708, "num_input_tokens_seen": 69042896, "step": 118915 }, { "epoch": 17.71224307417337, "grad_norm": 2.2521469593048096, "learning_rate": 1.9668108284337654e-06, "loss": 0.5968, "num_input_tokens_seen": 69046032, "step": 118920 }, { "epoch": 17.71298778671433, "grad_norm": 2.2448158264160156, "learning_rate": 1.9655476894137465e-06, "loss": 0.583, "num_input_tokens_seen": 69048944, "step": 118925 }, { "epoch": 17.713732499255286, "grad_norm": 1.4397906064987183, "learning_rate": 1.9642849395373836e-06, "loss": 0.5727, "num_input_tokens_seen": 69052080, "step": 118930 }, { "epoch": 17.714477211796247, "grad_norm": 1.9730768203735352, "learning_rate": 1.963022578826018e-06, "loss": 0.5678, "num_input_tokens_seen": 69054960, "step": 118935 }, { "epoch": 17.715221924337207, "grad_norm": 1.2982215881347656, "learning_rate": 1.961760607300972e-06, "loss": 0.6316, "num_input_tokens_seen": 69058256, "step": 118940 }, { "epoch": 17.715966636878164, "grad_norm": 2.7760772705078125, "learning_rate": 1.960499024983564e-06, "loss": 0.7193, "num_input_tokens_seen": 69061072, "step": 118945 }, { "epoch": 17.716711349419125, "grad_norm": 1.6079349517822266, "learning_rate": 1.9592378318951054e-06, "loss": 0.575, "num_input_tokens_seen": 69064304, "step": 118950 }, { "epoch": 17.717456061960082, "grad_norm": 2.6793646812438965, "learning_rate": 1.957977028056912e-06, "loss": 0.509, "num_input_tokens_seen": 69067088, "step": 118955 }, { "epoch": 17.718200774501042, "grad_norm": 2.434535026550293, "learning_rate": 1.9567166134902752e-06, "loss": 0.4798, "num_input_tokens_seen": 69069872, "step": 118960 }, { "epoch": 17.718945487042003, "grad_norm": 1.4186924695968628, "learning_rate": 1.955456588216489e-06, "loss": 0.5082, "num_input_tokens_seen": 69072432, "step": 118965 }, { "epoch": 17.71969019958296, "grad_norm": 2.1052846908569336, "learning_rate": 1.9541969522568456e-06, "loss": 0.7097, "num_input_tokens_seen": 69075280, "step": 118970 }, { "epoch": 17.72043491212392, "grad_norm": 1.5455747842788696, "learning_rate": 1.9529377056326183e-06, "loss": 0.6808, "num_input_tokens_seen": 69078192, "step": 118975 }, { "epoch": 17.72117962466488, "grad_norm": 1.9411343336105347, "learning_rate": 1.951678848365088e-06, "loss": 0.6259, "num_input_tokens_seen": 69081392, "step": 118980 }, { "epoch": 17.721924337205838, "grad_norm": 0.5395528078079224, "learning_rate": 1.950420380475515e-06, "loss": 0.3783, "num_input_tokens_seen": 69084016, "step": 118985 }, { "epoch": 17.7226690497468, "grad_norm": 2.2322962284088135, "learning_rate": 1.949162301985166e-06, "loss": 0.7014, "num_input_tokens_seen": 69087088, "step": 118990 }, { "epoch": 17.723413762287755, "grad_norm": 2.0782113075256348, "learning_rate": 1.947904612915294e-06, "loss": 0.7142, "num_input_tokens_seen": 69090064, "step": 118995 }, { "epoch": 17.724158474828716, "grad_norm": 1.9219738245010376, "learning_rate": 1.9466473132871392e-06, "loss": 0.5474, "num_input_tokens_seen": 69092976, "step": 119000 }, { "epoch": 17.724903187369677, "grad_norm": 1.384914755821228, "learning_rate": 1.945390403121952e-06, "loss": 0.6232, "num_input_tokens_seen": 69096240, "step": 119005 }, { "epoch": 17.725647899910633, "grad_norm": 1.2351243495941162, "learning_rate": 1.944133882440963e-06, "loss": 0.4997, "num_input_tokens_seen": 69099248, "step": 119010 }, { "epoch": 17.726392612451594, "grad_norm": 1.6116323471069336, "learning_rate": 1.9428777512653957e-06, "loss": 0.5836, "num_input_tokens_seen": 69102288, "step": 119015 }, { "epoch": 17.727137324992555, "grad_norm": 2.904611825942993, "learning_rate": 1.941622009616473e-06, "loss": 0.4763, "num_input_tokens_seen": 69105392, "step": 119020 }, { "epoch": 17.72788203753351, "grad_norm": 1.6777374744415283, "learning_rate": 1.9403666575154163e-06, "loss": 0.6212, "num_input_tokens_seen": 69108016, "step": 119025 }, { "epoch": 17.728626750074472, "grad_norm": 1.4432302713394165, "learning_rate": 1.9391116949834227e-06, "loss": 0.4001, "num_input_tokens_seen": 69110704, "step": 119030 }, { "epoch": 17.72937146261543, "grad_norm": 1.973945140838623, "learning_rate": 1.937857122041703e-06, "loss": 0.6332, "num_input_tokens_seen": 69113744, "step": 119035 }, { "epoch": 17.73011617515639, "grad_norm": 1.029463768005371, "learning_rate": 1.936602938711449e-06, "loss": 0.4643, "num_input_tokens_seen": 69116784, "step": 119040 }, { "epoch": 17.73086088769735, "grad_norm": 1.1497248411178589, "learning_rate": 1.93534914501384e-06, "loss": 0.5532, "num_input_tokens_seen": 69119856, "step": 119045 }, { "epoch": 17.731605600238307, "grad_norm": 1.5121709108352661, "learning_rate": 1.934095740970074e-06, "loss": 0.578, "num_input_tokens_seen": 69123120, "step": 119050 }, { "epoch": 17.732350312779268, "grad_norm": 1.5922584533691406, "learning_rate": 1.932842726601311e-06, "loss": 0.4238, "num_input_tokens_seen": 69126096, "step": 119055 }, { "epoch": 17.733095025320225, "grad_norm": 2.1608738899230957, "learning_rate": 1.931590101928729e-06, "loss": 0.5174, "num_input_tokens_seen": 69128912, "step": 119060 }, { "epoch": 17.733839737861185, "grad_norm": 1.4236993789672852, "learning_rate": 1.9303378669734834e-06, "loss": 0.5599, "num_input_tokens_seen": 69131920, "step": 119065 }, { "epoch": 17.734584450402146, "grad_norm": 1.6868122816085815, "learning_rate": 1.9290860217567374e-06, "loss": 0.5659, "num_input_tokens_seen": 69134608, "step": 119070 }, { "epoch": 17.735329162943103, "grad_norm": 1.2992740869522095, "learning_rate": 1.9278345662996356e-06, "loss": 0.4461, "num_input_tokens_seen": 69137616, "step": 119075 }, { "epoch": 17.736073875484063, "grad_norm": 2.1077919006347656, "learning_rate": 1.926583500623316e-06, "loss": 0.8895, "num_input_tokens_seen": 69140400, "step": 119080 }, { "epoch": 17.736818588025024, "grad_norm": 1.464219570159912, "learning_rate": 1.925332824748921e-06, "loss": 0.4634, "num_input_tokens_seen": 69143248, "step": 119085 }, { "epoch": 17.73756330056598, "grad_norm": 3.0707507133483887, "learning_rate": 1.9240825386975692e-06, "loss": 0.7329, "num_input_tokens_seen": 69146288, "step": 119090 }, { "epoch": 17.73830801310694, "grad_norm": 1.409324288368225, "learning_rate": 1.9228326424903966e-06, "loss": 0.5622, "num_input_tokens_seen": 69149136, "step": 119095 }, { "epoch": 17.7390527256479, "grad_norm": 0.9446173906326294, "learning_rate": 1.9215831361485054e-06, "loss": 0.6885, "num_input_tokens_seen": 69151952, "step": 119100 }, { "epoch": 17.73979743818886, "grad_norm": 1.0453784465789795, "learning_rate": 1.920334019693015e-06, "loss": 0.6566, "num_input_tokens_seen": 69154576, "step": 119105 }, { "epoch": 17.74054215072982, "grad_norm": 2.076951026916504, "learning_rate": 1.9190852931450204e-06, "loss": 0.6175, "num_input_tokens_seen": 69157680, "step": 119110 }, { "epoch": 17.741286863270776, "grad_norm": 1.715397596359253, "learning_rate": 1.917836956525626e-06, "loss": 0.6003, "num_input_tokens_seen": 69160528, "step": 119115 }, { "epoch": 17.742031575811737, "grad_norm": 0.956417977809906, "learning_rate": 1.916589009855918e-06, "loss": 0.4404, "num_input_tokens_seen": 69163184, "step": 119120 }, { "epoch": 17.742776288352697, "grad_norm": 6.065086364746094, "learning_rate": 1.9153414531569713e-06, "loss": 0.6406, "num_input_tokens_seen": 69166096, "step": 119125 }, { "epoch": 17.743521000893654, "grad_norm": 1.0297385454177856, "learning_rate": 1.9140942864498747e-06, "loss": 0.5603, "num_input_tokens_seen": 69169072, "step": 119130 }, { "epoch": 17.744265713434615, "grad_norm": 1.9943017959594727, "learning_rate": 1.912847509755686e-06, "loss": 0.5696, "num_input_tokens_seen": 69171728, "step": 119135 }, { "epoch": 17.745010425975572, "grad_norm": 1.0649709701538086, "learning_rate": 1.911601123095477e-06, "loss": 0.4757, "num_input_tokens_seen": 69174352, "step": 119140 }, { "epoch": 17.745755138516532, "grad_norm": 2.4502692222595215, "learning_rate": 1.910355126490304e-06, "loss": 0.7006, "num_input_tokens_seen": 69176816, "step": 119145 }, { "epoch": 17.746499851057493, "grad_norm": 3.769224166870117, "learning_rate": 1.909109519961211e-06, "loss": 0.5871, "num_input_tokens_seen": 69179920, "step": 119150 }, { "epoch": 17.74724456359845, "grad_norm": 1.7637451887130737, "learning_rate": 1.907864303529247e-06, "loss": 0.46, "num_input_tokens_seen": 69182960, "step": 119155 }, { "epoch": 17.74798927613941, "grad_norm": 1.1677401065826416, "learning_rate": 1.9066194772154379e-06, "loss": 0.423, "num_input_tokens_seen": 69186192, "step": 119160 }, { "epoch": 17.74873398868037, "grad_norm": 1.1117442846298218, "learning_rate": 1.90537504104083e-06, "loss": 0.4759, "num_input_tokens_seen": 69188752, "step": 119165 }, { "epoch": 17.749478701221328, "grad_norm": 1.255595326423645, "learning_rate": 1.9041309950264319e-06, "loss": 0.4474, "num_input_tokens_seen": 69192048, "step": 119170 }, { "epoch": 17.75022341376229, "grad_norm": 1.364487648010254, "learning_rate": 1.902887339193271e-06, "loss": 0.6382, "num_input_tokens_seen": 69194928, "step": 119175 }, { "epoch": 17.750968126303245, "grad_norm": 1.338949203491211, "learning_rate": 1.9016440735623503e-06, "loss": 0.4452, "num_input_tokens_seen": 69198064, "step": 119180 }, { "epoch": 17.751712838844206, "grad_norm": 2.1551544666290283, "learning_rate": 1.9004011981546804e-06, "loss": 0.7348, "num_input_tokens_seen": 69200912, "step": 119185 }, { "epoch": 17.752457551385167, "grad_norm": 0.9425879120826721, "learning_rate": 1.8991587129912531e-06, "loss": 0.6775, "num_input_tokens_seen": 69203536, "step": 119190 }, { "epoch": 17.753202263926124, "grad_norm": 2.960909843444824, "learning_rate": 1.8979166180930625e-06, "loss": 0.6755, "num_input_tokens_seen": 69206384, "step": 119195 }, { "epoch": 17.753946976467084, "grad_norm": 1.6664258241653442, "learning_rate": 1.896674913481089e-06, "loss": 0.588, "num_input_tokens_seen": 69209040, "step": 119200 }, { "epoch": 17.75469168900804, "grad_norm": 1.415286898612976, "learning_rate": 1.8954335991763107e-06, "loss": 0.6912, "num_input_tokens_seen": 69212176, "step": 119205 }, { "epoch": 17.755436401549, "grad_norm": 2.0958259105682373, "learning_rate": 1.8941926751997018e-06, "loss": 0.5115, "num_input_tokens_seen": 69214800, "step": 119210 }, { "epoch": 17.756181114089962, "grad_norm": 1.2347885370254517, "learning_rate": 1.8929521415722267e-06, "loss": 0.682, "num_input_tokens_seen": 69217584, "step": 119215 }, { "epoch": 17.75692582663092, "grad_norm": 1.827471137046814, "learning_rate": 1.8917119983148378e-06, "loss": 0.7724, "num_input_tokens_seen": 69220592, "step": 119220 }, { "epoch": 17.75767053917188, "grad_norm": 2.338343381881714, "learning_rate": 1.8904722454484825e-06, "loss": 0.8116, "num_input_tokens_seen": 69223536, "step": 119225 }, { "epoch": 17.75841525171284, "grad_norm": 2.35939359664917, "learning_rate": 1.8892328829941186e-06, "loss": 0.5568, "num_input_tokens_seen": 69226320, "step": 119230 }, { "epoch": 17.759159964253797, "grad_norm": 2.6305317878723145, "learning_rate": 1.8879939109726713e-06, "loss": 0.7584, "num_input_tokens_seen": 69229360, "step": 119235 }, { "epoch": 17.759904676794758, "grad_norm": 3.1546387672424316, "learning_rate": 1.8867553294050795e-06, "loss": 0.4861, "num_input_tokens_seen": 69232368, "step": 119240 }, { "epoch": 17.76064938933572, "grad_norm": 2.5337929725646973, "learning_rate": 1.8855171383122677e-06, "loss": 0.5834, "num_input_tokens_seen": 69235792, "step": 119245 }, { "epoch": 17.761394101876675, "grad_norm": 2.450462579727173, "learning_rate": 1.8842793377151446e-06, "loss": 0.7696, "num_input_tokens_seen": 69238704, "step": 119250 }, { "epoch": 17.762138814417636, "grad_norm": 2.199906826019287, "learning_rate": 1.8830419276346352e-06, "loss": 0.8035, "num_input_tokens_seen": 69241744, "step": 119255 }, { "epoch": 17.762883526958593, "grad_norm": 1.2441284656524658, "learning_rate": 1.8818049080916305e-06, "loss": 0.4215, "num_input_tokens_seen": 69244528, "step": 119260 }, { "epoch": 17.763628239499553, "grad_norm": 2.715449810028076, "learning_rate": 1.8805682791070422e-06, "loss": 0.6694, "num_input_tokens_seen": 69247440, "step": 119265 }, { "epoch": 17.764372952040514, "grad_norm": 1.7875221967697144, "learning_rate": 1.8793320407017534e-06, "loss": 0.7525, "num_input_tokens_seen": 69250256, "step": 119270 }, { "epoch": 17.76511766458147, "grad_norm": 1.797162652015686, "learning_rate": 1.8780961928966528e-06, "loss": 0.5678, "num_input_tokens_seen": 69253104, "step": 119275 }, { "epoch": 17.76586237712243, "grad_norm": 3.915364980697632, "learning_rate": 1.8768607357126128e-06, "loss": 0.6789, "num_input_tokens_seen": 69255792, "step": 119280 }, { "epoch": 17.76660708966339, "grad_norm": 1.1010630130767822, "learning_rate": 1.875625669170511e-06, "loss": 0.4663, "num_input_tokens_seen": 69258768, "step": 119285 }, { "epoch": 17.76735180220435, "grad_norm": 1.1818487644195557, "learning_rate": 1.874390993291214e-06, "loss": 0.5237, "num_input_tokens_seen": 69261616, "step": 119290 }, { "epoch": 17.76809651474531, "grad_norm": 1.5255502462387085, "learning_rate": 1.8731567080955692e-06, "loss": 0.5943, "num_input_tokens_seen": 69264656, "step": 119295 }, { "epoch": 17.768841227286266, "grad_norm": 1.4370516538619995, "learning_rate": 1.871922813604443e-06, "loss": 0.5544, "num_input_tokens_seen": 69267920, "step": 119300 }, { "epoch": 17.769585939827227, "grad_norm": 1.5625053644180298, "learning_rate": 1.870689309838672e-06, "loss": 0.7024, "num_input_tokens_seen": 69270896, "step": 119305 }, { "epoch": 17.770330652368187, "grad_norm": 3.001311779022217, "learning_rate": 1.8694561968191e-06, "loss": 0.6043, "num_input_tokens_seen": 69274096, "step": 119310 }, { "epoch": 17.771075364909144, "grad_norm": 1.0886739492416382, "learning_rate": 1.8682234745665522e-06, "loss": 0.5237, "num_input_tokens_seen": 69276880, "step": 119315 }, { "epoch": 17.771820077450105, "grad_norm": 0.9885845184326172, "learning_rate": 1.866991143101865e-06, "loss": 0.537, "num_input_tokens_seen": 69279856, "step": 119320 }, { "epoch": 17.772564789991062, "grad_norm": 1.2120039463043213, "learning_rate": 1.8657592024458491e-06, "loss": 0.6376, "num_input_tokens_seen": 69282832, "step": 119325 }, { "epoch": 17.773309502532022, "grad_norm": 2.192938804626465, "learning_rate": 1.8645276526193162e-06, "loss": 0.5844, "num_input_tokens_seen": 69285552, "step": 119330 }, { "epoch": 17.774054215072983, "grad_norm": 1.4802073240280151, "learning_rate": 1.8632964936430768e-06, "loss": 0.6733, "num_input_tokens_seen": 69288464, "step": 119335 }, { "epoch": 17.77479892761394, "grad_norm": 1.2335796356201172, "learning_rate": 1.8620657255379314e-06, "loss": 0.5284, "num_input_tokens_seen": 69291280, "step": 119340 }, { "epoch": 17.7755436401549, "grad_norm": 1.5169801712036133, "learning_rate": 1.860835348324666e-06, "loss": 0.7302, "num_input_tokens_seen": 69294320, "step": 119345 }, { "epoch": 17.77628835269586, "grad_norm": 4.229835033416748, "learning_rate": 1.8596053620240667e-06, "loss": 0.6864, "num_input_tokens_seen": 69297296, "step": 119350 }, { "epoch": 17.777033065236818, "grad_norm": 0.8748176097869873, "learning_rate": 1.8583757666569196e-06, "loss": 0.4163, "num_input_tokens_seen": 69300528, "step": 119355 }, { "epoch": 17.77777777777778, "grad_norm": 1.3385738134384155, "learning_rate": 1.8571465622439943e-06, "loss": 0.5728, "num_input_tokens_seen": 69303344, "step": 119360 }, { "epoch": 17.778522490318736, "grad_norm": 1.485974907875061, "learning_rate": 1.8559177488060547e-06, "loss": 0.7748, "num_input_tokens_seen": 69306000, "step": 119365 }, { "epoch": 17.779267202859696, "grad_norm": 0.6267123818397522, "learning_rate": 1.854689326363862e-06, "loss": 0.5072, "num_input_tokens_seen": 69309008, "step": 119370 }, { "epoch": 17.780011915400657, "grad_norm": 1.7510511875152588, "learning_rate": 1.8534612949381691e-06, "loss": 0.5085, "num_input_tokens_seen": 69311888, "step": 119375 }, { "epoch": 17.780756627941614, "grad_norm": 1.0018912553787231, "learning_rate": 1.8522336545497232e-06, "loss": 0.5127, "num_input_tokens_seen": 69314768, "step": 119380 }, { "epoch": 17.781501340482574, "grad_norm": 2.093069076538086, "learning_rate": 1.8510064052192604e-06, "loss": 0.6221, "num_input_tokens_seen": 69317808, "step": 119385 }, { "epoch": 17.782246053023535, "grad_norm": 1.62354576587677, "learning_rate": 1.8497795469675227e-06, "loss": 0.6001, "num_input_tokens_seen": 69320784, "step": 119390 }, { "epoch": 17.78299076556449, "grad_norm": 2.9720253944396973, "learning_rate": 1.848553079815224e-06, "loss": 0.5636, "num_input_tokens_seen": 69323472, "step": 119395 }, { "epoch": 17.783735478105452, "grad_norm": 2.3402516841888428, "learning_rate": 1.8473270037830975e-06, "loss": 0.6801, "num_input_tokens_seen": 69326352, "step": 119400 }, { "epoch": 17.78448019064641, "grad_norm": 1.8456569910049438, "learning_rate": 1.8461013188918492e-06, "loss": 0.6167, "num_input_tokens_seen": 69329136, "step": 119405 }, { "epoch": 17.78522490318737, "grad_norm": 1.1694220304489136, "learning_rate": 1.8448760251621844e-06, "loss": 0.4336, "num_input_tokens_seen": 69331952, "step": 119410 }, { "epoch": 17.78596961572833, "grad_norm": 1.476118803024292, "learning_rate": 1.843651122614809e-06, "loss": 0.6928, "num_input_tokens_seen": 69334768, "step": 119415 }, { "epoch": 17.786714328269287, "grad_norm": 2.061166286468506, "learning_rate": 1.8424266112704064e-06, "loss": 0.4913, "num_input_tokens_seen": 69337552, "step": 119420 }, { "epoch": 17.787459040810248, "grad_norm": 2.4325592517852783, "learning_rate": 1.841202491149674e-06, "loss": 0.6987, "num_input_tokens_seen": 69340656, "step": 119425 }, { "epoch": 17.788203753351205, "grad_norm": 2.159670352935791, "learning_rate": 1.839978762273284e-06, "loss": 0.6362, "num_input_tokens_seen": 69343536, "step": 119430 }, { "epoch": 17.788948465892165, "grad_norm": 1.4997057914733887, "learning_rate": 1.838755424661917e-06, "loss": 0.6521, "num_input_tokens_seen": 69346480, "step": 119435 }, { "epoch": 17.789693178433126, "grad_norm": 1.2238128185272217, "learning_rate": 1.8375324783362402e-06, "loss": 0.6845, "num_input_tokens_seen": 69349584, "step": 119440 }, { "epoch": 17.790437890974083, "grad_norm": 1.9046660661697388, "learning_rate": 1.8363099233169034e-06, "loss": 0.5979, "num_input_tokens_seen": 69352688, "step": 119445 }, { "epoch": 17.791182603515043, "grad_norm": 1.811631679534912, "learning_rate": 1.8350877596245735e-06, "loss": 0.707, "num_input_tokens_seen": 69355504, "step": 119450 }, { "epoch": 17.791927316056004, "grad_norm": 2.4360852241516113, "learning_rate": 1.8338659872798896e-06, "loss": 0.6155, "num_input_tokens_seen": 69358192, "step": 119455 }, { "epoch": 17.79267202859696, "grad_norm": 1.4551576375961304, "learning_rate": 1.8326446063034964e-06, "loss": 0.5088, "num_input_tokens_seen": 69360880, "step": 119460 }, { "epoch": 17.79341674113792, "grad_norm": 1.1301813125610352, "learning_rate": 1.8314236167160243e-06, "loss": 0.68, "num_input_tokens_seen": 69363696, "step": 119465 }, { "epoch": 17.79416145367888, "grad_norm": 1.3474197387695312, "learning_rate": 1.8302030185381042e-06, "loss": 0.7039, "num_input_tokens_seen": 69366512, "step": 119470 }, { "epoch": 17.79490616621984, "grad_norm": 1.0993313789367676, "learning_rate": 1.8289828117903584e-06, "loss": 0.5074, "num_input_tokens_seen": 69369296, "step": 119475 }, { "epoch": 17.7956508787608, "grad_norm": 1.5809340476989746, "learning_rate": 1.8277629964933958e-06, "loss": 0.4413, "num_input_tokens_seen": 69372080, "step": 119480 }, { "epoch": 17.796395591301756, "grad_norm": 1.1553666591644287, "learning_rate": 1.8265435726678271e-06, "loss": 0.6233, "num_input_tokens_seen": 69374672, "step": 119485 }, { "epoch": 17.797140303842717, "grad_norm": 1.364382266998291, "learning_rate": 1.8253245403342472e-06, "loss": 0.5754, "num_input_tokens_seen": 69377584, "step": 119490 }, { "epoch": 17.797885016383677, "grad_norm": 1.7630181312561035, "learning_rate": 1.824105899513262e-06, "loss": 0.8514, "num_input_tokens_seen": 69380240, "step": 119495 }, { "epoch": 17.798629728924634, "grad_norm": 1.1866722106933594, "learning_rate": 1.8228876502254465e-06, "loss": 0.5506, "num_input_tokens_seen": 69382992, "step": 119500 }, { "epoch": 17.799374441465595, "grad_norm": 1.4603872299194336, "learning_rate": 1.8216697924913928e-06, "loss": 0.4608, "num_input_tokens_seen": 69385584, "step": 119505 }, { "epoch": 17.800119154006552, "grad_norm": 1.208028793334961, "learning_rate": 1.8204523263316647e-06, "loss": 0.6048, "num_input_tokens_seen": 69388720, "step": 119510 }, { "epoch": 17.800863866547513, "grad_norm": 1.0493441820144653, "learning_rate": 1.8192352517668432e-06, "loss": 0.434, "num_input_tokens_seen": 69391920, "step": 119515 }, { "epoch": 17.801608579088473, "grad_norm": 1.6169402599334717, "learning_rate": 1.818018568817481e-06, "loss": 0.4562, "num_input_tokens_seen": 69394576, "step": 119520 }, { "epoch": 17.80235329162943, "grad_norm": 2.035060405731201, "learning_rate": 1.8168022775041288e-06, "loss": 0.5527, "num_input_tokens_seen": 69397424, "step": 119525 }, { "epoch": 17.80309800417039, "grad_norm": 1.8239097595214844, "learning_rate": 1.8155863778473447e-06, "loss": 0.4975, "num_input_tokens_seen": 69400400, "step": 119530 }, { "epoch": 17.80384271671135, "grad_norm": 1.69840669631958, "learning_rate": 1.8143708698676597e-06, "loss": 0.4794, "num_input_tokens_seen": 69403152, "step": 119535 }, { "epoch": 17.804587429252308, "grad_norm": 3.4350576400756836, "learning_rate": 1.8131557535856214e-06, "loss": 0.6748, "num_input_tokens_seen": 69406064, "step": 119540 }, { "epoch": 17.80533214179327, "grad_norm": 1.0197287797927856, "learning_rate": 1.8119410290217465e-06, "loss": 0.7268, "num_input_tokens_seen": 69408720, "step": 119545 }, { "epoch": 17.806076854334226, "grad_norm": 2.3913819789886475, "learning_rate": 1.810726696196563e-06, "loss": 0.6334, "num_input_tokens_seen": 69411600, "step": 119550 }, { "epoch": 17.806821566875186, "grad_norm": 1.5088454484939575, "learning_rate": 1.8095127551305797e-06, "loss": 0.7043, "num_input_tokens_seen": 69414416, "step": 119555 }, { "epoch": 17.807566279416147, "grad_norm": 2.1316206455230713, "learning_rate": 1.8082992058443132e-06, "loss": 0.5885, "num_input_tokens_seen": 69417424, "step": 119560 }, { "epoch": 17.808310991957104, "grad_norm": 2.2199137210845947, "learning_rate": 1.8070860483582585e-06, "loss": 0.5706, "num_input_tokens_seen": 69420688, "step": 119565 }, { "epoch": 17.809055704498064, "grad_norm": 2.0761830806732178, "learning_rate": 1.8058732826929104e-06, "loss": 0.6535, "num_input_tokens_seen": 69423376, "step": 119570 }, { "epoch": 17.80980041703902, "grad_norm": 2.8856639862060547, "learning_rate": 1.8046609088687633e-06, "loss": 0.6388, "num_input_tokens_seen": 69426128, "step": 119575 }, { "epoch": 17.81054512957998, "grad_norm": 0.8132461905479431, "learning_rate": 1.8034489269062899e-06, "loss": 0.55, "num_input_tokens_seen": 69429040, "step": 119580 }, { "epoch": 17.811289842120942, "grad_norm": 2.4371232986450195, "learning_rate": 1.8022373368259765e-06, "loss": 0.653, "num_input_tokens_seen": 69431856, "step": 119585 }, { "epoch": 17.8120345546619, "grad_norm": 1.5240145921707153, "learning_rate": 1.801026138648282e-06, "loss": 0.6915, "num_input_tokens_seen": 69434768, "step": 119590 }, { "epoch": 17.81277926720286, "grad_norm": 3.1718430519104004, "learning_rate": 1.7998153323936755e-06, "loss": 0.6258, "num_input_tokens_seen": 69437456, "step": 119595 }, { "epoch": 17.81352397974382, "grad_norm": 1.5695806741714478, "learning_rate": 1.798604918082611e-06, "loss": 0.795, "num_input_tokens_seen": 69440432, "step": 119600 }, { "epoch": 17.814268692284777, "grad_norm": 2.710265874862671, "learning_rate": 1.7973948957355352e-06, "loss": 0.5779, "num_input_tokens_seen": 69443472, "step": 119605 }, { "epoch": 17.815013404825738, "grad_norm": 1.944347858428955, "learning_rate": 1.796185265372885e-06, "loss": 0.596, "num_input_tokens_seen": 69446736, "step": 119610 }, { "epoch": 17.815758117366695, "grad_norm": 2.011972665786743, "learning_rate": 1.7949760270151078e-06, "loss": 0.4468, "num_input_tokens_seen": 69449520, "step": 119615 }, { "epoch": 17.816502829907655, "grad_norm": 1.8821775913238525, "learning_rate": 1.7937671806826262e-06, "loss": 0.62, "num_input_tokens_seen": 69452432, "step": 119620 }, { "epoch": 17.817247542448616, "grad_norm": 1.6397583484649658, "learning_rate": 1.792558726395857e-06, "loss": 0.6198, "num_input_tokens_seen": 69455152, "step": 119625 }, { "epoch": 17.817992254989573, "grad_norm": 1.7142915725708008, "learning_rate": 1.791350664175223e-06, "loss": 0.5971, "num_input_tokens_seen": 69457936, "step": 119630 }, { "epoch": 17.818736967530533, "grad_norm": 3.229757308959961, "learning_rate": 1.7901429940411301e-06, "loss": 0.7437, "num_input_tokens_seen": 69460912, "step": 119635 }, { "epoch": 17.819481680071494, "grad_norm": 1.822810173034668, "learning_rate": 1.788935716013987e-06, "loss": 0.5939, "num_input_tokens_seen": 69463472, "step": 119640 }, { "epoch": 17.82022639261245, "grad_norm": 1.711148977279663, "learning_rate": 1.7877288301141826e-06, "loss": 0.5462, "num_input_tokens_seen": 69466384, "step": 119645 }, { "epoch": 17.82097110515341, "grad_norm": 1.042863368988037, "learning_rate": 1.7865223363621037e-06, "loss": 0.6776, "num_input_tokens_seen": 69469392, "step": 119650 }, { "epoch": 17.82171581769437, "grad_norm": 1.4906365871429443, "learning_rate": 1.7853162347781394e-06, "loss": 0.8701, "num_input_tokens_seen": 69472272, "step": 119655 }, { "epoch": 17.82246053023533, "grad_norm": 1.388090968132019, "learning_rate": 1.7841105253826596e-06, "loss": 0.6079, "num_input_tokens_seen": 69475696, "step": 119660 }, { "epoch": 17.82320524277629, "grad_norm": 1.4397144317626953, "learning_rate": 1.7829052081960423e-06, "loss": 0.5313, "num_input_tokens_seen": 69478704, "step": 119665 }, { "epoch": 17.823949955317246, "grad_norm": 2.9563546180725098, "learning_rate": 1.7817002832386436e-06, "loss": 0.7096, "num_input_tokens_seen": 69481648, "step": 119670 }, { "epoch": 17.824694667858207, "grad_norm": 1.1581989526748657, "learning_rate": 1.7804957505308224e-06, "loss": 0.4816, "num_input_tokens_seen": 69484400, "step": 119675 }, { "epoch": 17.825439380399168, "grad_norm": 1.3007707595825195, "learning_rate": 1.7792916100929258e-06, "loss": 0.5146, "num_input_tokens_seen": 69487312, "step": 119680 }, { "epoch": 17.826184092940125, "grad_norm": 2.513795852661133, "learning_rate": 1.7780878619452905e-06, "loss": 0.7293, "num_input_tokens_seen": 69490256, "step": 119685 }, { "epoch": 17.826928805481085, "grad_norm": 1.7696729898452759, "learning_rate": 1.7768845061082646e-06, "loss": 0.3667, "num_input_tokens_seen": 69493072, "step": 119690 }, { "epoch": 17.827673518022042, "grad_norm": 0.7256662249565125, "learning_rate": 1.7756815426021673e-06, "loss": 0.4108, "num_input_tokens_seen": 69495696, "step": 119695 }, { "epoch": 17.828418230563003, "grad_norm": 3.2009737491607666, "learning_rate": 1.7744789714473325e-06, "loss": 0.8102, "num_input_tokens_seen": 69498576, "step": 119700 }, { "epoch": 17.829162943103963, "grad_norm": 2.241243600845337, "learning_rate": 1.7732767926640636e-06, "loss": 0.6676, "num_input_tokens_seen": 69501648, "step": 119705 }, { "epoch": 17.82990765564492, "grad_norm": 2.2106921672821045, "learning_rate": 1.7720750062726831e-06, "loss": 0.7112, "num_input_tokens_seen": 69504688, "step": 119710 }, { "epoch": 17.83065236818588, "grad_norm": 1.9795303344726562, "learning_rate": 1.7708736122934805e-06, "loss": 0.5602, "num_input_tokens_seen": 69507536, "step": 119715 }, { "epoch": 17.83139708072684, "grad_norm": 5.162034034729004, "learning_rate": 1.7696726107467643e-06, "loss": 0.7133, "num_input_tokens_seen": 69510544, "step": 119720 }, { "epoch": 17.832141793267798, "grad_norm": 2.042034387588501, "learning_rate": 1.768472001652821e-06, "loss": 0.8072, "num_input_tokens_seen": 69513456, "step": 119725 }, { "epoch": 17.83288650580876, "grad_norm": 2.0016188621520996, "learning_rate": 1.7672717850319264e-06, "loss": 0.7199, "num_input_tokens_seen": 69516304, "step": 119730 }, { "epoch": 17.833631218349716, "grad_norm": 1.2202388048171997, "learning_rate": 1.766071960904367e-06, "loss": 0.4449, "num_input_tokens_seen": 69519088, "step": 119735 }, { "epoch": 17.834375930890676, "grad_norm": 1.5471073389053345, "learning_rate": 1.7648725292904067e-06, "loss": 0.7901, "num_input_tokens_seen": 69521936, "step": 119740 }, { "epoch": 17.835120643431637, "grad_norm": 1.7332626581192017, "learning_rate": 1.7636734902103102e-06, "loss": 0.4958, "num_input_tokens_seen": 69525104, "step": 119745 }, { "epoch": 17.835865355972594, "grad_norm": 1.126609444618225, "learning_rate": 1.7624748436843308e-06, "loss": 0.4328, "num_input_tokens_seen": 69528112, "step": 119750 }, { "epoch": 17.836610068513554, "grad_norm": 1.2330385446548462, "learning_rate": 1.7612765897327244e-06, "loss": 0.4638, "num_input_tokens_seen": 69530608, "step": 119755 }, { "epoch": 17.837354781054515, "grad_norm": 1.6732573509216309, "learning_rate": 1.7600787283757303e-06, "loss": 0.6143, "num_input_tokens_seen": 69533680, "step": 119760 }, { "epoch": 17.83809949359547, "grad_norm": 0.8774968981742859, "learning_rate": 1.7588812596335824e-06, "loss": 0.5946, "num_input_tokens_seen": 69536400, "step": 119765 }, { "epoch": 17.838844206136432, "grad_norm": 1.038274884223938, "learning_rate": 1.7576841835265202e-06, "loss": 0.533, "num_input_tokens_seen": 69539216, "step": 119770 }, { "epoch": 17.83958891867739, "grad_norm": 1.203836441040039, "learning_rate": 1.756487500074755e-06, "loss": 0.4804, "num_input_tokens_seen": 69542096, "step": 119775 }, { "epoch": 17.84033363121835, "grad_norm": 1.3256222009658813, "learning_rate": 1.7552912092985153e-06, "loss": 0.5886, "num_input_tokens_seen": 69544912, "step": 119780 }, { "epoch": 17.84107834375931, "grad_norm": 1.6429975032806396, "learning_rate": 1.7540953112180014e-06, "loss": 0.5552, "num_input_tokens_seen": 69547760, "step": 119785 }, { "epoch": 17.841823056300267, "grad_norm": 1.7997735738754272, "learning_rate": 1.752899805853425e-06, "loss": 0.645, "num_input_tokens_seen": 69550640, "step": 119790 }, { "epoch": 17.842567768841228, "grad_norm": 2.4528982639312744, "learning_rate": 1.7517046932249758e-06, "loss": 0.675, "num_input_tokens_seen": 69553520, "step": 119795 }, { "epoch": 17.843312481382185, "grad_norm": 1.5853866338729858, "learning_rate": 1.7505099733528514e-06, "loss": 0.5843, "num_input_tokens_seen": 69556560, "step": 119800 }, { "epoch": 17.844057193923145, "grad_norm": 2.468933343887329, "learning_rate": 1.7493156462572296e-06, "loss": 0.6135, "num_input_tokens_seen": 69559472, "step": 119805 }, { "epoch": 17.844801906464106, "grad_norm": 3.500528335571289, "learning_rate": 1.7481217119582921e-06, "loss": 0.6987, "num_input_tokens_seen": 69562416, "step": 119810 }, { "epoch": 17.845546619005063, "grad_norm": 1.213912010192871, "learning_rate": 1.746928170476203e-06, "loss": 0.6521, "num_input_tokens_seen": 69565328, "step": 119815 }, { "epoch": 17.846291331546023, "grad_norm": 1.2198541164398193, "learning_rate": 1.7457350218311269e-06, "loss": 0.4185, "num_input_tokens_seen": 69568208, "step": 119820 }, { "epoch": 17.847036044086984, "grad_norm": 1.900900959968567, "learning_rate": 1.7445422660432254e-06, "loss": 0.5422, "num_input_tokens_seen": 69571344, "step": 119825 }, { "epoch": 17.84778075662794, "grad_norm": 1.909189224243164, "learning_rate": 1.7433499031326434e-06, "loss": 0.3367, "num_input_tokens_seen": 69574288, "step": 119830 }, { "epoch": 17.8485254691689, "grad_norm": 1.4193309545516968, "learning_rate": 1.7421579331195314e-06, "loss": 0.5463, "num_input_tokens_seen": 69577296, "step": 119835 }, { "epoch": 17.84927018170986, "grad_norm": 1.2500739097595215, "learning_rate": 1.7409663560240209e-06, "loss": 0.7091, "num_input_tokens_seen": 69580144, "step": 119840 }, { "epoch": 17.85001489425082, "grad_norm": 1.9401805400848389, "learning_rate": 1.7397751718662452e-06, "loss": 0.6139, "num_input_tokens_seen": 69582992, "step": 119845 }, { "epoch": 17.85075960679178, "grad_norm": 2.2111988067626953, "learning_rate": 1.7385843806663304e-06, "loss": 0.75, "num_input_tokens_seen": 69586032, "step": 119850 }, { "epoch": 17.851504319332737, "grad_norm": 1.3618665933609009, "learning_rate": 1.7373939824443853e-06, "loss": 0.6689, "num_input_tokens_seen": 69589040, "step": 119855 }, { "epoch": 17.852249031873697, "grad_norm": 1.8570259809494019, "learning_rate": 1.7362039772205296e-06, "loss": 0.5731, "num_input_tokens_seen": 69591920, "step": 119860 }, { "epoch": 17.852993744414658, "grad_norm": 1.799426794052124, "learning_rate": 1.7350143650148587e-06, "loss": 0.6691, "num_input_tokens_seen": 69594640, "step": 119865 }, { "epoch": 17.853738456955615, "grad_norm": 1.9383153915405273, "learning_rate": 1.7338251458474786e-06, "loss": 0.4148, "num_input_tokens_seen": 69597616, "step": 119870 }, { "epoch": 17.854483169496575, "grad_norm": 1.882689356803894, "learning_rate": 1.7326363197384788e-06, "loss": 0.7524, "num_input_tokens_seen": 69600368, "step": 119875 }, { "epoch": 17.855227882037532, "grad_norm": 2.2334587574005127, "learning_rate": 1.7314478867079376e-06, "loss": 0.6259, "num_input_tokens_seen": 69603312, "step": 119880 }, { "epoch": 17.855972594578493, "grad_norm": 2.8064918518066406, "learning_rate": 1.7302598467759362e-06, "loss": 0.6133, "num_input_tokens_seen": 69606064, "step": 119885 }, { "epoch": 17.856717307119453, "grad_norm": 1.9048376083374023, "learning_rate": 1.729072199962542e-06, "loss": 0.6677, "num_input_tokens_seen": 69608976, "step": 119890 }, { "epoch": 17.85746201966041, "grad_norm": 1.5062519311904907, "learning_rate": 1.7278849462878223e-06, "loss": 0.448, "num_input_tokens_seen": 69611984, "step": 119895 }, { "epoch": 17.85820673220137, "grad_norm": 1.6436550617218018, "learning_rate": 1.7266980857718328e-06, "loss": 0.559, "num_input_tokens_seen": 69614704, "step": 119900 }, { "epoch": 17.85895144474233, "grad_norm": 0.9835546612739563, "learning_rate": 1.7255116184346277e-06, "loss": 0.6622, "num_input_tokens_seen": 69617488, "step": 119905 }, { "epoch": 17.859696157283288, "grad_norm": 1.3892022371292114, "learning_rate": 1.724325544296243e-06, "loss": 0.6929, "num_input_tokens_seen": 69620368, "step": 119910 }, { "epoch": 17.86044086982425, "grad_norm": 2.229362964630127, "learning_rate": 1.7231398633767272e-06, "loss": 0.4396, "num_input_tokens_seen": 69623472, "step": 119915 }, { "epoch": 17.861185582365206, "grad_norm": 1.6862084865570068, "learning_rate": 1.7219545756961025e-06, "loss": 0.5639, "num_input_tokens_seen": 69626192, "step": 119920 }, { "epoch": 17.861930294906166, "grad_norm": 2.0084383487701416, "learning_rate": 1.7207696812744007e-06, "loss": 0.4184, "num_input_tokens_seen": 69629200, "step": 119925 }, { "epoch": 17.862675007447127, "grad_norm": 2.1420905590057373, "learning_rate": 1.719585180131636e-06, "loss": 0.7768, "num_input_tokens_seen": 69632112, "step": 119930 }, { "epoch": 17.863419719988084, "grad_norm": 0.8652101159095764, "learning_rate": 1.7184010722878146e-06, "loss": 0.3512, "num_input_tokens_seen": 69635088, "step": 119935 }, { "epoch": 17.864164432529044, "grad_norm": 1.268246054649353, "learning_rate": 1.7172173577629459e-06, "loss": 0.5483, "num_input_tokens_seen": 69638160, "step": 119940 }, { "epoch": 17.86490914507, "grad_norm": 3.384824752807617, "learning_rate": 1.7160340365770272e-06, "loss": 0.6225, "num_input_tokens_seen": 69640752, "step": 119945 }, { "epoch": 17.865653857610962, "grad_norm": 1.7259868383407593, "learning_rate": 1.7148511087500485e-06, "loss": 0.7383, "num_input_tokens_seen": 69643824, "step": 119950 }, { "epoch": 17.866398570151922, "grad_norm": 1.3809716701507568, "learning_rate": 1.7136685743019909e-06, "loss": 0.7765, "num_input_tokens_seen": 69646768, "step": 119955 }, { "epoch": 17.86714328269288, "grad_norm": 4.439483642578125, "learning_rate": 1.7124864332528412e-06, "loss": 0.4834, "num_input_tokens_seen": 69649552, "step": 119960 }, { "epoch": 17.86788799523384, "grad_norm": 2.278846025466919, "learning_rate": 1.7113046856225611e-06, "loss": 0.4404, "num_input_tokens_seen": 69652400, "step": 119965 }, { "epoch": 17.8686327077748, "grad_norm": 0.9729037284851074, "learning_rate": 1.7101233314311181e-06, "loss": 0.645, "num_input_tokens_seen": 69655120, "step": 119970 }, { "epoch": 17.869377420315757, "grad_norm": 1.1311501264572144, "learning_rate": 1.7089423706984742e-06, "loss": 0.515, "num_input_tokens_seen": 69658480, "step": 119975 }, { "epoch": 17.870122132856718, "grad_norm": 1.905359148979187, "learning_rate": 1.7077618034445714e-06, "loss": 0.7505, "num_input_tokens_seen": 69661552, "step": 119980 }, { "epoch": 17.870866845397675, "grad_norm": 1.1456587314605713, "learning_rate": 1.706581629689366e-06, "loss": 0.5449, "num_input_tokens_seen": 69664368, "step": 119985 }, { "epoch": 17.871611557938635, "grad_norm": 2.318033456802368, "learning_rate": 1.705401849452784e-06, "loss": 0.6847, "num_input_tokens_seen": 69667248, "step": 119990 }, { "epoch": 17.872356270479596, "grad_norm": 2.4698026180267334, "learning_rate": 1.7042224627547676e-06, "loss": 0.5914, "num_input_tokens_seen": 69670160, "step": 119995 }, { "epoch": 17.873100983020553, "grad_norm": 1.4501999616622925, "learning_rate": 1.7030434696152342e-06, "loss": 0.6513, "num_input_tokens_seen": 69673008, "step": 120000 }, { "epoch": 17.873845695561513, "grad_norm": 1.3369758129119873, "learning_rate": 1.701864870054104e-06, "loss": 0.7552, "num_input_tokens_seen": 69675888, "step": 120005 }, { "epoch": 17.874590408102474, "grad_norm": 4.218823432922363, "learning_rate": 1.700686664091286e-06, "loss": 0.8112, "num_input_tokens_seen": 69678608, "step": 120010 }, { "epoch": 17.87533512064343, "grad_norm": 2.2112886905670166, "learning_rate": 1.6995088517466867e-06, "loss": 0.6826, "num_input_tokens_seen": 69681648, "step": 120015 }, { "epoch": 17.87607983318439, "grad_norm": 2.0981035232543945, "learning_rate": 1.6983314330402039e-06, "loss": 0.7086, "num_input_tokens_seen": 69684560, "step": 120020 }, { "epoch": 17.87682454572535, "grad_norm": 0.8914119601249695, "learning_rate": 1.6971544079917273e-06, "loss": 0.5408, "num_input_tokens_seen": 69687696, "step": 120025 }, { "epoch": 17.87756925826631, "grad_norm": 0.9276633262634277, "learning_rate": 1.6959777766211437e-06, "loss": 0.5164, "num_input_tokens_seen": 69690448, "step": 120030 }, { "epoch": 17.87831397080727, "grad_norm": 1.6868942975997925, "learning_rate": 1.6948015389483291e-06, "loss": 0.5915, "num_input_tokens_seen": 69693424, "step": 120035 }, { "epoch": 17.879058683348227, "grad_norm": 1.5881104469299316, "learning_rate": 1.6936256949931618e-06, "loss": 0.7294, "num_input_tokens_seen": 69696336, "step": 120040 }, { "epoch": 17.879803395889187, "grad_norm": 1.3513524532318115, "learning_rate": 1.692450244775498e-06, "loss": 0.5139, "num_input_tokens_seen": 69699376, "step": 120045 }, { "epoch": 17.880548108430148, "grad_norm": 1.9639102220535278, "learning_rate": 1.6912751883151945e-06, "loss": 1.0094, "num_input_tokens_seen": 69702128, "step": 120050 }, { "epoch": 17.881292820971105, "grad_norm": 1.9189746379852295, "learning_rate": 1.6901005256321128e-06, "loss": 0.4292, "num_input_tokens_seen": 69704880, "step": 120055 }, { "epoch": 17.882037533512065, "grad_norm": 1.22174072265625, "learning_rate": 1.6889262567460846e-06, "loss": 0.6632, "num_input_tokens_seen": 69707760, "step": 120060 }, { "epoch": 17.882782246053022, "grad_norm": 1.2680708169937134, "learning_rate": 1.6877523816769603e-06, "loss": 0.4308, "num_input_tokens_seen": 69710672, "step": 120065 }, { "epoch": 17.883526958593983, "grad_norm": 1.1342830657958984, "learning_rate": 1.6865789004445686e-06, "loss": 0.6283, "num_input_tokens_seen": 69713712, "step": 120070 }, { "epoch": 17.884271671134943, "grad_norm": 0.9496592879295349, "learning_rate": 1.6854058130687272e-06, "loss": 0.5712, "num_input_tokens_seen": 69716560, "step": 120075 }, { "epoch": 17.8850163836759, "grad_norm": 1.012721300125122, "learning_rate": 1.684233119569259e-06, "loss": 0.5865, "num_input_tokens_seen": 69719408, "step": 120080 }, { "epoch": 17.88576109621686, "grad_norm": 2.0078470706939697, "learning_rate": 1.683060819965976e-06, "loss": 0.6709, "num_input_tokens_seen": 69722096, "step": 120085 }, { "epoch": 17.886505808757818, "grad_norm": 1.4811558723449707, "learning_rate": 1.6818889142786842e-06, "loss": 0.725, "num_input_tokens_seen": 69725200, "step": 120090 }, { "epoch": 17.88725052129878, "grad_norm": 3.348172903060913, "learning_rate": 1.6807174025271737e-06, "loss": 0.6418, "num_input_tokens_seen": 69728368, "step": 120095 }, { "epoch": 17.88799523383974, "grad_norm": 1.6201293468475342, "learning_rate": 1.6795462847312481e-06, "loss": 0.5842, "num_input_tokens_seen": 69731280, "step": 120100 }, { "epoch": 17.888739946380696, "grad_norm": 1.1151807308197021, "learning_rate": 1.6783755609106804e-06, "loss": 0.4286, "num_input_tokens_seen": 69734352, "step": 120105 }, { "epoch": 17.889484658921656, "grad_norm": 1.8318212032318115, "learning_rate": 1.6772052310852605e-06, "loss": 0.6589, "num_input_tokens_seen": 69737200, "step": 120110 }, { "epoch": 17.890229371462617, "grad_norm": 1.5416646003723145, "learning_rate": 1.6760352952747472e-06, "loss": 0.6847, "num_input_tokens_seen": 69740112, "step": 120115 }, { "epoch": 17.890974084003574, "grad_norm": 1.4876424074172974, "learning_rate": 1.6748657534989194e-06, "loss": 0.4783, "num_input_tokens_seen": 69743024, "step": 120120 }, { "epoch": 17.891718796544534, "grad_norm": 1.9235177040100098, "learning_rate": 1.673696605777525e-06, "loss": 0.5324, "num_input_tokens_seen": 69745840, "step": 120125 }, { "epoch": 17.89246350908549, "grad_norm": 1.9783415794372559, "learning_rate": 1.6725278521303178e-06, "loss": 0.6573, "num_input_tokens_seen": 69748752, "step": 120130 }, { "epoch": 17.893208221626452, "grad_norm": 1.4950459003448486, "learning_rate": 1.6713594925770459e-06, "loss": 0.3907, "num_input_tokens_seen": 69751760, "step": 120135 }, { "epoch": 17.893952934167412, "grad_norm": 1.8080958127975464, "learning_rate": 1.6701915271374436e-06, "loss": 0.4394, "num_input_tokens_seen": 69754672, "step": 120140 }, { "epoch": 17.89469764670837, "grad_norm": 2.4587574005126953, "learning_rate": 1.6690239558312476e-06, "loss": 0.6252, "num_input_tokens_seen": 69757584, "step": 120145 }, { "epoch": 17.89544235924933, "grad_norm": 0.9258333444595337, "learning_rate": 1.667856778678173e-06, "loss": 0.4681, "num_input_tokens_seen": 69760528, "step": 120150 }, { "epoch": 17.89618707179029, "grad_norm": 2.0088114738464355, "learning_rate": 1.6666899956979483e-06, "loss": 0.6065, "num_input_tokens_seen": 69763216, "step": 120155 }, { "epoch": 17.896931784331247, "grad_norm": 1.9427884817123413, "learning_rate": 1.665523606910277e-06, "loss": 0.4783, "num_input_tokens_seen": 69766160, "step": 120160 }, { "epoch": 17.897676496872208, "grad_norm": 1.8719241619110107, "learning_rate": 1.6643576123348741e-06, "loss": 0.5605, "num_input_tokens_seen": 69768944, "step": 120165 }, { "epoch": 17.898421209413165, "grad_norm": 1.0900144577026367, "learning_rate": 1.6631920119914296e-06, "loss": 0.5976, "num_input_tokens_seen": 69772144, "step": 120170 }, { "epoch": 17.899165921954125, "grad_norm": 1.9653546810150146, "learning_rate": 1.6620268058996357e-06, "loss": 0.4784, "num_input_tokens_seen": 69774992, "step": 120175 }, { "epoch": 17.899910634495086, "grad_norm": 1.1714625358581543, "learning_rate": 1.6608619940791826e-06, "loss": 0.445, "num_input_tokens_seen": 69777936, "step": 120180 }, { "epoch": 17.900655347036043, "grad_norm": 3.2341229915618896, "learning_rate": 1.6596975765497403e-06, "loss": 0.7643, "num_input_tokens_seen": 69780688, "step": 120185 }, { "epoch": 17.901400059577004, "grad_norm": 1.382200002670288, "learning_rate": 1.6585335533309903e-06, "loss": 0.7009, "num_input_tokens_seen": 69783536, "step": 120190 }, { "epoch": 17.902144772117964, "grad_norm": 1.403498888015747, "learning_rate": 1.6573699244425895e-06, "loss": 0.4893, "num_input_tokens_seen": 69786512, "step": 120195 }, { "epoch": 17.90288948465892, "grad_norm": 1.1477487087249756, "learning_rate": 1.6562066899042023e-06, "loss": 0.806, "num_input_tokens_seen": 69789360, "step": 120200 }, { "epoch": 17.90363419719988, "grad_norm": 2.822646141052246, "learning_rate": 1.655043849735477e-06, "loss": 0.7007, "num_input_tokens_seen": 69792304, "step": 120205 }, { "epoch": 17.90437890974084, "grad_norm": 1.0619499683380127, "learning_rate": 1.653881403956062e-06, "loss": 0.6666, "num_input_tokens_seen": 69795184, "step": 120210 }, { "epoch": 17.9051236222818, "grad_norm": 1.238633394241333, "learning_rate": 1.6527193525855911e-06, "loss": 0.6479, "num_input_tokens_seen": 69798000, "step": 120215 }, { "epoch": 17.90586833482276, "grad_norm": 0.5976313948631287, "learning_rate": 1.6515576956436906e-06, "loss": 0.6091, "num_input_tokens_seen": 69801072, "step": 120220 }, { "epoch": 17.906613047363717, "grad_norm": 2.540611743927002, "learning_rate": 1.6503964331500004e-06, "loss": 0.595, "num_input_tokens_seen": 69803568, "step": 120225 }, { "epoch": 17.907357759904677, "grad_norm": 2.089219093322754, "learning_rate": 1.649235565124127e-06, "loss": 0.6116, "num_input_tokens_seen": 69806544, "step": 120230 }, { "epoch": 17.908102472445638, "grad_norm": 2.0605931282043457, "learning_rate": 1.648075091585688e-06, "loss": 0.5682, "num_input_tokens_seen": 69809616, "step": 120235 }, { "epoch": 17.908847184986595, "grad_norm": 1.769476294517517, "learning_rate": 1.6469150125542843e-06, "loss": 0.6161, "num_input_tokens_seen": 69812432, "step": 120240 }, { "epoch": 17.909591897527555, "grad_norm": 2.1844348907470703, "learning_rate": 1.6457553280495168e-06, "loss": 0.6351, "num_input_tokens_seen": 69815344, "step": 120245 }, { "epoch": 17.910336610068512, "grad_norm": 1.6076980829238892, "learning_rate": 1.6445960380909814e-06, "loss": 0.6544, "num_input_tokens_seen": 69818224, "step": 120250 }, { "epoch": 17.911081322609473, "grad_norm": 1.8937610387802124, "learning_rate": 1.6434371426982508e-06, "loss": 0.573, "num_input_tokens_seen": 69821104, "step": 120255 }, { "epoch": 17.911826035150433, "grad_norm": 3.0426409244537354, "learning_rate": 1.642278641890918e-06, "loss": 0.6184, "num_input_tokens_seen": 69823792, "step": 120260 }, { "epoch": 17.91257074769139, "grad_norm": 2.185046434402466, "learning_rate": 1.641120535688548e-06, "loss": 0.7074, "num_input_tokens_seen": 69826384, "step": 120265 }, { "epoch": 17.91331546023235, "grad_norm": 1.6850826740264893, "learning_rate": 1.6399628241106996e-06, "loss": 0.6477, "num_input_tokens_seen": 69829200, "step": 120270 }, { "epoch": 17.91406017277331, "grad_norm": 1.4923349618911743, "learning_rate": 1.638805507176941e-06, "loss": 0.7095, "num_input_tokens_seen": 69832048, "step": 120275 }, { "epoch": 17.91480488531427, "grad_norm": 1.0868300199508667, "learning_rate": 1.637648584906823e-06, "loss": 0.4285, "num_input_tokens_seen": 69834800, "step": 120280 }, { "epoch": 17.91554959785523, "grad_norm": 1.9752914905548096, "learning_rate": 1.6364920573198856e-06, "loss": 0.6036, "num_input_tokens_seen": 69837488, "step": 120285 }, { "epoch": 17.916294310396186, "grad_norm": 2.8654749393463135, "learning_rate": 1.6353359244356658e-06, "loss": 0.6976, "num_input_tokens_seen": 69840368, "step": 120290 }, { "epoch": 17.917039022937146, "grad_norm": 1.5341593027114868, "learning_rate": 1.634180186273701e-06, "loss": 0.4909, "num_input_tokens_seen": 69843280, "step": 120295 }, { "epoch": 17.917783735478107, "grad_norm": 2.4673616886138916, "learning_rate": 1.6330248428535117e-06, "loss": 0.6807, "num_input_tokens_seen": 69846224, "step": 120300 }, { "epoch": 17.918528448019064, "grad_norm": 1.6338850259780884, "learning_rate": 1.6318698941946237e-06, "loss": 0.7044, "num_input_tokens_seen": 69849008, "step": 120305 }, { "epoch": 17.919273160560024, "grad_norm": 1.1477848291397095, "learning_rate": 1.6307153403165382e-06, "loss": 0.6752, "num_input_tokens_seen": 69851792, "step": 120310 }, { "epoch": 17.92001787310098, "grad_norm": 1.3740955591201782, "learning_rate": 1.6295611812387673e-06, "loss": 0.6999, "num_input_tokens_seen": 69854672, "step": 120315 }, { "epoch": 17.920762585641942, "grad_norm": 1.2702242136001587, "learning_rate": 1.6284074169808067e-06, "loss": 0.6305, "num_input_tokens_seen": 69857744, "step": 120320 }, { "epoch": 17.921507298182902, "grad_norm": 1.5531657934188843, "learning_rate": 1.6272540475621518e-06, "loss": 0.5833, "num_input_tokens_seen": 69861008, "step": 120325 }, { "epoch": 17.92225201072386, "grad_norm": 1.5389419794082642, "learning_rate": 1.6261010730022842e-06, "loss": 0.5967, "num_input_tokens_seen": 69863728, "step": 120330 }, { "epoch": 17.92299672326482, "grad_norm": 1.615427017211914, "learning_rate": 1.6249484933206853e-06, "loss": 0.6094, "num_input_tokens_seen": 69866608, "step": 120335 }, { "epoch": 17.92374143580578, "grad_norm": 0.9342974424362183, "learning_rate": 1.6237963085368236e-06, "loss": 0.4196, "num_input_tokens_seen": 69869008, "step": 120340 }, { "epoch": 17.924486148346737, "grad_norm": 0.9330852031707764, "learning_rate": 1.6226445186701577e-06, "loss": 0.6062, "num_input_tokens_seen": 69871760, "step": 120345 }, { "epoch": 17.925230860887698, "grad_norm": 2.113231658935547, "learning_rate": 1.6214931237401588e-06, "loss": 0.6483, "num_input_tokens_seen": 69874608, "step": 120350 }, { "epoch": 17.925975573428655, "grad_norm": 1.9049327373504639, "learning_rate": 1.6203421237662692e-06, "loss": 0.5804, "num_input_tokens_seen": 69877136, "step": 120355 }, { "epoch": 17.926720285969616, "grad_norm": 1.7968206405639648, "learning_rate": 1.619191518767943e-06, "loss": 0.5913, "num_input_tokens_seen": 69879952, "step": 120360 }, { "epoch": 17.927464998510576, "grad_norm": 1.4072219133377075, "learning_rate": 1.618041308764609e-06, "loss": 0.7905, "num_input_tokens_seen": 69882672, "step": 120365 }, { "epoch": 17.928209711051533, "grad_norm": 2.3029417991638184, "learning_rate": 1.6168914937757019e-06, "loss": 0.6602, "num_input_tokens_seen": 69885520, "step": 120370 }, { "epoch": 17.928954423592494, "grad_norm": 1.964385986328125, "learning_rate": 1.6157420738206503e-06, "loss": 0.5106, "num_input_tokens_seen": 69888560, "step": 120375 }, { "epoch": 17.929699136133454, "grad_norm": 1.4728810787200928, "learning_rate": 1.6145930489188666e-06, "loss": 0.6645, "num_input_tokens_seen": 69891472, "step": 120380 }, { "epoch": 17.93044384867441, "grad_norm": 1.3197133541107178, "learning_rate": 1.6134444190897685e-06, "loss": 0.6624, "num_input_tokens_seen": 69894672, "step": 120385 }, { "epoch": 17.93118856121537, "grad_norm": 0.9652179479598999, "learning_rate": 1.6122961843527546e-06, "loss": 0.6548, "num_input_tokens_seen": 69897456, "step": 120390 }, { "epoch": 17.93193327375633, "grad_norm": 1.821048617362976, "learning_rate": 1.6111483447272286e-06, "loss": 0.8134, "num_input_tokens_seen": 69900720, "step": 120395 }, { "epoch": 17.93267798629729, "grad_norm": 1.3693978786468506, "learning_rate": 1.6100009002325806e-06, "loss": 0.4782, "num_input_tokens_seen": 69903536, "step": 120400 }, { "epoch": 17.93342269883825, "grad_norm": 2.0912539958953857, "learning_rate": 1.608853850888195e-06, "loss": 0.8131, "num_input_tokens_seen": 69906288, "step": 120405 }, { "epoch": 17.934167411379207, "grad_norm": 3.4995837211608887, "learning_rate": 1.6077071967134511e-06, "loss": 0.6715, "num_input_tokens_seen": 69909136, "step": 120410 }, { "epoch": 17.934912123920167, "grad_norm": 2.3192245960235596, "learning_rate": 1.6065609377277136e-06, "loss": 0.5769, "num_input_tokens_seen": 69911888, "step": 120415 }, { "epoch": 17.935656836461128, "grad_norm": 1.5595369338989258, "learning_rate": 1.6054150739503587e-06, "loss": 0.647, "num_input_tokens_seen": 69914864, "step": 120420 }, { "epoch": 17.936401549002085, "grad_norm": 1.060221791267395, "learning_rate": 1.604269605400735e-06, "loss": 0.5471, "num_input_tokens_seen": 69918064, "step": 120425 }, { "epoch": 17.937146261543045, "grad_norm": 1.3810455799102783, "learning_rate": 1.6031245320982018e-06, "loss": 0.5287, "num_input_tokens_seen": 69920752, "step": 120430 }, { "epoch": 17.937890974084002, "grad_norm": 1.4484237432479858, "learning_rate": 1.601979854062094e-06, "loss": 0.524, "num_input_tokens_seen": 69923760, "step": 120435 }, { "epoch": 17.938635686624963, "grad_norm": 1.2972766160964966, "learning_rate": 1.6008355713117623e-06, "loss": 0.6431, "num_input_tokens_seen": 69926640, "step": 120440 }, { "epoch": 17.939380399165923, "grad_norm": 2.128438949584961, "learning_rate": 1.599691683866525e-06, "loss": 0.5926, "num_input_tokens_seen": 69929488, "step": 120445 }, { "epoch": 17.94012511170688, "grad_norm": 2.1702704429626465, "learning_rate": 1.5985481917457217e-06, "loss": 0.6491, "num_input_tokens_seen": 69932528, "step": 120450 }, { "epoch": 17.94086982424784, "grad_norm": 2.7959516048431396, "learning_rate": 1.5974050949686597e-06, "loss": 0.6206, "num_input_tokens_seen": 69935184, "step": 120455 }, { "epoch": 17.941614536788798, "grad_norm": 2.2505698204040527, "learning_rate": 1.5962623935546483e-06, "loss": 0.4993, "num_input_tokens_seen": 69938160, "step": 120460 }, { "epoch": 17.94235924932976, "grad_norm": 1.320597529411316, "learning_rate": 1.5951200875230055e-06, "loss": 0.4568, "num_input_tokens_seen": 69940848, "step": 120465 }, { "epoch": 17.94310396187072, "grad_norm": 1.3304954767227173, "learning_rate": 1.5939781768930185e-06, "loss": 0.5824, "num_input_tokens_seen": 69943696, "step": 120470 }, { "epoch": 17.943848674411676, "grad_norm": 1.6156399250030518, "learning_rate": 1.59283666168398e-06, "loss": 0.5217, "num_input_tokens_seen": 69946544, "step": 120475 }, { "epoch": 17.944593386952636, "grad_norm": 1.0573139190673828, "learning_rate": 1.5916955419151725e-06, "loss": 0.5142, "num_input_tokens_seen": 69949712, "step": 120480 }, { "epoch": 17.945338099493597, "grad_norm": 3.9440276622772217, "learning_rate": 1.5905548176058826e-06, "loss": 0.7118, "num_input_tokens_seen": 69952560, "step": 120485 }, { "epoch": 17.946082812034554, "grad_norm": 1.147379755973816, "learning_rate": 1.5894144887753786e-06, "loss": 0.5651, "num_input_tokens_seen": 69955472, "step": 120490 }, { "epoch": 17.946827524575514, "grad_norm": 2.451530694961548, "learning_rate": 1.5882745554429174e-06, "loss": 0.6862, "num_input_tokens_seen": 69958480, "step": 120495 }, { "epoch": 17.94757223711647, "grad_norm": 2.8038253784179688, "learning_rate": 1.5871350176277667e-06, "loss": 0.5283, "num_input_tokens_seen": 69961552, "step": 120500 }, { "epoch": 17.948316949657432, "grad_norm": 1.2983671426773071, "learning_rate": 1.585995875349172e-06, "loss": 0.6137, "num_input_tokens_seen": 69964496, "step": 120505 }, { "epoch": 17.949061662198392, "grad_norm": 2.1184515953063965, "learning_rate": 1.5848571286263825e-06, "loss": 0.6569, "num_input_tokens_seen": 69967248, "step": 120510 }, { "epoch": 17.94980637473935, "grad_norm": 2.110527753829956, "learning_rate": 1.5837187774786293e-06, "loss": 0.5357, "num_input_tokens_seen": 69970160, "step": 120515 }, { "epoch": 17.95055108728031, "grad_norm": 1.622192621231079, "learning_rate": 1.5825808219251532e-06, "loss": 0.5739, "num_input_tokens_seen": 69972976, "step": 120520 }, { "epoch": 17.95129579982127, "grad_norm": 1.4658329486846924, "learning_rate": 1.5814432619851687e-06, "loss": 0.7464, "num_input_tokens_seen": 69975920, "step": 120525 }, { "epoch": 17.952040512362228, "grad_norm": 1.3921194076538086, "learning_rate": 1.5803060976779026e-06, "loss": 0.5232, "num_input_tokens_seen": 69978832, "step": 120530 }, { "epoch": 17.952785224903188, "grad_norm": 1.3493363857269287, "learning_rate": 1.5791693290225646e-06, "loss": 0.4808, "num_input_tokens_seen": 69981872, "step": 120535 }, { "epoch": 17.953529937444145, "grad_norm": 0.4773896336555481, "learning_rate": 1.5780329560383527e-06, "loss": 0.4162, "num_input_tokens_seen": 69984656, "step": 120540 }, { "epoch": 17.954274649985106, "grad_norm": 4.378621578216553, "learning_rate": 1.5768969787444716e-06, "loss": 0.8487, "num_input_tokens_seen": 69987568, "step": 120545 }, { "epoch": 17.955019362526066, "grad_norm": 0.8931766748428345, "learning_rate": 1.5757613971601054e-06, "loss": 0.4951, "num_input_tokens_seen": 69990352, "step": 120550 }, { "epoch": 17.955764075067023, "grad_norm": 1.7135536670684814, "learning_rate": 1.5746262113044474e-06, "loss": 0.584, "num_input_tokens_seen": 69993136, "step": 120555 }, { "epoch": 17.956508787607984, "grad_norm": 3.056293249130249, "learning_rate": 1.5734914211966683e-06, "loss": 0.7362, "num_input_tokens_seen": 69995984, "step": 120560 }, { "epoch": 17.957253500148944, "grad_norm": 1.4036222696304321, "learning_rate": 1.5723570268559445e-06, "loss": 0.5767, "num_input_tokens_seen": 69998736, "step": 120565 }, { "epoch": 17.9579982126899, "grad_norm": 3.191467761993408, "learning_rate": 1.5712230283014385e-06, "loss": 0.6786, "num_input_tokens_seen": 70001680, "step": 120570 }, { "epoch": 17.95874292523086, "grad_norm": 2.247750759124756, "learning_rate": 1.570089425552304e-06, "loss": 0.4527, "num_input_tokens_seen": 70004400, "step": 120575 }, { "epoch": 17.95948763777182, "grad_norm": 1.5666496753692627, "learning_rate": 1.5689562186276986e-06, "loss": 0.5415, "num_input_tokens_seen": 70007216, "step": 120580 }, { "epoch": 17.96023235031278, "grad_norm": 2.412863254547119, "learning_rate": 1.567823407546759e-06, "loss": 0.7147, "num_input_tokens_seen": 70010384, "step": 120585 }, { "epoch": 17.96097706285374, "grad_norm": 1.8709533214569092, "learning_rate": 1.5666909923286315e-06, "loss": 0.5293, "num_input_tokens_seen": 70013232, "step": 120590 }, { "epoch": 17.961721775394697, "grad_norm": 1.808294653892517, "learning_rate": 1.5655589729924453e-06, "loss": 0.5616, "num_input_tokens_seen": 70016080, "step": 120595 }, { "epoch": 17.962466487935657, "grad_norm": 2.145289659500122, "learning_rate": 1.564427349557318e-06, "loss": 0.8314, "num_input_tokens_seen": 70018928, "step": 120600 }, { "epoch": 17.963211200476614, "grad_norm": 0.9057837128639221, "learning_rate": 1.5632961220423737e-06, "loss": 0.6205, "num_input_tokens_seen": 70022288, "step": 120605 }, { "epoch": 17.963955913017575, "grad_norm": 2.1599624156951904, "learning_rate": 1.5621652904667244e-06, "loss": 0.6241, "num_input_tokens_seen": 70025040, "step": 120610 }, { "epoch": 17.964700625558535, "grad_norm": 1.6919244527816772, "learning_rate": 1.5610348548494692e-06, "loss": 0.6329, "num_input_tokens_seen": 70028208, "step": 120615 }, { "epoch": 17.965445338099492, "grad_norm": 1.2568342685699463, "learning_rate": 1.5599048152097034e-06, "loss": 0.4809, "num_input_tokens_seen": 70031088, "step": 120620 }, { "epoch": 17.966190050640453, "grad_norm": 0.6967865824699402, "learning_rate": 1.558775171566529e-06, "loss": 0.3653, "num_input_tokens_seen": 70034096, "step": 120625 }, { "epoch": 17.966934763181413, "grad_norm": 1.3692392110824585, "learning_rate": 1.5576459239390162e-06, "loss": 0.4391, "num_input_tokens_seen": 70036976, "step": 120630 }, { "epoch": 17.96767947572237, "grad_norm": 1.552135705947876, "learning_rate": 1.5565170723462558e-06, "loss": 0.6373, "num_input_tokens_seen": 70039856, "step": 120635 }, { "epoch": 17.96842418826333, "grad_norm": 1.1937661170959473, "learning_rate": 1.5553886168073073e-06, "loss": 0.6582, "num_input_tokens_seen": 70042736, "step": 120640 }, { "epoch": 17.969168900804288, "grad_norm": 2.6776411533355713, "learning_rate": 1.5542605573412444e-06, "loss": 0.7377, "num_input_tokens_seen": 70045936, "step": 120645 }, { "epoch": 17.96991361334525, "grad_norm": 3.078413724899292, "learning_rate": 1.5531328939671215e-06, "loss": 0.6377, "num_input_tokens_seen": 70049296, "step": 120650 }, { "epoch": 17.97065832588621, "grad_norm": 1.7292883396148682, "learning_rate": 1.552005626703984e-06, "loss": 0.4206, "num_input_tokens_seen": 70052304, "step": 120655 }, { "epoch": 17.971403038427166, "grad_norm": 2.119978189468384, "learning_rate": 1.550878755570881e-06, "loss": 0.7204, "num_input_tokens_seen": 70055216, "step": 120660 }, { "epoch": 17.972147750968126, "grad_norm": 1.4683812856674194, "learning_rate": 1.5497522805868525e-06, "loss": 0.6248, "num_input_tokens_seen": 70058384, "step": 120665 }, { "epoch": 17.972892463509087, "grad_norm": 0.9621517658233643, "learning_rate": 1.5486262017709225e-06, "loss": 0.621, "num_input_tokens_seen": 70061200, "step": 120670 }, { "epoch": 17.973637176050044, "grad_norm": 1.3695969581604004, "learning_rate": 1.5475005191421172e-06, "loss": 0.697, "num_input_tokens_seen": 70064080, "step": 120675 }, { "epoch": 17.974381888591004, "grad_norm": 1.306161642074585, "learning_rate": 1.5463752327194548e-06, "loss": 0.3056, "num_input_tokens_seen": 70067088, "step": 120680 }, { "epoch": 17.97512660113196, "grad_norm": 1.1243709325790405, "learning_rate": 1.545250342521945e-06, "loss": 0.6489, "num_input_tokens_seen": 70070384, "step": 120685 }, { "epoch": 17.975871313672922, "grad_norm": 1.460499882698059, "learning_rate": 1.544125848568595e-06, "loss": 0.7027, "num_input_tokens_seen": 70073072, "step": 120690 }, { "epoch": 17.976616026213883, "grad_norm": 1.6093456745147705, "learning_rate": 1.5430017508783978e-06, "loss": 0.6167, "num_input_tokens_seen": 70076368, "step": 120695 }, { "epoch": 17.97736073875484, "grad_norm": 1.4598970413208008, "learning_rate": 1.541878049470344e-06, "loss": 0.5985, "num_input_tokens_seen": 70079216, "step": 120700 }, { "epoch": 17.9781054512958, "grad_norm": 0.799085795879364, "learning_rate": 1.5407547443634206e-06, "loss": 0.5075, "num_input_tokens_seen": 70081840, "step": 120705 }, { "epoch": 17.97885016383676, "grad_norm": 1.219199776649475, "learning_rate": 1.5396318355765993e-06, "loss": 0.6128, "num_input_tokens_seen": 70084944, "step": 120710 }, { "epoch": 17.979594876377718, "grad_norm": 0.5415294170379639, "learning_rate": 1.538509323128859e-06, "loss": 0.7259, "num_input_tokens_seen": 70087792, "step": 120715 }, { "epoch": 17.980339588918678, "grad_norm": 3.258795738220215, "learning_rate": 1.5373872070391536e-06, "loss": 0.5153, "num_input_tokens_seen": 70090544, "step": 120720 }, { "epoch": 17.981084301459635, "grad_norm": 3.1311538219451904, "learning_rate": 1.5362654873264493e-06, "loss": 0.4758, "num_input_tokens_seen": 70093488, "step": 120725 }, { "epoch": 17.981829014000596, "grad_norm": 1.1399753093719482, "learning_rate": 1.5351441640096941e-06, "loss": 0.5812, "num_input_tokens_seen": 70096272, "step": 120730 }, { "epoch": 17.982573726541556, "grad_norm": 1.6506847143173218, "learning_rate": 1.5340232371078262e-06, "loss": 0.5919, "num_input_tokens_seen": 70098992, "step": 120735 }, { "epoch": 17.983318439082513, "grad_norm": 1.8793070316314697, "learning_rate": 1.5329027066397884e-06, "loss": 0.4745, "num_input_tokens_seen": 70101936, "step": 120740 }, { "epoch": 17.984063151623474, "grad_norm": 0.763309121131897, "learning_rate": 1.5317825726245045e-06, "loss": 0.5409, "num_input_tokens_seen": 70104976, "step": 120745 }, { "epoch": 17.984807864164434, "grad_norm": 1.1196904182434082, "learning_rate": 1.5306628350809037e-06, "loss": 0.5302, "num_input_tokens_seen": 70107824, "step": 120750 }, { "epoch": 17.98555257670539, "grad_norm": 2.313607692718506, "learning_rate": 1.529543494027899e-06, "loss": 0.6119, "num_input_tokens_seen": 70110736, "step": 120755 }, { "epoch": 17.98629728924635, "grad_norm": 1.9301230907440186, "learning_rate": 1.5284245494844057e-06, "loss": 0.5521, "num_input_tokens_seen": 70114000, "step": 120760 }, { "epoch": 17.98704200178731, "grad_norm": 2.1805419921875, "learning_rate": 1.5273060014693224e-06, "loss": 0.4736, "num_input_tokens_seen": 70117136, "step": 120765 }, { "epoch": 17.98778671432827, "grad_norm": 1.3648325204849243, "learning_rate": 1.526187850001548e-06, "loss": 0.5017, "num_input_tokens_seen": 70120080, "step": 120770 }, { "epoch": 17.98853142686923, "grad_norm": 1.1328461170196533, "learning_rate": 1.5250700950999758e-06, "loss": 0.4118, "num_input_tokens_seen": 70123088, "step": 120775 }, { "epoch": 17.989276139410187, "grad_norm": 2.641577959060669, "learning_rate": 1.5239527367834794e-06, "loss": 0.6338, "num_input_tokens_seen": 70125936, "step": 120780 }, { "epoch": 17.990020851951147, "grad_norm": 1.3625539541244507, "learning_rate": 1.5228357750709465e-06, "loss": 0.6813, "num_input_tokens_seen": 70128656, "step": 120785 }, { "epoch": 17.990765564492108, "grad_norm": 1.8651857376098633, "learning_rate": 1.5217192099812372e-06, "loss": 0.8888, "num_input_tokens_seen": 70131568, "step": 120790 }, { "epoch": 17.991510277033065, "grad_norm": 1.3261678218841553, "learning_rate": 1.5206030415332223e-06, "loss": 0.7051, "num_input_tokens_seen": 70134608, "step": 120795 }, { "epoch": 17.992254989574025, "grad_norm": 4.191829681396484, "learning_rate": 1.519487269745759e-06, "loss": 0.6295, "num_input_tokens_seen": 70137648, "step": 120800 }, { "epoch": 17.992999702114982, "grad_norm": 1.0217341184616089, "learning_rate": 1.5183718946376907e-06, "loss": 0.4326, "num_input_tokens_seen": 70140624, "step": 120805 }, { "epoch": 17.993744414655943, "grad_norm": 1.3469218015670776, "learning_rate": 1.5172569162278661e-06, "loss": 0.5677, "num_input_tokens_seen": 70143248, "step": 120810 }, { "epoch": 17.994489127196903, "grad_norm": 1.3654601573944092, "learning_rate": 1.5161423345351116e-06, "loss": 0.6597, "num_input_tokens_seen": 70146384, "step": 120815 }, { "epoch": 17.99523383973786, "grad_norm": 2.4392285346984863, "learning_rate": 1.515028149578271e-06, "loss": 0.6312, "num_input_tokens_seen": 70149488, "step": 120820 }, { "epoch": 17.99597855227882, "grad_norm": 1.6387773752212524, "learning_rate": 1.5139143613761565e-06, "loss": 0.4634, "num_input_tokens_seen": 70152208, "step": 120825 }, { "epoch": 17.996723264819778, "grad_norm": 0.9848193526268005, "learning_rate": 1.5128009699475948e-06, "loss": 0.6645, "num_input_tokens_seen": 70155152, "step": 120830 }, { "epoch": 17.99746797736074, "grad_norm": 2.09652042388916, "learning_rate": 1.5116879753113822e-06, "loss": 0.7108, "num_input_tokens_seen": 70158160, "step": 120835 }, { "epoch": 17.9982126899017, "grad_norm": 1.4292057752609253, "learning_rate": 1.510575377486334e-06, "loss": 0.6036, "num_input_tokens_seen": 70161072, "step": 120840 }, { "epoch": 17.998957402442656, "grad_norm": 1.7192174196243286, "learning_rate": 1.5094631764912354e-06, "loss": 0.6139, "num_input_tokens_seen": 70163856, "step": 120845 }, { "epoch": 17.999702114983616, "grad_norm": 1.977211356163025, "learning_rate": 1.5083513723448877e-06, "loss": 0.5259, "num_input_tokens_seen": 70166640, "step": 120850 }, { "epoch": 18.0, "eval_loss": 0.6566025614738464, "eval_runtime": 47.0065, "eval_samples_per_second": 63.481, "eval_steps_per_second": 15.87, "num_input_tokens_seen": 70167432, "step": 120852 }, { "epoch": 18.000446827524577, "grad_norm": 2.869886636734009, "learning_rate": 1.507239965066068e-06, "loss": 0.4984, "num_input_tokens_seen": 70169320, "step": 120855 }, { "epoch": 18.001191540065534, "grad_norm": 1.6409178972244263, "learning_rate": 1.506128954673547e-06, "loss": 0.5752, "num_input_tokens_seen": 70172264, "step": 120860 }, { "epoch": 18.001936252606495, "grad_norm": 2.0772995948791504, "learning_rate": 1.505018341186104e-06, "loss": 0.6599, "num_input_tokens_seen": 70174984, "step": 120865 }, { "epoch": 18.00268096514745, "grad_norm": 0.6108407378196716, "learning_rate": 1.5039081246224967e-06, "loss": 0.4268, "num_input_tokens_seen": 70177832, "step": 120870 }, { "epoch": 18.003425677688412, "grad_norm": 1.3066012859344482, "learning_rate": 1.502798305001482e-06, "loss": 0.5409, "num_input_tokens_seen": 70180872, "step": 120875 }, { "epoch": 18.004170390229373, "grad_norm": 0.985868513584137, "learning_rate": 1.5016888823418035e-06, "loss": 0.42, "num_input_tokens_seen": 70183784, "step": 120880 }, { "epoch": 18.00491510277033, "grad_norm": 2.1238160133361816, "learning_rate": 1.5005798566622125e-06, "loss": 0.6371, "num_input_tokens_seen": 70186536, "step": 120885 }, { "epoch": 18.00565981531129, "grad_norm": 1.1082346439361572, "learning_rate": 1.4994712279814415e-06, "loss": 0.6518, "num_input_tokens_seen": 70190024, "step": 120890 }, { "epoch": 18.00640452785225, "grad_norm": 1.4165538549423218, "learning_rate": 1.4983629963182143e-06, "loss": 0.4678, "num_input_tokens_seen": 70192680, "step": 120895 }, { "epoch": 18.007149240393208, "grad_norm": 2.7571046352386475, "learning_rate": 1.4972551616912633e-06, "loss": 0.7899, "num_input_tokens_seen": 70195784, "step": 120900 }, { "epoch": 18.007893952934168, "grad_norm": 1.814772367477417, "learning_rate": 1.4961477241192956e-06, "loss": 0.602, "num_input_tokens_seen": 70198728, "step": 120905 }, { "epoch": 18.008638665475125, "grad_norm": 1.179488182067871, "learning_rate": 1.4950406836210267e-06, "loss": 0.4787, "num_input_tokens_seen": 70201544, "step": 120910 }, { "epoch": 18.009383378016086, "grad_norm": 2.0099985599517822, "learning_rate": 1.493934040215153e-06, "loss": 0.6658, "num_input_tokens_seen": 70204232, "step": 120915 }, { "epoch": 18.010128090557046, "grad_norm": 1.2842439413070679, "learning_rate": 1.492827793920376e-06, "loss": 0.5759, "num_input_tokens_seen": 70206856, "step": 120920 }, { "epoch": 18.010872803098003, "grad_norm": 1.2162411212921143, "learning_rate": 1.4917219447553838e-06, "loss": 0.4586, "num_input_tokens_seen": 70209512, "step": 120925 }, { "epoch": 18.011617515638964, "grad_norm": 0.9278336763381958, "learning_rate": 1.49061649273885e-06, "loss": 0.5401, "num_input_tokens_seen": 70212488, "step": 120930 }, { "epoch": 18.012362228179924, "grad_norm": 1.3132216930389404, "learning_rate": 1.4895114378894625e-06, "loss": 0.6346, "num_input_tokens_seen": 70216008, "step": 120935 }, { "epoch": 18.01310694072088, "grad_norm": 2.340043544769287, "learning_rate": 1.4884067802258845e-06, "loss": 0.5124, "num_input_tokens_seen": 70218920, "step": 120940 }, { "epoch": 18.013851653261842, "grad_norm": 1.9918618202209473, "learning_rate": 1.4873025197667756e-06, "loss": 0.5123, "num_input_tokens_seen": 70221704, "step": 120945 }, { "epoch": 18.0145963658028, "grad_norm": 2.0120480060577393, "learning_rate": 1.4861986565307935e-06, "loss": 0.4973, "num_input_tokens_seen": 70224552, "step": 120950 }, { "epoch": 18.01534107834376, "grad_norm": 1.3999732732772827, "learning_rate": 1.4850951905365868e-06, "loss": 0.5131, "num_input_tokens_seen": 70227240, "step": 120955 }, { "epoch": 18.01608579088472, "grad_norm": 1.3611979484558105, "learning_rate": 1.4839921218027935e-06, "loss": 0.7035, "num_input_tokens_seen": 70230088, "step": 120960 }, { "epoch": 18.016830503425677, "grad_norm": 1.789960265159607, "learning_rate": 1.4828894503480601e-06, "loss": 0.6132, "num_input_tokens_seen": 70232904, "step": 120965 }, { "epoch": 18.017575215966637, "grad_norm": 3.044837236404419, "learning_rate": 1.4817871761910047e-06, "loss": 0.8146, "num_input_tokens_seen": 70235432, "step": 120970 }, { "epoch": 18.018319928507594, "grad_norm": 0.665635347366333, "learning_rate": 1.4806852993502485e-06, "loss": 0.4376, "num_input_tokens_seen": 70238120, "step": 120975 }, { "epoch": 18.019064641048555, "grad_norm": 1.5360075235366821, "learning_rate": 1.4795838198444157e-06, "loss": 0.5709, "num_input_tokens_seen": 70241032, "step": 120980 }, { "epoch": 18.019809353589515, "grad_norm": 0.9362487196922302, "learning_rate": 1.4784827376921052e-06, "loss": 0.4258, "num_input_tokens_seen": 70243912, "step": 120985 }, { "epoch": 18.020554066130472, "grad_norm": 2.267446279525757, "learning_rate": 1.4773820529119243e-06, "loss": 0.7248, "num_input_tokens_seen": 70246792, "step": 120990 }, { "epoch": 18.021298778671433, "grad_norm": 2.295593738555908, "learning_rate": 1.4762817655224664e-06, "loss": 0.5822, "num_input_tokens_seen": 70249576, "step": 120995 }, { "epoch": 18.022043491212393, "grad_norm": 1.5001252889633179, "learning_rate": 1.4751818755423225e-06, "loss": 0.4598, "num_input_tokens_seen": 70252520, "step": 121000 }, { "epoch": 18.02278820375335, "grad_norm": 1.1618907451629639, "learning_rate": 1.4740823829900664e-06, "loss": 0.4635, "num_input_tokens_seen": 70255336, "step": 121005 }, { "epoch": 18.02353291629431, "grad_norm": 1.2040706872940063, "learning_rate": 1.4729832878842803e-06, "loss": 0.6629, "num_input_tokens_seen": 70258440, "step": 121010 }, { "epoch": 18.024277628835268, "grad_norm": 2.618149518966675, "learning_rate": 1.4718845902435303e-06, "loss": 0.4067, "num_input_tokens_seen": 70261128, "step": 121015 }, { "epoch": 18.02502234137623, "grad_norm": 2.9322099685668945, "learning_rate": 1.4707862900863734e-06, "loss": 0.7452, "num_input_tokens_seen": 70263976, "step": 121020 }, { "epoch": 18.02576705391719, "grad_norm": 1.901781678199768, "learning_rate": 1.4696883874313727e-06, "loss": 0.4442, "num_input_tokens_seen": 70266920, "step": 121025 }, { "epoch": 18.026511766458146, "grad_norm": 1.6042903661727905, "learning_rate": 1.4685908822970663e-06, "loss": 0.6828, "num_input_tokens_seen": 70269832, "step": 121030 }, { "epoch": 18.027256478999107, "grad_norm": 1.4919592142105103, "learning_rate": 1.4674937747020057e-06, "loss": 0.6499, "num_input_tokens_seen": 70272808, "step": 121035 }, { "epoch": 18.028001191540067, "grad_norm": 0.9773953557014465, "learning_rate": 1.4663970646647152e-06, "loss": 0.4346, "num_input_tokens_seen": 70275560, "step": 121040 }, { "epoch": 18.028745904081024, "grad_norm": 1.1137287616729736, "learning_rate": 1.4653007522037325e-06, "loss": 0.5588, "num_input_tokens_seen": 70278312, "step": 121045 }, { "epoch": 18.029490616621985, "grad_norm": 1.8483797311782837, "learning_rate": 1.4642048373375712e-06, "loss": 0.5123, "num_input_tokens_seen": 70281480, "step": 121050 }, { "epoch": 18.03023532916294, "grad_norm": 4.236401557922363, "learning_rate": 1.4631093200847517e-06, "loss": 0.6675, "num_input_tokens_seen": 70284392, "step": 121055 }, { "epoch": 18.030980041703902, "grad_norm": 1.114967942237854, "learning_rate": 1.4620142004637766e-06, "loss": 0.6344, "num_input_tokens_seen": 70287336, "step": 121060 }, { "epoch": 18.031724754244863, "grad_norm": 3.243870496749878, "learning_rate": 1.4609194784931502e-06, "loss": 0.6122, "num_input_tokens_seen": 70290408, "step": 121065 }, { "epoch": 18.03246946678582, "grad_norm": 1.2013314962387085, "learning_rate": 1.459825154191366e-06, "loss": 0.5199, "num_input_tokens_seen": 70293192, "step": 121070 }, { "epoch": 18.03321417932678, "grad_norm": 2.2549455165863037, "learning_rate": 1.4587312275769065e-06, "loss": 0.5502, "num_input_tokens_seen": 70295880, "step": 121075 }, { "epoch": 18.03395889186774, "grad_norm": 3.602792978286743, "learning_rate": 1.4576376986682598e-06, "loss": 0.6941, "num_input_tokens_seen": 70298632, "step": 121080 }, { "epoch": 18.034703604408698, "grad_norm": 1.5363231897354126, "learning_rate": 1.4565445674838919e-06, "loss": 0.5388, "num_input_tokens_seen": 70301608, "step": 121085 }, { "epoch": 18.035448316949658, "grad_norm": 0.9907313585281372, "learning_rate": 1.455451834042279e-06, "loss": 0.5845, "num_input_tokens_seen": 70304616, "step": 121090 }, { "epoch": 18.036193029490615, "grad_norm": 1.9737588167190552, "learning_rate": 1.4543594983618792e-06, "loss": 0.6795, "num_input_tokens_seen": 70307496, "step": 121095 }, { "epoch": 18.036937742031576, "grad_norm": 2.1620402336120605, "learning_rate": 1.4532675604611412e-06, "loss": 0.6675, "num_input_tokens_seen": 70310760, "step": 121100 }, { "epoch": 18.037682454572536, "grad_norm": 3.358798027038574, "learning_rate": 1.45217602035852e-06, "loss": 0.5695, "num_input_tokens_seen": 70313704, "step": 121105 }, { "epoch": 18.038427167113493, "grad_norm": 1.5368657112121582, "learning_rate": 1.451084878072448e-06, "loss": 0.5029, "num_input_tokens_seen": 70316840, "step": 121110 }, { "epoch": 18.039171879654454, "grad_norm": 3.900290012359619, "learning_rate": 1.4499941336213657e-06, "loss": 0.6057, "num_input_tokens_seen": 70319624, "step": 121115 }, { "epoch": 18.039916592195414, "grad_norm": 1.2138248682022095, "learning_rate": 1.448903787023692e-06, "loss": 0.5223, "num_input_tokens_seen": 70322408, "step": 121120 }, { "epoch": 18.04066130473637, "grad_norm": 1.1097838878631592, "learning_rate": 1.4478138382978595e-06, "loss": 0.716, "num_input_tokens_seen": 70325224, "step": 121125 }, { "epoch": 18.041406017277332, "grad_norm": 2.301624298095703, "learning_rate": 1.4467242874622726e-06, "loss": 0.6141, "num_input_tokens_seen": 70327944, "step": 121130 }, { "epoch": 18.04215072981829, "grad_norm": 1.701459288597107, "learning_rate": 1.445635134535342e-06, "loss": 0.5347, "num_input_tokens_seen": 70330504, "step": 121135 }, { "epoch": 18.04289544235925, "grad_norm": 1.1042602062225342, "learning_rate": 1.4445463795354664e-06, "loss": 0.4791, "num_input_tokens_seen": 70334696, "step": 121140 }, { "epoch": 18.04364015490021, "grad_norm": 1.6230124235153198, "learning_rate": 1.443458022481034e-06, "loss": 0.7954, "num_input_tokens_seen": 70337448, "step": 121145 }, { "epoch": 18.044384867441167, "grad_norm": 1.4065722227096558, "learning_rate": 1.4423700633904414e-06, "loss": 0.6148, "num_input_tokens_seen": 70340360, "step": 121150 }, { "epoch": 18.045129579982127, "grad_norm": 1.4327071905136108, "learning_rate": 1.4412825022820598e-06, "loss": 0.8013, "num_input_tokens_seen": 70343592, "step": 121155 }, { "epoch": 18.045874292523084, "grad_norm": 1.447731614112854, "learning_rate": 1.440195339174269e-06, "loss": 0.5225, "num_input_tokens_seen": 70346504, "step": 121160 }, { "epoch": 18.046619005064045, "grad_norm": 1.1182036399841309, "learning_rate": 1.4391085740854293e-06, "loss": 0.7188, "num_input_tokens_seen": 70349544, "step": 121165 }, { "epoch": 18.047363717605005, "grad_norm": 1.0561788082122803, "learning_rate": 1.4380222070339095e-06, "loss": 0.5131, "num_input_tokens_seen": 70352360, "step": 121170 }, { "epoch": 18.048108430145962, "grad_norm": 1.3819878101348877, "learning_rate": 1.4369362380380558e-06, "loss": 0.6418, "num_input_tokens_seen": 70355528, "step": 121175 }, { "epoch": 18.048853142686923, "grad_norm": 1.1105520725250244, "learning_rate": 1.435850667116212e-06, "loss": 0.5004, "num_input_tokens_seen": 70358280, "step": 121180 }, { "epoch": 18.049597855227884, "grad_norm": 2.0491576194763184, "learning_rate": 1.4347654942867245e-06, "loss": 0.5458, "num_input_tokens_seen": 70361288, "step": 121185 }, { "epoch": 18.05034256776884, "grad_norm": 2.454336643218994, "learning_rate": 1.4336807195679203e-06, "loss": 0.5648, "num_input_tokens_seen": 70364296, "step": 121190 }, { "epoch": 18.0510872803098, "grad_norm": 1.5585633516311646, "learning_rate": 1.4325963429781347e-06, "loss": 0.7409, "num_input_tokens_seen": 70367240, "step": 121195 }, { "epoch": 18.051831992850758, "grad_norm": 1.0987889766693115, "learning_rate": 1.4315123645356782e-06, "loss": 0.7239, "num_input_tokens_seen": 70370056, "step": 121200 }, { "epoch": 18.05257670539172, "grad_norm": 1.3492209911346436, "learning_rate": 1.4304287842588665e-06, "loss": 0.4838, "num_input_tokens_seen": 70373064, "step": 121205 }, { "epoch": 18.05332141793268, "grad_norm": 2.114861249923706, "learning_rate": 1.4293456021660018e-06, "loss": 0.6135, "num_input_tokens_seen": 70375816, "step": 121210 }, { "epoch": 18.054066130473636, "grad_norm": 1.162448763847351, "learning_rate": 1.4282628182753915e-06, "loss": 0.6044, "num_input_tokens_seen": 70378920, "step": 121215 }, { "epoch": 18.054810843014597, "grad_norm": 1.3826779127120972, "learning_rate": 1.4271804326053239e-06, "loss": 0.3983, "num_input_tokens_seen": 70382344, "step": 121220 }, { "epoch": 18.055555555555557, "grad_norm": 1.5601526498794556, "learning_rate": 1.4260984451740815e-06, "loss": 0.6372, "num_input_tokens_seen": 70385288, "step": 121225 }, { "epoch": 18.056300268096514, "grad_norm": 1.4036568403244019, "learning_rate": 1.4250168559999499e-06, "loss": 0.4322, "num_input_tokens_seen": 70388008, "step": 121230 }, { "epoch": 18.057044980637475, "grad_norm": 2.2812201976776123, "learning_rate": 1.423935665101192e-06, "loss": 0.6149, "num_input_tokens_seen": 70390952, "step": 121235 }, { "epoch": 18.05778969317843, "grad_norm": 2.0795817375183105, "learning_rate": 1.422854872496085e-06, "loss": 0.6296, "num_input_tokens_seen": 70393704, "step": 121240 }, { "epoch": 18.058534405719392, "grad_norm": 1.9209433794021606, "learning_rate": 1.421774478202878e-06, "loss": 0.5942, "num_input_tokens_seen": 70396808, "step": 121245 }, { "epoch": 18.059279118260353, "grad_norm": 0.8132762312889099, "learning_rate": 1.4206944822398316e-06, "loss": 0.3984, "num_input_tokens_seen": 70399560, "step": 121250 }, { "epoch": 18.06002383080131, "grad_norm": 1.4763994216918945, "learning_rate": 1.4196148846251867e-06, "loss": 0.5001, "num_input_tokens_seen": 70402568, "step": 121255 }, { "epoch": 18.06076854334227, "grad_norm": 1.2147588729858398, "learning_rate": 1.4185356853771787e-06, "loss": 0.6003, "num_input_tokens_seen": 70405512, "step": 121260 }, { "epoch": 18.06151325588323, "grad_norm": 1.2364778518676758, "learning_rate": 1.4174568845140457e-06, "loss": 0.6737, "num_input_tokens_seen": 70408264, "step": 121265 }, { "epoch": 18.062257968424188, "grad_norm": 2.10638427734375, "learning_rate": 1.416378482054012e-06, "loss": 0.6344, "num_input_tokens_seen": 70411112, "step": 121270 }, { "epoch": 18.06300268096515, "grad_norm": 1.8487979173660278, "learning_rate": 1.4153004780152939e-06, "loss": 0.4983, "num_input_tokens_seen": 70413768, "step": 121275 }, { "epoch": 18.063747393506105, "grad_norm": 1.3434584140777588, "learning_rate": 1.414222872416099e-06, "loss": 0.691, "num_input_tokens_seen": 70416744, "step": 121280 }, { "epoch": 18.064492106047066, "grad_norm": 1.1175618171691895, "learning_rate": 1.4131456652746428e-06, "loss": 0.5999, "num_input_tokens_seen": 70419528, "step": 121285 }, { "epoch": 18.065236818588026, "grad_norm": 1.4549224376678467, "learning_rate": 1.4120688566091112e-06, "loss": 0.7684, "num_input_tokens_seen": 70422536, "step": 121290 }, { "epoch": 18.065981531128983, "grad_norm": 1.287318468093872, "learning_rate": 1.410992446437709e-06, "loss": 0.631, "num_input_tokens_seen": 70425704, "step": 121295 }, { "epoch": 18.066726243669944, "grad_norm": 3.3156213760375977, "learning_rate": 1.4099164347786132e-06, "loss": 0.9055, "num_input_tokens_seen": 70428872, "step": 121300 }, { "epoch": 18.0674709562109, "grad_norm": 1.5269675254821777, "learning_rate": 1.408840821650001e-06, "loss": 0.8774, "num_input_tokens_seen": 70431560, "step": 121305 }, { "epoch": 18.06821566875186, "grad_norm": 1.4589910507202148, "learning_rate": 1.4077656070700496e-06, "loss": 0.5238, "num_input_tokens_seen": 70434408, "step": 121310 }, { "epoch": 18.068960381292822, "grad_norm": 1.4946918487548828, "learning_rate": 1.4066907910569166e-06, "loss": 0.6212, "num_input_tokens_seen": 70437320, "step": 121315 }, { "epoch": 18.06970509383378, "grad_norm": 1.8243192434310913, "learning_rate": 1.4056163736287682e-06, "loss": 0.4836, "num_input_tokens_seen": 70440296, "step": 121320 }, { "epoch": 18.07044980637474, "grad_norm": 1.364399790763855, "learning_rate": 1.4045423548037478e-06, "loss": 0.4924, "num_input_tokens_seen": 70443240, "step": 121325 }, { "epoch": 18.0711945189157, "grad_norm": 1.4374767541885376, "learning_rate": 1.4034687346000052e-06, "loss": 0.5658, "num_input_tokens_seen": 70445960, "step": 121330 }, { "epoch": 18.071939231456657, "grad_norm": 1.3555108308792114, "learning_rate": 1.4023955130356758e-06, "loss": 0.6774, "num_input_tokens_seen": 70449320, "step": 121335 }, { "epoch": 18.072683943997617, "grad_norm": 2.1670806407928467, "learning_rate": 1.4013226901288868e-06, "loss": 0.5083, "num_input_tokens_seen": 70452168, "step": 121340 }, { "epoch": 18.073428656538574, "grad_norm": 1.532638669013977, "learning_rate": 1.400250265897768e-06, "loss": 0.8945, "num_input_tokens_seen": 70455080, "step": 121345 }, { "epoch": 18.074173369079535, "grad_norm": 1.373291015625, "learning_rate": 1.3991782403604353e-06, "loss": 0.5724, "num_input_tokens_seen": 70457896, "step": 121350 }, { "epoch": 18.074918081620496, "grad_norm": 1.672131061553955, "learning_rate": 1.3981066135349995e-06, "loss": 0.6207, "num_input_tokens_seen": 70460904, "step": 121355 }, { "epoch": 18.075662794161452, "grad_norm": 0.7027020454406738, "learning_rate": 1.397035385439563e-06, "loss": 0.5435, "num_input_tokens_seen": 70463816, "step": 121360 }, { "epoch": 18.076407506702413, "grad_norm": 2.614469289779663, "learning_rate": 1.3959645560922275e-06, "loss": 0.6842, "num_input_tokens_seen": 70466856, "step": 121365 }, { "epoch": 18.077152219243374, "grad_norm": 1.5413554906845093, "learning_rate": 1.3948941255110787e-06, "loss": 0.7502, "num_input_tokens_seen": 70469800, "step": 121370 }, { "epoch": 18.07789693178433, "grad_norm": 2.1373982429504395, "learning_rate": 1.393824093714205e-06, "loss": 0.5634, "num_input_tokens_seen": 70472840, "step": 121375 }, { "epoch": 18.07864164432529, "grad_norm": 1.1272566318511963, "learning_rate": 1.3927544607196807e-06, "loss": 0.3722, "num_input_tokens_seen": 70475368, "step": 121380 }, { "epoch": 18.079386356866248, "grad_norm": 2.932209014892578, "learning_rate": 1.3916852265455722e-06, "loss": 0.734, "num_input_tokens_seen": 70478440, "step": 121385 }, { "epoch": 18.08013106940721, "grad_norm": 1.2737780809402466, "learning_rate": 1.3906163912099506e-06, "loss": 0.4386, "num_input_tokens_seen": 70481288, "step": 121390 }, { "epoch": 18.08087578194817, "grad_norm": 2.2837531566619873, "learning_rate": 1.3895479547308716e-06, "loss": 0.6622, "num_input_tokens_seen": 70484168, "step": 121395 }, { "epoch": 18.081620494489126, "grad_norm": 1.1739997863769531, "learning_rate": 1.3884799171263841e-06, "loss": 0.6679, "num_input_tokens_seen": 70487112, "step": 121400 }, { "epoch": 18.082365207030087, "grad_norm": 2.4549832344055176, "learning_rate": 1.3874122784145239e-06, "loss": 0.7139, "num_input_tokens_seen": 70489832, "step": 121405 }, { "epoch": 18.083109919571047, "grad_norm": 2.7111101150512695, "learning_rate": 1.3863450386133402e-06, "loss": 0.6189, "num_input_tokens_seen": 70492840, "step": 121410 }, { "epoch": 18.083854632112004, "grad_norm": 1.6964678764343262, "learning_rate": 1.385278197740858e-06, "loss": 0.5207, "num_input_tokens_seen": 70495688, "step": 121415 }, { "epoch": 18.084599344652965, "grad_norm": 1.2897547483444214, "learning_rate": 1.384211755815093e-06, "loss": 0.6666, "num_input_tokens_seen": 70498568, "step": 121420 }, { "epoch": 18.08534405719392, "grad_norm": 1.5695180892944336, "learning_rate": 1.3831457128540753e-06, "loss": 0.4861, "num_input_tokens_seen": 70501384, "step": 121425 }, { "epoch": 18.086088769734882, "grad_norm": 1.877461552619934, "learning_rate": 1.3820800688758018e-06, "loss": 0.7891, "num_input_tokens_seen": 70504072, "step": 121430 }, { "epoch": 18.086833482275843, "grad_norm": 2.4559366703033447, "learning_rate": 1.3810148238982857e-06, "loss": 0.642, "num_input_tokens_seen": 70507048, "step": 121435 }, { "epoch": 18.0875781948168, "grad_norm": 3.050096273422241, "learning_rate": 1.3799499779395152e-06, "loss": 0.5296, "num_input_tokens_seen": 70509896, "step": 121440 }, { "epoch": 18.08832290735776, "grad_norm": 1.9487992525100708, "learning_rate": 1.3788855310174876e-06, "loss": 0.7275, "num_input_tokens_seen": 70513128, "step": 121445 }, { "epoch": 18.08906761989872, "grad_norm": 1.6249972581863403, "learning_rate": 1.3778214831501767e-06, "loss": 0.579, "num_input_tokens_seen": 70516264, "step": 121450 }, { "epoch": 18.089812332439678, "grad_norm": 1.3095309734344482, "learning_rate": 1.3767578343555688e-06, "loss": 0.6547, "num_input_tokens_seen": 70519240, "step": 121455 }, { "epoch": 18.09055704498064, "grad_norm": 2.307630777359009, "learning_rate": 1.3756945846516267e-06, "loss": 0.585, "num_input_tokens_seen": 70522344, "step": 121460 }, { "epoch": 18.091301757521595, "grad_norm": 1.983101725578308, "learning_rate": 1.3746317340563142e-06, "loss": 0.6688, "num_input_tokens_seen": 70525000, "step": 121465 }, { "epoch": 18.092046470062556, "grad_norm": 0.9647349119186401, "learning_rate": 1.3735692825875861e-06, "loss": 0.5928, "num_input_tokens_seen": 70527976, "step": 121470 }, { "epoch": 18.092791182603516, "grad_norm": 3.0216288566589355, "learning_rate": 1.3725072302633895e-06, "loss": 0.7945, "num_input_tokens_seen": 70530792, "step": 121475 }, { "epoch": 18.093535895144473, "grad_norm": 1.5033862590789795, "learning_rate": 1.371445577101671e-06, "loss": 0.7044, "num_input_tokens_seen": 70533736, "step": 121480 }, { "epoch": 18.094280607685434, "grad_norm": 1.840469241142273, "learning_rate": 1.3703843231203634e-06, "loss": 0.7671, "num_input_tokens_seen": 70536520, "step": 121485 }, { "epoch": 18.09502532022639, "grad_norm": 1.2213122844696045, "learning_rate": 1.3693234683373997e-06, "loss": 0.5178, "num_input_tokens_seen": 70539368, "step": 121490 }, { "epoch": 18.09577003276735, "grad_norm": 0.9682946801185608, "learning_rate": 1.3682630127706958e-06, "loss": 0.5686, "num_input_tokens_seen": 70542344, "step": 121495 }, { "epoch": 18.096514745308312, "grad_norm": 2.4026272296905518, "learning_rate": 1.3672029564381711e-06, "loss": 0.6581, "num_input_tokens_seen": 70545384, "step": 121500 }, { "epoch": 18.09725945784927, "grad_norm": 1.0861802101135254, "learning_rate": 1.3661432993577333e-06, "loss": 0.7041, "num_input_tokens_seen": 70548616, "step": 121505 }, { "epoch": 18.09800417039023, "grad_norm": 1.1357358694076538, "learning_rate": 1.365084041547282e-06, "loss": 0.5544, "num_input_tokens_seen": 70552040, "step": 121510 }, { "epoch": 18.09874888293119, "grad_norm": 1.2157260179519653, "learning_rate": 1.364025183024717e-06, "loss": 0.6351, "num_input_tokens_seen": 70555240, "step": 121515 }, { "epoch": 18.099493595472147, "grad_norm": 1.643792986869812, "learning_rate": 1.3629667238079208e-06, "loss": 0.6083, "num_input_tokens_seen": 70557768, "step": 121520 }, { "epoch": 18.100238308013108, "grad_norm": 1.1046996116638184, "learning_rate": 1.361908663914782e-06, "loss": 0.5376, "num_input_tokens_seen": 70560744, "step": 121525 }, { "epoch": 18.100983020554064, "grad_norm": 0.9055070281028748, "learning_rate": 1.3608510033631728e-06, "loss": 0.498, "num_input_tokens_seen": 70563720, "step": 121530 }, { "epoch": 18.101727733095025, "grad_norm": 1.8800437450408936, "learning_rate": 1.3597937421709588e-06, "loss": 0.6311, "num_input_tokens_seen": 70566664, "step": 121535 }, { "epoch": 18.102472445635986, "grad_norm": 0.9121978878974915, "learning_rate": 1.358736880356004e-06, "loss": 0.5296, "num_input_tokens_seen": 70569416, "step": 121540 }, { "epoch": 18.103217158176943, "grad_norm": 1.0623027086257935, "learning_rate": 1.3576804179361552e-06, "loss": 0.5095, "num_input_tokens_seen": 70572296, "step": 121545 }, { "epoch": 18.103961870717903, "grad_norm": 1.4157006740570068, "learning_rate": 1.356624354929273e-06, "loss": 0.5087, "num_input_tokens_seen": 70575080, "step": 121550 }, { "epoch": 18.104706583258864, "grad_norm": 1.7900482416152954, "learning_rate": 1.3555686913531874e-06, "loss": 0.5082, "num_input_tokens_seen": 70577832, "step": 121555 }, { "epoch": 18.10545129579982, "grad_norm": 1.902532935142517, "learning_rate": 1.3545134272257426e-06, "loss": 0.5616, "num_input_tokens_seen": 70580680, "step": 121560 }, { "epoch": 18.10619600834078, "grad_norm": 4.060535430908203, "learning_rate": 1.3534585625647578e-06, "loss": 0.607, "num_input_tokens_seen": 70583432, "step": 121565 }, { "epoch": 18.106940720881738, "grad_norm": 0.8665179014205933, "learning_rate": 1.3524040973880575e-06, "loss": 0.4233, "num_input_tokens_seen": 70586408, "step": 121570 }, { "epoch": 18.1076854334227, "grad_norm": 2.076249361038208, "learning_rate": 1.3513500317134582e-06, "loss": 0.4687, "num_input_tokens_seen": 70589032, "step": 121575 }, { "epoch": 18.10843014596366, "grad_norm": 1.3065029382705688, "learning_rate": 1.3502963655587619e-06, "loss": 0.4497, "num_input_tokens_seen": 70591528, "step": 121580 }, { "epoch": 18.109174858504616, "grad_norm": 1.2985121011734009, "learning_rate": 1.3492430989417742e-06, "loss": 0.551, "num_input_tokens_seen": 70594280, "step": 121585 }, { "epoch": 18.109919571045577, "grad_norm": 1.1137292385101318, "learning_rate": 1.3481902318802835e-06, "loss": 0.3483, "num_input_tokens_seen": 70597672, "step": 121590 }, { "epoch": 18.110664283586537, "grad_norm": 1.5186687707901, "learning_rate": 1.3471377643920841e-06, "loss": 0.6308, "num_input_tokens_seen": 70600616, "step": 121595 }, { "epoch": 18.111408996127494, "grad_norm": 1.6070740222930908, "learning_rate": 1.346085696494953e-06, "loss": 0.6637, "num_input_tokens_seen": 70603720, "step": 121600 }, { "epoch": 18.112153708668455, "grad_norm": 1.1457834243774414, "learning_rate": 1.3450340282066625e-06, "loss": 0.5954, "num_input_tokens_seen": 70606472, "step": 121605 }, { "epoch": 18.11289842120941, "grad_norm": 1.9085060358047485, "learning_rate": 1.3439827595449761e-06, "loss": 0.6428, "num_input_tokens_seen": 70609448, "step": 121610 }, { "epoch": 18.113643133750372, "grad_norm": 1.1627014875411987, "learning_rate": 1.3429318905276627e-06, "loss": 0.6244, "num_input_tokens_seen": 70612232, "step": 121615 }, { "epoch": 18.114387846291333, "grad_norm": 1.6875303983688354, "learning_rate": 1.3418814211724695e-06, "loss": 0.4535, "num_input_tokens_seen": 70615336, "step": 121620 }, { "epoch": 18.11513255883229, "grad_norm": 2.0007290840148926, "learning_rate": 1.3408313514971432e-06, "loss": 0.5828, "num_input_tokens_seen": 70618184, "step": 121625 }, { "epoch": 18.11587727137325, "grad_norm": 2.7533767223358154, "learning_rate": 1.3397816815194281e-06, "loss": 0.5887, "num_input_tokens_seen": 70621064, "step": 121630 }, { "epoch": 18.11662198391421, "grad_norm": 0.8345012068748474, "learning_rate": 1.3387324112570488e-06, "loss": 0.5804, "num_input_tokens_seen": 70623944, "step": 121635 }, { "epoch": 18.117366696455168, "grad_norm": 1.24491286277771, "learning_rate": 1.3376835407277437e-06, "loss": 0.5469, "num_input_tokens_seen": 70626696, "step": 121640 }, { "epoch": 18.11811140899613, "grad_norm": 1.535982608795166, "learning_rate": 1.3366350699492214e-06, "loss": 0.5594, "num_input_tokens_seen": 70629672, "step": 121645 }, { "epoch": 18.118856121537085, "grad_norm": 0.9860138297080994, "learning_rate": 1.3355869989392005e-06, "loss": 0.8078, "num_input_tokens_seen": 70632456, "step": 121650 }, { "epoch": 18.119600834078046, "grad_norm": 1.357747197151184, "learning_rate": 1.3345393277153896e-06, "loss": 0.4804, "num_input_tokens_seen": 70635240, "step": 121655 }, { "epoch": 18.120345546619006, "grad_norm": 1.0752109289169312, "learning_rate": 1.3334920562954822e-06, "loss": 0.3711, "num_input_tokens_seen": 70638248, "step": 121660 }, { "epoch": 18.121090259159963, "grad_norm": 2.1391642093658447, "learning_rate": 1.332445184697173e-06, "loss": 0.502, "num_input_tokens_seen": 70641320, "step": 121665 }, { "epoch": 18.121834971700924, "grad_norm": 3.3000054359436035, "learning_rate": 1.3313987129381422e-06, "loss": 0.5524, "num_input_tokens_seen": 70644168, "step": 121670 }, { "epoch": 18.12257968424188, "grad_norm": 1.0756144523620605, "learning_rate": 1.3303526410360811e-06, "loss": 0.6557, "num_input_tokens_seen": 70646984, "step": 121675 }, { "epoch": 18.12332439678284, "grad_norm": 1.2288851737976074, "learning_rate": 1.3293069690086506e-06, "loss": 0.6362, "num_input_tokens_seen": 70649832, "step": 121680 }, { "epoch": 18.124069109323802, "grad_norm": 1.1110316514968872, "learning_rate": 1.3282616968735256e-06, "loss": 0.6057, "num_input_tokens_seen": 70652584, "step": 121685 }, { "epoch": 18.12481382186476, "grad_norm": 1.2187597751617432, "learning_rate": 1.3272168246483557e-06, "loss": 0.5205, "num_input_tokens_seen": 70655176, "step": 121690 }, { "epoch": 18.12555853440572, "grad_norm": 1.8683043718338013, "learning_rate": 1.3261723523508018e-06, "loss": 0.5275, "num_input_tokens_seen": 70657800, "step": 121695 }, { "epoch": 18.12630324694668, "grad_norm": 0.8000348210334778, "learning_rate": 1.3251282799985026e-06, "loss": 0.4908, "num_input_tokens_seen": 70660712, "step": 121700 }, { "epoch": 18.127047959487637, "grad_norm": 1.4684927463531494, "learning_rate": 1.3240846076090996e-06, "loss": 0.6631, "num_input_tokens_seen": 70663528, "step": 121705 }, { "epoch": 18.127792672028598, "grad_norm": 1.3732272386550903, "learning_rate": 1.3230413352002259e-06, "loss": 0.7609, "num_input_tokens_seen": 70666376, "step": 121710 }, { "epoch": 18.128537384569555, "grad_norm": 1.6280405521392822, "learning_rate": 1.3219984627895005e-06, "loss": 0.5379, "num_input_tokens_seen": 70669288, "step": 121715 }, { "epoch": 18.129282097110515, "grad_norm": 1.6487830877304077, "learning_rate": 1.3209559903945483e-06, "loss": 0.5971, "num_input_tokens_seen": 70672232, "step": 121720 }, { "epoch": 18.130026809651476, "grad_norm": 2.72672438621521, "learning_rate": 1.3199139180329806e-06, "loss": 0.7498, "num_input_tokens_seen": 70675304, "step": 121725 }, { "epoch": 18.130771522192433, "grad_norm": 1.287503957748413, "learning_rate": 1.3188722457223995e-06, "loss": 0.3715, "num_input_tokens_seen": 70677992, "step": 121730 }, { "epoch": 18.131516234733393, "grad_norm": 2.240154504776001, "learning_rate": 1.3178309734803968e-06, "loss": 0.5459, "num_input_tokens_seen": 70681096, "step": 121735 }, { "epoch": 18.132260947274354, "grad_norm": 1.221936821937561, "learning_rate": 1.3167901013245775e-06, "loss": 0.554, "num_input_tokens_seen": 70683816, "step": 121740 }, { "epoch": 18.13300565981531, "grad_norm": 2.119961738586426, "learning_rate": 1.3157496292725169e-06, "loss": 0.4978, "num_input_tokens_seen": 70686664, "step": 121745 }, { "epoch": 18.13375037235627, "grad_norm": 1.1507515907287598, "learning_rate": 1.3147095573417923e-06, "loss": 0.6525, "num_input_tokens_seen": 70689480, "step": 121750 }, { "epoch": 18.134495084897228, "grad_norm": 1.5105209350585938, "learning_rate": 1.3136698855499812e-06, "loss": 0.4765, "num_input_tokens_seen": 70692360, "step": 121755 }, { "epoch": 18.13523979743819, "grad_norm": 2.044571876525879, "learning_rate": 1.3126306139146394e-06, "loss": 0.7045, "num_input_tokens_seen": 70695080, "step": 121760 }, { "epoch": 18.13598450997915, "grad_norm": 1.959822654724121, "learning_rate": 1.3115917424533303e-06, "loss": 0.7682, "num_input_tokens_seen": 70697960, "step": 121765 }, { "epoch": 18.136729222520106, "grad_norm": 1.7550899982452393, "learning_rate": 1.310553271183601e-06, "loss": 0.4456, "num_input_tokens_seen": 70700808, "step": 121770 }, { "epoch": 18.137473935061067, "grad_norm": 1.530592679977417, "learning_rate": 1.3095152001230015e-06, "loss": 0.4074, "num_input_tokens_seen": 70703560, "step": 121775 }, { "epoch": 18.138218647602027, "grad_norm": 3.2460217475891113, "learning_rate": 1.308477529289065e-06, "loss": 0.7283, "num_input_tokens_seen": 70706376, "step": 121780 }, { "epoch": 18.138963360142984, "grad_norm": 1.546316385269165, "learning_rate": 1.3074402586993162e-06, "loss": 0.5395, "num_input_tokens_seen": 70709160, "step": 121785 }, { "epoch": 18.139708072683945, "grad_norm": 1.6926171779632568, "learning_rate": 1.3064033883712883e-06, "loss": 0.587, "num_input_tokens_seen": 70712168, "step": 121790 }, { "epoch": 18.140452785224902, "grad_norm": 1.374600887298584, "learning_rate": 1.3053669183224954e-06, "loss": 0.5874, "num_input_tokens_seen": 70715048, "step": 121795 }, { "epoch": 18.141197497765862, "grad_norm": 1.3675744533538818, "learning_rate": 1.3043308485704454e-06, "loss": 0.5925, "num_input_tokens_seen": 70717960, "step": 121800 }, { "epoch": 18.141942210306823, "grad_norm": 1.5250142812728882, "learning_rate": 1.303295179132638e-06, "loss": 0.5961, "num_input_tokens_seen": 70720776, "step": 121805 }, { "epoch": 18.14268692284778, "grad_norm": 2.1595258712768555, "learning_rate": 1.3022599100265791e-06, "loss": 0.6208, "num_input_tokens_seen": 70723816, "step": 121810 }, { "epoch": 18.14343163538874, "grad_norm": 1.546350121498108, "learning_rate": 1.3012250412697519e-06, "loss": 0.6129, "num_input_tokens_seen": 70726760, "step": 121815 }, { "epoch": 18.1441763479297, "grad_norm": 2.877995014190674, "learning_rate": 1.300190572879642e-06, "loss": 0.6642, "num_input_tokens_seen": 70729992, "step": 121820 }, { "epoch": 18.144921060470658, "grad_norm": 2.8863413333892822, "learning_rate": 1.2991565048737248e-06, "loss": 0.5715, "num_input_tokens_seen": 70732744, "step": 121825 }, { "epoch": 18.14566577301162, "grad_norm": 1.3443603515625, "learning_rate": 1.2981228372694692e-06, "loss": 0.5246, "num_input_tokens_seen": 70735784, "step": 121830 }, { "epoch": 18.146410485552575, "grad_norm": 1.7905486822128296, "learning_rate": 1.2970895700843394e-06, "loss": 0.6487, "num_input_tokens_seen": 70738792, "step": 121835 }, { "epoch": 18.147155198093536, "grad_norm": 2.0160763263702393, "learning_rate": 1.296056703335788e-06, "loss": 0.5738, "num_input_tokens_seen": 70741800, "step": 121840 }, { "epoch": 18.147899910634496, "grad_norm": 2.5850706100463867, "learning_rate": 1.2950242370412702e-06, "loss": 0.4256, "num_input_tokens_seen": 70744776, "step": 121845 }, { "epoch": 18.148644623175453, "grad_norm": 1.7153263092041016, "learning_rate": 1.2939921712182223e-06, "loss": 0.4432, "num_input_tokens_seen": 70748072, "step": 121850 }, { "epoch": 18.149389335716414, "grad_norm": 3.0567853450775146, "learning_rate": 1.2929605058840889e-06, "loss": 0.5408, "num_input_tokens_seen": 70750984, "step": 121855 }, { "epoch": 18.15013404825737, "grad_norm": 1.5117530822753906, "learning_rate": 1.2919292410562889e-06, "loss": 0.7689, "num_input_tokens_seen": 70753896, "step": 121860 }, { "epoch": 18.15087876079833, "grad_norm": 1.0772134065628052, "learning_rate": 1.2908983767522504e-06, "loss": 0.647, "num_input_tokens_seen": 70756968, "step": 121865 }, { "epoch": 18.151623473339292, "grad_norm": 2.125302791595459, "learning_rate": 1.289867912989387e-06, "loss": 0.6536, "num_input_tokens_seen": 70759912, "step": 121870 }, { "epoch": 18.15236818588025, "grad_norm": 2.118338108062744, "learning_rate": 1.2888378497851045e-06, "loss": 0.5197, "num_input_tokens_seen": 70763016, "step": 121875 }, { "epoch": 18.15311289842121, "grad_norm": 2.1300389766693115, "learning_rate": 1.2878081871568082e-06, "loss": 0.5978, "num_input_tokens_seen": 70766184, "step": 121880 }, { "epoch": 18.15385761096217, "grad_norm": 1.8500267267227173, "learning_rate": 1.2867789251218925e-06, "loss": 0.4925, "num_input_tokens_seen": 70768968, "step": 121885 }, { "epoch": 18.154602323503127, "grad_norm": 1.5447646379470825, "learning_rate": 1.2857500636977466e-06, "loss": 0.7425, "num_input_tokens_seen": 70771784, "step": 121890 }, { "epoch": 18.155347036044088, "grad_norm": 3.2414586544036865, "learning_rate": 1.2847216029017506e-06, "loss": 0.7979, "num_input_tokens_seen": 70774440, "step": 121895 }, { "epoch": 18.156091748585045, "grad_norm": 0.9600133299827576, "learning_rate": 1.2836935427512826e-06, "loss": 0.6454, "num_input_tokens_seen": 70777352, "step": 121900 }, { "epoch": 18.156836461126005, "grad_norm": 0.9052849411964417, "learning_rate": 1.2826658832637062e-06, "loss": 0.654, "num_input_tokens_seen": 70780168, "step": 121905 }, { "epoch": 18.157581173666966, "grad_norm": 2.0842173099517822, "learning_rate": 1.2816386244563827e-06, "loss": 0.4701, "num_input_tokens_seen": 70782792, "step": 121910 }, { "epoch": 18.158325886207923, "grad_norm": 1.1555674076080322, "learning_rate": 1.2806117663466704e-06, "loss": 0.7039, "num_input_tokens_seen": 70785704, "step": 121915 }, { "epoch": 18.159070598748883, "grad_norm": 1.3969357013702393, "learning_rate": 1.279585308951914e-06, "loss": 0.5002, "num_input_tokens_seen": 70788552, "step": 121920 }, { "epoch": 18.159815311289844, "grad_norm": 2.2948179244995117, "learning_rate": 1.2785592522894573e-06, "loss": 0.5528, "num_input_tokens_seen": 70791048, "step": 121925 }, { "epoch": 18.1605600238308, "grad_norm": 3.6444451808929443, "learning_rate": 1.2775335963766317e-06, "loss": 0.763, "num_input_tokens_seen": 70793896, "step": 121930 }, { "epoch": 18.16130473637176, "grad_norm": 0.9570736289024353, "learning_rate": 1.2765083412307672e-06, "loss": 0.5927, "num_input_tokens_seen": 70797000, "step": 121935 }, { "epoch": 18.162049448912718, "grad_norm": 1.8234279155731201, "learning_rate": 1.2754834868691834e-06, "loss": 0.4545, "num_input_tokens_seen": 70799880, "step": 121940 }, { "epoch": 18.16279416145368, "grad_norm": 1.1752727031707764, "learning_rate": 1.2744590333091888e-06, "loss": 0.4911, "num_input_tokens_seen": 70802856, "step": 121945 }, { "epoch": 18.16353887399464, "grad_norm": 2.9723167419433594, "learning_rate": 1.2734349805680974e-06, "loss": 0.6942, "num_input_tokens_seen": 70805928, "step": 121950 }, { "epoch": 18.164283586535596, "grad_norm": 1.516256332397461, "learning_rate": 1.2724113286632061e-06, "loss": 0.6044, "num_input_tokens_seen": 70808712, "step": 121955 }, { "epoch": 18.165028299076557, "grad_norm": 1.1019959449768066, "learning_rate": 1.2713880776118126e-06, "loss": 0.5684, "num_input_tokens_seen": 70811784, "step": 121960 }, { "epoch": 18.165773011617517, "grad_norm": 1.0736337900161743, "learning_rate": 1.2703652274311973e-06, "loss": 0.5454, "num_input_tokens_seen": 70815016, "step": 121965 }, { "epoch": 18.166517724158474, "grad_norm": 1.9280683994293213, "learning_rate": 1.2693427781386464e-06, "loss": 0.4423, "num_input_tokens_seen": 70818216, "step": 121970 }, { "epoch": 18.167262436699435, "grad_norm": 3.1220614910125732, "learning_rate": 1.2683207297514293e-06, "loss": 0.6427, "num_input_tokens_seen": 70821160, "step": 121975 }, { "epoch": 18.168007149240392, "grad_norm": 1.444762945175171, "learning_rate": 1.267299082286816e-06, "loss": 0.4566, "num_input_tokens_seen": 70824232, "step": 121980 }, { "epoch": 18.168751861781352, "grad_norm": 1.7035635709762573, "learning_rate": 1.2662778357620614e-06, "loss": 0.7883, "num_input_tokens_seen": 70827240, "step": 121985 }, { "epoch": 18.169496574322313, "grad_norm": 2.6765859127044678, "learning_rate": 1.2652569901944244e-06, "loss": 0.5048, "num_input_tokens_seen": 70830344, "step": 121990 }, { "epoch": 18.17024128686327, "grad_norm": 2.1286861896514893, "learning_rate": 1.2642365456011467e-06, "loss": 0.7645, "num_input_tokens_seen": 70833224, "step": 121995 }, { "epoch": 18.17098599940423, "grad_norm": 1.525505542755127, "learning_rate": 1.2632165019994646e-06, "loss": 0.5839, "num_input_tokens_seen": 70835880, "step": 122000 }, { "epoch": 18.171730711945187, "grad_norm": 1.6399345397949219, "learning_rate": 1.262196859406617e-06, "loss": 0.6637, "num_input_tokens_seen": 70839016, "step": 122005 }, { "epoch": 18.172475424486148, "grad_norm": 1.8050788640975952, "learning_rate": 1.261177617839826e-06, "loss": 0.5894, "num_input_tokens_seen": 70842120, "step": 122010 }, { "epoch": 18.17322013702711, "grad_norm": 2.5693886280059814, "learning_rate": 1.2601587773163142e-06, "loss": 0.6966, "num_input_tokens_seen": 70845672, "step": 122015 }, { "epoch": 18.173964849568065, "grad_norm": 1.3968979120254517, "learning_rate": 1.259140337853293e-06, "loss": 0.7458, "num_input_tokens_seen": 70848808, "step": 122020 }, { "epoch": 18.174709562109026, "grad_norm": 1.252454161643982, "learning_rate": 1.258122299467962e-06, "loss": 0.5488, "num_input_tokens_seen": 70851560, "step": 122025 }, { "epoch": 18.175454274649987, "grad_norm": 2.0327064990997314, "learning_rate": 1.2571046621775273e-06, "loss": 0.5877, "num_input_tokens_seen": 70854472, "step": 122030 }, { "epoch": 18.176198987190944, "grad_norm": 2.4623239040374756, "learning_rate": 1.2560874259991778e-06, "loss": 0.5065, "num_input_tokens_seen": 70857224, "step": 122035 }, { "epoch": 18.176943699731904, "grad_norm": 2.018979549407959, "learning_rate": 1.2550705909500998e-06, "loss": 0.7233, "num_input_tokens_seen": 70859784, "step": 122040 }, { "epoch": 18.17768841227286, "grad_norm": 1.4920319318771362, "learning_rate": 1.2540541570474684e-06, "loss": 0.5306, "num_input_tokens_seen": 70862792, "step": 122045 }, { "epoch": 18.17843312481382, "grad_norm": 1.590042233467102, "learning_rate": 1.2530381243084616e-06, "loss": 0.5792, "num_input_tokens_seen": 70865544, "step": 122050 }, { "epoch": 18.179177837354782, "grad_norm": 2.0518486499786377, "learning_rate": 1.2520224927502405e-06, "loss": 0.87, "num_input_tokens_seen": 70868328, "step": 122055 }, { "epoch": 18.17992254989574, "grad_norm": 3.4957501888275146, "learning_rate": 1.251007262389961e-06, "loss": 0.8334, "num_input_tokens_seen": 70871464, "step": 122060 }, { "epoch": 18.1806672624367, "grad_norm": 1.090905785560608, "learning_rate": 1.2499924332447788e-06, "loss": 0.4993, "num_input_tokens_seen": 70874440, "step": 122065 }, { "epoch": 18.18141197497766, "grad_norm": 2.3343381881713867, "learning_rate": 1.24897800533183e-06, "loss": 0.6761, "num_input_tokens_seen": 70877416, "step": 122070 }, { "epoch": 18.182156687518617, "grad_norm": 1.6829944849014282, "learning_rate": 1.247963978668265e-06, "loss": 0.4737, "num_input_tokens_seen": 70880680, "step": 122075 }, { "epoch": 18.182901400059578, "grad_norm": 0.8197080492973328, "learning_rate": 1.2469503532712008e-06, "loss": 0.4504, "num_input_tokens_seen": 70883464, "step": 122080 }, { "epoch": 18.183646112600535, "grad_norm": 1.563598871231079, "learning_rate": 1.2459371291577759e-06, "loss": 0.5503, "num_input_tokens_seen": 70886536, "step": 122085 }, { "epoch": 18.184390825141495, "grad_norm": 1.0753225088119507, "learning_rate": 1.2449243063450967e-06, "loss": 0.5676, "num_input_tokens_seen": 70889704, "step": 122090 }, { "epoch": 18.185135537682456, "grad_norm": 1.2327193021774292, "learning_rate": 1.2439118848502796e-06, "loss": 0.6219, "num_input_tokens_seen": 70892776, "step": 122095 }, { "epoch": 18.185880250223413, "grad_norm": 2.110999584197998, "learning_rate": 1.2428998646904277e-06, "loss": 0.6377, "num_input_tokens_seen": 70895752, "step": 122100 }, { "epoch": 18.186624962764373, "grad_norm": 2.5011160373687744, "learning_rate": 1.241888245882633e-06, "loss": 0.9386, "num_input_tokens_seen": 70898632, "step": 122105 }, { "epoch": 18.187369675305334, "grad_norm": 3.3342607021331787, "learning_rate": 1.240877028443993e-06, "loss": 0.6939, "num_input_tokens_seen": 70901384, "step": 122110 }, { "epoch": 18.18811438784629, "grad_norm": 1.3810524940490723, "learning_rate": 1.2398662123915827e-06, "loss": 0.7679, "num_input_tokens_seen": 70904232, "step": 122115 }, { "epoch": 18.18885910038725, "grad_norm": 2.096391201019287, "learning_rate": 1.2388557977424915e-06, "loss": 0.6072, "num_input_tokens_seen": 70907016, "step": 122120 }, { "epoch": 18.18960381292821, "grad_norm": 1.3313082456588745, "learning_rate": 1.2378457845137775e-06, "loss": 0.5371, "num_input_tokens_seen": 70910024, "step": 122125 }, { "epoch": 18.19034852546917, "grad_norm": 3.828669786453247, "learning_rate": 1.236836172722511e-06, "loss": 0.6095, "num_input_tokens_seen": 70913000, "step": 122130 }, { "epoch": 18.19109323801013, "grad_norm": 1.8236173391342163, "learning_rate": 1.2358269623857416e-06, "loss": 0.5367, "num_input_tokens_seen": 70915688, "step": 122135 }, { "epoch": 18.191837950551086, "grad_norm": 2.047248363494873, "learning_rate": 1.2348181535205283e-06, "loss": 0.7621, "num_input_tokens_seen": 70918376, "step": 122140 }, { "epoch": 18.192582663092047, "grad_norm": 1.055633544921875, "learning_rate": 1.2338097461439047e-06, "loss": 0.3957, "num_input_tokens_seen": 70921448, "step": 122145 }, { "epoch": 18.193327375633007, "grad_norm": 0.9409869313240051, "learning_rate": 1.2328017402729098e-06, "loss": 0.6583, "num_input_tokens_seen": 70924360, "step": 122150 }, { "epoch": 18.194072088173964, "grad_norm": 1.2664490938186646, "learning_rate": 1.2317941359245744e-06, "loss": 0.6234, "num_input_tokens_seen": 70927016, "step": 122155 }, { "epoch": 18.194816800714925, "grad_norm": 1.7968980073928833, "learning_rate": 1.2307869331159182e-06, "loss": 0.6276, "num_input_tokens_seen": 70929864, "step": 122160 }, { "epoch": 18.195561513255882, "grad_norm": 1.612215280532837, "learning_rate": 1.229780131863964e-06, "loss": 0.6823, "num_input_tokens_seen": 70932488, "step": 122165 }, { "epoch": 18.196306225796842, "grad_norm": 1.276078462600708, "learning_rate": 1.2287737321857118e-06, "loss": 0.5821, "num_input_tokens_seen": 70935560, "step": 122170 }, { "epoch": 18.197050938337803, "grad_norm": 0.8449290990829468, "learning_rate": 1.2277677340981675e-06, "loss": 0.6718, "num_input_tokens_seen": 70938408, "step": 122175 }, { "epoch": 18.19779565087876, "grad_norm": 0.9614825248718262, "learning_rate": 1.2267621376183286e-06, "loss": 0.4429, "num_input_tokens_seen": 70941192, "step": 122180 }, { "epoch": 18.19854036341972, "grad_norm": 1.5784707069396973, "learning_rate": 1.2257569427631789e-06, "loss": 0.693, "num_input_tokens_seen": 70943944, "step": 122185 }, { "epoch": 18.199285075960677, "grad_norm": 1.0446079969406128, "learning_rate": 1.2247521495497043e-06, "loss": 0.5057, "num_input_tokens_seen": 70946568, "step": 122190 }, { "epoch": 18.200029788501638, "grad_norm": 0.9834450483322144, "learning_rate": 1.223747757994878e-06, "loss": 0.51, "num_input_tokens_seen": 70949160, "step": 122195 }, { "epoch": 18.2007745010426, "grad_norm": 0.912813127040863, "learning_rate": 1.2227437681156695e-06, "loss": 0.5186, "num_input_tokens_seen": 70951816, "step": 122200 }, { "epoch": 18.201519213583556, "grad_norm": 3.4913527965545654, "learning_rate": 1.2217401799290345e-06, "loss": 0.5823, "num_input_tokens_seen": 70954856, "step": 122205 }, { "epoch": 18.202263926124516, "grad_norm": 1.1251240968704224, "learning_rate": 1.2207369934519347e-06, "loss": 0.6207, "num_input_tokens_seen": 70957640, "step": 122210 }, { "epoch": 18.203008638665477, "grad_norm": 1.0509065389633179, "learning_rate": 1.2197342087013146e-06, "loss": 0.4335, "num_input_tokens_seen": 70960424, "step": 122215 }, { "epoch": 18.203753351206434, "grad_norm": 1.4518933296203613, "learning_rate": 1.2187318256941166e-06, "loss": 0.5347, "num_input_tokens_seen": 70963592, "step": 122220 }, { "epoch": 18.204498063747394, "grad_norm": 2.3622171878814697, "learning_rate": 1.2177298444472741e-06, "loss": 0.5384, "num_input_tokens_seen": 70966344, "step": 122225 }, { "epoch": 18.20524277628835, "grad_norm": 1.9354360103607178, "learning_rate": 1.2167282649777123e-06, "loss": 0.5752, "num_input_tokens_seen": 70969224, "step": 122230 }, { "epoch": 18.20598748882931, "grad_norm": 2.095240354537964, "learning_rate": 1.2157270873023596e-06, "loss": 0.7612, "num_input_tokens_seen": 70972072, "step": 122235 }, { "epoch": 18.206732201370272, "grad_norm": 1.443464994430542, "learning_rate": 1.2147263114381191e-06, "loss": 0.4306, "num_input_tokens_seen": 70974824, "step": 122240 }, { "epoch": 18.20747691391123, "grad_norm": 0.9187219142913818, "learning_rate": 1.213725937401905e-06, "loss": 0.6693, "num_input_tokens_seen": 70977672, "step": 122245 }, { "epoch": 18.20822162645219, "grad_norm": 1.7737953662872314, "learning_rate": 1.2127259652106149e-06, "loss": 0.54, "num_input_tokens_seen": 70980424, "step": 122250 }, { "epoch": 18.20896633899315, "grad_norm": 1.5748801231384277, "learning_rate": 1.2117263948811464e-06, "loss": 0.595, "num_input_tokens_seen": 70983240, "step": 122255 }, { "epoch": 18.209711051534107, "grad_norm": 1.2011183500289917, "learning_rate": 1.2107272264303831e-06, "loss": 0.535, "num_input_tokens_seen": 70986408, "step": 122260 }, { "epoch": 18.210455764075068, "grad_norm": 1.4237011671066284, "learning_rate": 1.2097284598752034e-06, "loss": 0.4313, "num_input_tokens_seen": 70989224, "step": 122265 }, { "epoch": 18.211200476616025, "grad_norm": 1.336666464805603, "learning_rate": 1.2087300952324826e-06, "loss": 0.6042, "num_input_tokens_seen": 70992264, "step": 122270 }, { "epoch": 18.211945189156985, "grad_norm": 1.06050443649292, "learning_rate": 1.2077321325190849e-06, "loss": 0.5128, "num_input_tokens_seen": 70995176, "step": 122275 }, { "epoch": 18.212689901697946, "grad_norm": 2.2211363315582275, "learning_rate": 1.206734571751872e-06, "loss": 0.613, "num_input_tokens_seen": 70997768, "step": 122280 }, { "epoch": 18.213434614238903, "grad_norm": 1.6816339492797852, "learning_rate": 1.2057374129476968e-06, "loss": 0.8366, "num_input_tokens_seen": 71000712, "step": 122285 }, { "epoch": 18.214179326779863, "grad_norm": 1.942711591720581, "learning_rate": 1.2047406561234042e-06, "loss": 0.6128, "num_input_tokens_seen": 71003368, "step": 122290 }, { "epoch": 18.214924039320824, "grad_norm": 1.4735020399093628, "learning_rate": 1.203744301295831e-06, "loss": 0.7249, "num_input_tokens_seen": 71006664, "step": 122295 }, { "epoch": 18.21566875186178, "grad_norm": 1.1921476125717163, "learning_rate": 1.2027483484818165e-06, "loss": 0.6386, "num_input_tokens_seen": 71009544, "step": 122300 }, { "epoch": 18.21641346440274, "grad_norm": 2.227501392364502, "learning_rate": 1.2017527976981828e-06, "loss": 0.5757, "num_input_tokens_seen": 71012232, "step": 122305 }, { "epoch": 18.2171581769437, "grad_norm": 2.0804288387298584, "learning_rate": 1.200757648961745e-06, "loss": 0.5321, "num_input_tokens_seen": 71015304, "step": 122310 }, { "epoch": 18.21790288948466, "grad_norm": 1.7515829801559448, "learning_rate": 1.1997629022893198e-06, "loss": 0.509, "num_input_tokens_seen": 71018248, "step": 122315 }, { "epoch": 18.21864760202562, "grad_norm": 1.6332751512527466, "learning_rate": 1.1987685576977131e-06, "loss": 0.6637, "num_input_tokens_seen": 71021192, "step": 122320 }, { "epoch": 18.219392314566576, "grad_norm": 1.4040721654891968, "learning_rate": 1.19777461520372e-06, "loss": 0.6002, "num_input_tokens_seen": 71023816, "step": 122325 }, { "epoch": 18.220137027107537, "grad_norm": 1.3048279285430908, "learning_rate": 1.19678107482413e-06, "loss": 0.5888, "num_input_tokens_seen": 71026600, "step": 122330 }, { "epoch": 18.220881739648497, "grad_norm": 1.299607276916504, "learning_rate": 1.1957879365757346e-06, "loss": 0.6196, "num_input_tokens_seen": 71029480, "step": 122335 }, { "epoch": 18.221626452189454, "grad_norm": 1.4668872356414795, "learning_rate": 1.1947952004753044e-06, "loss": 0.5364, "num_input_tokens_seen": 71032328, "step": 122340 }, { "epoch": 18.222371164730415, "grad_norm": 1.5417503118515015, "learning_rate": 1.1938028665396173e-06, "loss": 0.5769, "num_input_tokens_seen": 71035176, "step": 122345 }, { "epoch": 18.223115877271372, "grad_norm": 1.7977099418640137, "learning_rate": 1.1928109347854377e-06, "loss": 0.5605, "num_input_tokens_seen": 71037992, "step": 122350 }, { "epoch": 18.223860589812332, "grad_norm": 2.261146068572998, "learning_rate": 1.1918194052295162e-06, "loss": 0.5379, "num_input_tokens_seen": 71040872, "step": 122355 }, { "epoch": 18.224605302353293, "grad_norm": 2.6612792015075684, "learning_rate": 1.1908282778886115e-06, "loss": 0.695, "num_input_tokens_seen": 71043912, "step": 122360 }, { "epoch": 18.22535001489425, "grad_norm": 1.4237936735153198, "learning_rate": 1.1898375527794603e-06, "loss": 0.7299, "num_input_tokens_seen": 71046888, "step": 122365 }, { "epoch": 18.22609472743521, "grad_norm": 0.9289398193359375, "learning_rate": 1.1888472299188102e-06, "loss": 0.4281, "num_input_tokens_seen": 71049544, "step": 122370 }, { "epoch": 18.226839439976168, "grad_norm": 3.1267507076263428, "learning_rate": 1.1878573093233814e-06, "loss": 0.5935, "num_input_tokens_seen": 71052328, "step": 122375 }, { "epoch": 18.227584152517128, "grad_norm": 2.4646005630493164, "learning_rate": 1.1868677910099018e-06, "loss": 0.7812, "num_input_tokens_seen": 71055176, "step": 122380 }, { "epoch": 18.22832886505809, "grad_norm": 2.2929320335388184, "learning_rate": 1.1858786749950919e-06, "loss": 0.607, "num_input_tokens_seen": 71058216, "step": 122385 }, { "epoch": 18.229073577599046, "grad_norm": 0.9959126114845276, "learning_rate": 1.184889961295657e-06, "loss": 0.4519, "num_input_tokens_seen": 71060936, "step": 122390 }, { "epoch": 18.229818290140006, "grad_norm": 2.4422481060028076, "learning_rate": 1.1839016499283013e-06, "loss": 0.6419, "num_input_tokens_seen": 71063592, "step": 122395 }, { "epoch": 18.230563002680967, "grad_norm": 1.4663199186325073, "learning_rate": 1.1829137409097191e-06, "loss": 0.5566, "num_input_tokens_seen": 71066504, "step": 122400 }, { "epoch": 18.231307715221924, "grad_norm": 2.2405354976654053, "learning_rate": 1.1819262342566056e-06, "loss": 0.4169, "num_input_tokens_seen": 71069384, "step": 122405 }, { "epoch": 18.232052427762884, "grad_norm": 1.268830418586731, "learning_rate": 1.1809391299856365e-06, "loss": 0.4829, "num_input_tokens_seen": 71072392, "step": 122410 }, { "epoch": 18.23279714030384, "grad_norm": 2.688140392303467, "learning_rate": 1.1799524281134983e-06, "loss": 0.6379, "num_input_tokens_seen": 71075496, "step": 122415 }, { "epoch": 18.2335418528448, "grad_norm": 1.6544216871261597, "learning_rate": 1.1789661286568472e-06, "loss": 0.4086, "num_input_tokens_seen": 71078216, "step": 122420 }, { "epoch": 18.234286565385762, "grad_norm": 1.4626305103302002, "learning_rate": 1.1779802316323585e-06, "loss": 0.6954, "num_input_tokens_seen": 71081576, "step": 122425 }, { "epoch": 18.23503127792672, "grad_norm": 1.452583909034729, "learning_rate": 1.176994737056683e-06, "loss": 0.6594, "num_input_tokens_seen": 71084264, "step": 122430 }, { "epoch": 18.23577599046768, "grad_norm": 3.788281202316284, "learning_rate": 1.176009644946463e-06, "loss": 0.7013, "num_input_tokens_seen": 71086984, "step": 122435 }, { "epoch": 18.23652070300864, "grad_norm": 2.5505330562591553, "learning_rate": 1.1750249553183518e-06, "loss": 0.4961, "num_input_tokens_seen": 71089736, "step": 122440 }, { "epoch": 18.237265415549597, "grad_norm": 3.428037405014038, "learning_rate": 1.1740406681889748e-06, "loss": 0.7929, "num_input_tokens_seen": 71092776, "step": 122445 }, { "epoch": 18.238010128090558, "grad_norm": 1.8410100936889648, "learning_rate": 1.173056783574969e-06, "loss": 0.6029, "num_input_tokens_seen": 71095592, "step": 122450 }, { "epoch": 18.238754840631515, "grad_norm": 1.4351893663406372, "learning_rate": 1.1720733014929514e-06, "loss": 0.5439, "num_input_tokens_seen": 71098856, "step": 122455 }, { "epoch": 18.239499553172475, "grad_norm": 1.5730292797088623, "learning_rate": 1.1710902219595366e-06, "loss": 0.5562, "num_input_tokens_seen": 71101800, "step": 122460 }, { "epoch": 18.240244265713436, "grad_norm": 1.2719346284866333, "learning_rate": 1.1701075449913363e-06, "loss": 0.682, "num_input_tokens_seen": 71104840, "step": 122465 }, { "epoch": 18.240988978254393, "grad_norm": 1.1246232986450195, "learning_rate": 1.1691252706049456e-06, "loss": 0.4622, "num_input_tokens_seen": 71107816, "step": 122470 }, { "epoch": 18.241733690795353, "grad_norm": 1.1939328908920288, "learning_rate": 1.168143398816965e-06, "loss": 0.5646, "num_input_tokens_seen": 71110984, "step": 122475 }, { "epoch": 18.242478403336314, "grad_norm": 2.301588773727417, "learning_rate": 1.1671619296439785e-06, "loss": 0.6839, "num_input_tokens_seen": 71113800, "step": 122480 }, { "epoch": 18.24322311587727, "grad_norm": 1.5575308799743652, "learning_rate": 1.16618086310257e-06, "loss": 0.5356, "num_input_tokens_seen": 71116776, "step": 122485 }, { "epoch": 18.24396782841823, "grad_norm": 1.2075295448303223, "learning_rate": 1.1652001992093097e-06, "loss": 0.5058, "num_input_tokens_seen": 71119848, "step": 122490 }, { "epoch": 18.24471254095919, "grad_norm": 1.4300031661987305, "learning_rate": 1.1642199379807706e-06, "loss": 0.7566, "num_input_tokens_seen": 71122504, "step": 122495 }, { "epoch": 18.24545725350015, "grad_norm": 1.9168617725372314, "learning_rate": 1.1632400794335084e-06, "loss": 0.6524, "num_input_tokens_seen": 71125224, "step": 122500 }, { "epoch": 18.24620196604111, "grad_norm": 1.399558663368225, "learning_rate": 1.16226062358408e-06, "loss": 0.4118, "num_input_tokens_seen": 71127656, "step": 122505 }, { "epoch": 18.246946678582066, "grad_norm": 1.8983677625656128, "learning_rate": 1.1612815704490298e-06, "loss": 0.6167, "num_input_tokens_seen": 71130568, "step": 122510 }, { "epoch": 18.247691391123027, "grad_norm": 1.2721223831176758, "learning_rate": 1.1603029200448978e-06, "loss": 0.5083, "num_input_tokens_seen": 71133736, "step": 122515 }, { "epoch": 18.248436103663984, "grad_norm": 2.530362844467163, "learning_rate": 1.1593246723882206e-06, "loss": 0.5555, "num_input_tokens_seen": 71136968, "step": 122520 }, { "epoch": 18.249180816204944, "grad_norm": 2.0190060138702393, "learning_rate": 1.158346827495524e-06, "loss": 0.7743, "num_input_tokens_seen": 71139752, "step": 122525 }, { "epoch": 18.249925528745905, "grad_norm": 2.005934715270996, "learning_rate": 1.1573693853833224e-06, "loss": 0.467, "num_input_tokens_seen": 71142440, "step": 122530 }, { "epoch": 18.250670241286862, "grad_norm": 1.0868415832519531, "learning_rate": 1.156392346068133e-06, "loss": 0.561, "num_input_tokens_seen": 71145224, "step": 122535 }, { "epoch": 18.251414953827823, "grad_norm": 1.3897457122802734, "learning_rate": 1.1554157095664625e-06, "loss": 0.6909, "num_input_tokens_seen": 71147944, "step": 122540 }, { "epoch": 18.252159666368783, "grad_norm": 1.6338292360305786, "learning_rate": 1.1544394758948112e-06, "loss": 0.6116, "num_input_tokens_seen": 71151016, "step": 122545 }, { "epoch": 18.25290437890974, "grad_norm": 1.0287102460861206, "learning_rate": 1.1534636450696634e-06, "loss": 0.5774, "num_input_tokens_seen": 71154184, "step": 122550 }, { "epoch": 18.2536490914507, "grad_norm": 1.3288805484771729, "learning_rate": 1.1524882171075168e-06, "loss": 0.7556, "num_input_tokens_seen": 71157064, "step": 122555 }, { "epoch": 18.254393803991658, "grad_norm": 1.2336925268173218, "learning_rate": 1.151513192024839e-06, "loss": 0.6383, "num_input_tokens_seen": 71160008, "step": 122560 }, { "epoch": 18.255138516532618, "grad_norm": 1.9044114351272583, "learning_rate": 1.150538569838111e-06, "loss": 0.6136, "num_input_tokens_seen": 71162664, "step": 122565 }, { "epoch": 18.25588322907358, "grad_norm": 1.678174614906311, "learning_rate": 1.1495643505637922e-06, "loss": 0.567, "num_input_tokens_seen": 71165640, "step": 122570 }, { "epoch": 18.256627941614536, "grad_norm": 3.2630040645599365, "learning_rate": 1.148590534218344e-06, "loss": 0.6398, "num_input_tokens_seen": 71168328, "step": 122575 }, { "epoch": 18.257372654155496, "grad_norm": 2.1663458347320557, "learning_rate": 1.1476171208182146e-06, "loss": 0.755, "num_input_tokens_seen": 71171304, "step": 122580 }, { "epoch": 18.258117366696457, "grad_norm": 3.957322835922241, "learning_rate": 1.1466441103798575e-06, "loss": 0.645, "num_input_tokens_seen": 71174408, "step": 122585 }, { "epoch": 18.258862079237414, "grad_norm": 1.0503090620040894, "learning_rate": 1.1456715029197012e-06, "loss": 0.4957, "num_input_tokens_seen": 71177224, "step": 122590 }, { "epoch": 18.259606791778374, "grad_norm": 1.9488320350646973, "learning_rate": 1.1446992984541827e-06, "loss": 0.5862, "num_input_tokens_seen": 71180424, "step": 122595 }, { "epoch": 18.26035150431933, "grad_norm": 1.576410174369812, "learning_rate": 1.143727496999722e-06, "loss": 0.6041, "num_input_tokens_seen": 71183144, "step": 122600 }, { "epoch": 18.26109621686029, "grad_norm": 1.3371087312698364, "learning_rate": 1.1427560985727392e-06, "loss": 0.4903, "num_input_tokens_seen": 71186568, "step": 122605 }, { "epoch": 18.261840929401252, "grad_norm": 2.1974804401397705, "learning_rate": 1.1417851031896438e-06, "loss": 0.5935, "num_input_tokens_seen": 71189576, "step": 122610 }, { "epoch": 18.26258564194221, "grad_norm": 2.885969638824463, "learning_rate": 1.140814510866839e-06, "loss": 0.6958, "num_input_tokens_seen": 71192392, "step": 122615 }, { "epoch": 18.26333035448317, "grad_norm": 0.8388538360595703, "learning_rate": 1.1398443216207282e-06, "loss": 0.4844, "num_input_tokens_seen": 71195400, "step": 122620 }, { "epoch": 18.26407506702413, "grad_norm": 0.7418863773345947, "learning_rate": 1.138874535467696e-06, "loss": 0.5987, "num_input_tokens_seen": 71198344, "step": 122625 }, { "epoch": 18.264819779565087, "grad_norm": 1.5364928245544434, "learning_rate": 1.1379051524241236e-06, "loss": 0.6841, "num_input_tokens_seen": 71201192, "step": 122630 }, { "epoch": 18.265564492106048, "grad_norm": 1.146731972694397, "learning_rate": 1.1369361725063948e-06, "loss": 0.4752, "num_input_tokens_seen": 71204040, "step": 122635 }, { "epoch": 18.266309204647005, "grad_norm": 1.773419976234436, "learning_rate": 1.1359675957308745e-06, "loss": 0.761, "num_input_tokens_seen": 71206792, "step": 122640 }, { "epoch": 18.267053917187965, "grad_norm": 1.0403882265090942, "learning_rate": 1.1349994221139276e-06, "loss": 0.5104, "num_input_tokens_seen": 71210088, "step": 122645 }, { "epoch": 18.267798629728926, "grad_norm": 2.729079246520996, "learning_rate": 1.134031651671913e-06, "loss": 0.3841, "num_input_tokens_seen": 71213000, "step": 122650 }, { "epoch": 18.268543342269883, "grad_norm": 1.3828352689743042, "learning_rate": 1.133064284421176e-06, "loss": 0.7119, "num_input_tokens_seen": 71215848, "step": 122655 }, { "epoch": 18.269288054810843, "grad_norm": 1.3115057945251465, "learning_rate": 1.132097320378056e-06, "loss": 0.5574, "num_input_tokens_seen": 71218792, "step": 122660 }, { "epoch": 18.270032767351804, "grad_norm": 1.0674116611480713, "learning_rate": 1.1311307595588987e-06, "loss": 0.5309, "num_input_tokens_seen": 71221608, "step": 122665 }, { "epoch": 18.27077747989276, "grad_norm": 1.6707463264465332, "learning_rate": 1.130164601980027e-06, "loss": 0.4389, "num_input_tokens_seen": 71224520, "step": 122670 }, { "epoch": 18.27152219243372, "grad_norm": 2.3996164798736572, "learning_rate": 1.1291988476577614e-06, "loss": 0.8503, "num_input_tokens_seen": 71227624, "step": 122675 }, { "epoch": 18.27226690497468, "grad_norm": 1.9080497026443481, "learning_rate": 1.1282334966084246e-06, "loss": 0.6298, "num_input_tokens_seen": 71230408, "step": 122680 }, { "epoch": 18.27301161751564, "grad_norm": 2.921113967895508, "learning_rate": 1.1272685488483148e-06, "loss": 0.6238, "num_input_tokens_seen": 71233128, "step": 122685 }, { "epoch": 18.2737563300566, "grad_norm": 2.703986644744873, "learning_rate": 1.126304004393744e-06, "loss": 0.493, "num_input_tokens_seen": 71235848, "step": 122690 }, { "epoch": 18.274501042597556, "grad_norm": 0.8939514756202698, "learning_rate": 1.125339863261002e-06, "loss": 0.5052, "num_input_tokens_seen": 71238984, "step": 122695 }, { "epoch": 18.275245755138517, "grad_norm": 1.258772373199463, "learning_rate": 1.1243761254663781e-06, "loss": 0.6699, "num_input_tokens_seen": 71241928, "step": 122700 }, { "epoch": 18.275990467679474, "grad_norm": 2.6512081623077393, "learning_rate": 1.1234127910261543e-06, "loss": 0.6224, "num_input_tokens_seen": 71245192, "step": 122705 }, { "epoch": 18.276735180220435, "grad_norm": 2.3126394748687744, "learning_rate": 1.1224498599566009e-06, "loss": 0.6158, "num_input_tokens_seen": 71247848, "step": 122710 }, { "epoch": 18.277479892761395, "grad_norm": 2.31978178024292, "learning_rate": 1.1214873322739933e-06, "loss": 0.6205, "num_input_tokens_seen": 71250504, "step": 122715 }, { "epoch": 18.278224605302352, "grad_norm": 3.0822689533233643, "learning_rate": 1.1205252079945882e-06, "loss": 0.5568, "num_input_tokens_seen": 71253416, "step": 122720 }, { "epoch": 18.278969317843313, "grad_norm": 1.5501457452774048, "learning_rate": 1.1195634871346395e-06, "loss": 0.61, "num_input_tokens_seen": 71256328, "step": 122725 }, { "epoch": 18.279714030384273, "grad_norm": 1.0304312705993652, "learning_rate": 1.1186021697103893e-06, "loss": 0.5566, "num_input_tokens_seen": 71259176, "step": 122730 }, { "epoch": 18.28045874292523, "grad_norm": 1.5146193504333496, "learning_rate": 1.1176412557380888e-06, "loss": 0.5562, "num_input_tokens_seen": 71262088, "step": 122735 }, { "epoch": 18.28120345546619, "grad_norm": 1.1833581924438477, "learning_rate": 1.116680745233961e-06, "loss": 0.5091, "num_input_tokens_seen": 71265192, "step": 122740 }, { "epoch": 18.281948168007148, "grad_norm": 2.0504965782165527, "learning_rate": 1.1157206382142433e-06, "loss": 0.6173, "num_input_tokens_seen": 71267944, "step": 122745 }, { "epoch": 18.282692880548108, "grad_norm": 2.4661431312561035, "learning_rate": 1.1147609346951526e-06, "loss": 0.7578, "num_input_tokens_seen": 71270984, "step": 122750 }, { "epoch": 18.28343759308907, "grad_norm": 1.0980467796325684, "learning_rate": 1.113801634692893e-06, "loss": 0.4816, "num_input_tokens_seen": 71273736, "step": 122755 }, { "epoch": 18.284182305630026, "grad_norm": 2.215118646621704, "learning_rate": 1.1128427382236823e-06, "loss": 0.4772, "num_input_tokens_seen": 71276808, "step": 122760 }, { "epoch": 18.284927018170986, "grad_norm": 1.900551438331604, "learning_rate": 1.1118842453037126e-06, "loss": 0.6166, "num_input_tokens_seen": 71279496, "step": 122765 }, { "epoch": 18.285671730711947, "grad_norm": 1.4590693712234497, "learning_rate": 1.1109261559491823e-06, "loss": 0.4695, "num_input_tokens_seen": 71282408, "step": 122770 }, { "epoch": 18.286416443252904, "grad_norm": 2.312268018722534, "learning_rate": 1.109968470176273e-06, "loss": 0.486, "num_input_tokens_seen": 71285480, "step": 122775 }, { "epoch": 18.287161155793864, "grad_norm": 1.5367518663406372, "learning_rate": 1.109011188001169e-06, "loss": 0.5562, "num_input_tokens_seen": 71288264, "step": 122780 }, { "epoch": 18.28790586833482, "grad_norm": 1.7136062383651733, "learning_rate": 1.1080543094400374e-06, "loss": 0.4921, "num_input_tokens_seen": 71291112, "step": 122785 }, { "epoch": 18.28865058087578, "grad_norm": 1.2702178955078125, "learning_rate": 1.1070978345090494e-06, "loss": 0.5107, "num_input_tokens_seen": 71294152, "step": 122790 }, { "epoch": 18.289395293416742, "grad_norm": 2.2704241275787354, "learning_rate": 1.1061417632243554e-06, "loss": 0.6912, "num_input_tokens_seen": 71297160, "step": 122795 }, { "epoch": 18.2901400059577, "grad_norm": 1.0758675336837769, "learning_rate": 1.105186095602112e-06, "loss": 0.5686, "num_input_tokens_seen": 71300264, "step": 122800 }, { "epoch": 18.29088471849866, "grad_norm": 2.143195152282715, "learning_rate": 1.1042308316584649e-06, "loss": 0.3657, "num_input_tokens_seen": 71303464, "step": 122805 }, { "epoch": 18.29162943103962, "grad_norm": 3.531949758529663, "learning_rate": 1.1032759714095481e-06, "loss": 0.4194, "num_input_tokens_seen": 71306344, "step": 122810 }, { "epoch": 18.292374143580577, "grad_norm": 1.0357338190078735, "learning_rate": 1.1023215148714988e-06, "loss": 0.5375, "num_input_tokens_seen": 71309128, "step": 122815 }, { "epoch": 18.293118856121538, "grad_norm": 1.2389075756072998, "learning_rate": 1.1013674620604376e-06, "loss": 0.5692, "num_input_tokens_seen": 71311944, "step": 122820 }, { "epoch": 18.293863568662495, "grad_norm": 1.6891926527023315, "learning_rate": 1.1004138129924874e-06, "loss": 0.5657, "num_input_tokens_seen": 71314920, "step": 122825 }, { "epoch": 18.294608281203455, "grad_norm": 2.573582172393799, "learning_rate": 1.0994605676837521e-06, "loss": 0.4346, "num_input_tokens_seen": 71317704, "step": 122830 }, { "epoch": 18.295352993744416, "grad_norm": 2.8560760021209717, "learning_rate": 1.0985077261503384e-06, "loss": 0.5764, "num_input_tokens_seen": 71320744, "step": 122835 }, { "epoch": 18.296097706285373, "grad_norm": 1.1476006507873535, "learning_rate": 1.0975552884083473e-06, "loss": 0.792, "num_input_tokens_seen": 71323976, "step": 122840 }, { "epoch": 18.296842418826333, "grad_norm": 2.306333541870117, "learning_rate": 1.096603254473863e-06, "loss": 0.6664, "num_input_tokens_seen": 71326664, "step": 122845 }, { "epoch": 18.297587131367294, "grad_norm": 1.6836131811141968, "learning_rate": 1.0956516243629754e-06, "loss": 0.8411, "num_input_tokens_seen": 71329864, "step": 122850 }, { "epoch": 18.29833184390825, "grad_norm": 1.7153098583221436, "learning_rate": 1.094700398091758e-06, "loss": 0.4569, "num_input_tokens_seen": 71332968, "step": 122855 }, { "epoch": 18.29907655644921, "grad_norm": 1.3847296237945557, "learning_rate": 1.093749575676281e-06, "loss": 0.5289, "num_input_tokens_seen": 71336136, "step": 122860 }, { "epoch": 18.29982126899017, "grad_norm": 1.4274235963821411, "learning_rate": 1.092799157132604e-06, "loss": 0.4105, "num_input_tokens_seen": 71339080, "step": 122865 }, { "epoch": 18.30056598153113, "grad_norm": 1.3284107446670532, "learning_rate": 1.091849142476792e-06, "loss": 0.4823, "num_input_tokens_seen": 71341896, "step": 122870 }, { "epoch": 18.30131069407209, "grad_norm": 1.840956449508667, "learning_rate": 1.0908995317248898e-06, "loss": 0.6697, "num_input_tokens_seen": 71344648, "step": 122875 }, { "epoch": 18.302055406613047, "grad_norm": 2.3827109336853027, "learning_rate": 1.0899503248929355e-06, "loss": 0.555, "num_input_tokens_seen": 71347496, "step": 122880 }, { "epoch": 18.302800119154007, "grad_norm": 1.1809444427490234, "learning_rate": 1.0890015219969713e-06, "loss": 0.6393, "num_input_tokens_seen": 71350408, "step": 122885 }, { "epoch": 18.303544831694964, "grad_norm": 1.2639074325561523, "learning_rate": 1.0880531230530233e-06, "loss": 0.6293, "num_input_tokens_seen": 71353064, "step": 122890 }, { "epoch": 18.304289544235925, "grad_norm": 1.254992127418518, "learning_rate": 1.0871051280771178e-06, "loss": 0.5159, "num_input_tokens_seen": 71355848, "step": 122895 }, { "epoch": 18.305034256776885, "grad_norm": 2.555846929550171, "learning_rate": 1.0861575370852612e-06, "loss": 0.5191, "num_input_tokens_seen": 71358536, "step": 122900 }, { "epoch": 18.305778969317842, "grad_norm": 1.366119146347046, "learning_rate": 1.085210350093474e-06, "loss": 0.4559, "num_input_tokens_seen": 71361384, "step": 122905 }, { "epoch": 18.306523681858803, "grad_norm": 0.8834714293479919, "learning_rate": 1.084263567117752e-06, "loss": 0.5472, "num_input_tokens_seen": 71364520, "step": 122910 }, { "epoch": 18.307268394399763, "grad_norm": 1.4789259433746338, "learning_rate": 1.083317188174085e-06, "loss": 0.4927, "num_input_tokens_seen": 71367304, "step": 122915 }, { "epoch": 18.30801310694072, "grad_norm": 2.1753756999969482, "learning_rate": 1.0823712132784713e-06, "loss": 0.8614, "num_input_tokens_seen": 71370120, "step": 122920 }, { "epoch": 18.30875781948168, "grad_norm": 3.112229108810425, "learning_rate": 1.0814256424468872e-06, "loss": 0.5771, "num_input_tokens_seen": 71372648, "step": 122925 }, { "epoch": 18.309502532022638, "grad_norm": 1.638089895248413, "learning_rate": 1.0804804756953057e-06, "loss": 0.6027, "num_input_tokens_seen": 71375624, "step": 122930 }, { "epoch": 18.310247244563598, "grad_norm": 1.7870699167251587, "learning_rate": 1.079535713039695e-06, "loss": 0.6713, "num_input_tokens_seen": 71378472, "step": 122935 }, { "epoch": 18.31099195710456, "grad_norm": 2.197253942489624, "learning_rate": 1.078591354496017e-06, "loss": 0.5581, "num_input_tokens_seen": 71381352, "step": 122940 }, { "epoch": 18.311736669645516, "grad_norm": 1.7117434740066528, "learning_rate": 1.0776474000802255e-06, "loss": 0.6217, "num_input_tokens_seen": 71384392, "step": 122945 }, { "epoch": 18.312481382186476, "grad_norm": 1.177872896194458, "learning_rate": 1.0767038498082694e-06, "loss": 0.6633, "num_input_tokens_seen": 71387400, "step": 122950 }, { "epoch": 18.313226094727437, "grad_norm": 1.6930806636810303, "learning_rate": 1.0757607036960853e-06, "loss": 0.631, "num_input_tokens_seen": 71390600, "step": 122955 }, { "epoch": 18.313970807268394, "grad_norm": 1.676937222480774, "learning_rate": 1.0748179617596082e-06, "loss": 0.5959, "num_input_tokens_seen": 71393832, "step": 122960 }, { "epoch": 18.314715519809354, "grad_norm": 1.576703667640686, "learning_rate": 1.0738756240147668e-06, "loss": 0.6581, "num_input_tokens_seen": 71397032, "step": 122965 }, { "epoch": 18.31546023235031, "grad_norm": 0.8616365790367126, "learning_rate": 1.0729336904774762e-06, "loss": 0.5225, "num_input_tokens_seen": 71399848, "step": 122970 }, { "epoch": 18.316204944891272, "grad_norm": 1.069509506225586, "learning_rate": 1.071992161163654e-06, "loss": 0.5522, "num_input_tokens_seen": 71402920, "step": 122975 }, { "epoch": 18.316949657432232, "grad_norm": 1.672528624534607, "learning_rate": 1.0710510360892072e-06, "loss": 0.5044, "num_input_tokens_seen": 71405960, "step": 122980 }, { "epoch": 18.31769436997319, "grad_norm": 3.385434627532959, "learning_rate": 1.0701103152700343e-06, "loss": 0.6534, "num_input_tokens_seen": 71408584, "step": 122985 }, { "epoch": 18.31843908251415, "grad_norm": 3.6307830810546875, "learning_rate": 1.0691699987220194e-06, "loss": 0.5349, "num_input_tokens_seen": 71411208, "step": 122990 }, { "epoch": 18.31918379505511, "grad_norm": 2.923980712890625, "learning_rate": 1.068230086461061e-06, "loss": 0.6287, "num_input_tokens_seen": 71413800, "step": 122995 }, { "epoch": 18.319928507596067, "grad_norm": 1.0056220293045044, "learning_rate": 1.067290578503033e-06, "loss": 0.4081, "num_input_tokens_seen": 71416712, "step": 123000 }, { "epoch": 18.320673220137028, "grad_norm": 2.6456918716430664, "learning_rate": 1.0663514748637998e-06, "loss": 0.7248, "num_input_tokens_seen": 71419912, "step": 123005 }, { "epoch": 18.321417932677985, "grad_norm": 1.1484531164169312, "learning_rate": 1.0654127755592381e-06, "loss": 0.5195, "num_input_tokens_seen": 71422728, "step": 123010 }, { "epoch": 18.322162645218945, "grad_norm": 1.7341301441192627, "learning_rate": 1.0644744806051988e-06, "loss": 0.5978, "num_input_tokens_seen": 71425608, "step": 123015 }, { "epoch": 18.322907357759906, "grad_norm": 1.2834798097610474, "learning_rate": 1.0635365900175414e-06, "loss": 0.6214, "num_input_tokens_seen": 71428680, "step": 123020 }, { "epoch": 18.323652070300863, "grad_norm": 1.964294672012329, "learning_rate": 1.062599103812098e-06, "loss": 0.5945, "num_input_tokens_seen": 71431496, "step": 123025 }, { "epoch": 18.324396782841823, "grad_norm": 1.5088399648666382, "learning_rate": 1.0616620220047197e-06, "loss": 0.6467, "num_input_tokens_seen": 71434280, "step": 123030 }, { "epoch": 18.32514149538278, "grad_norm": 2.1350879669189453, "learning_rate": 1.0607253446112324e-06, "loss": 0.5689, "num_input_tokens_seen": 71437224, "step": 123035 }, { "epoch": 18.32588620792374, "grad_norm": 2.501561164855957, "learning_rate": 1.0597890716474545e-06, "loss": 0.6505, "num_input_tokens_seen": 71440040, "step": 123040 }, { "epoch": 18.3266309204647, "grad_norm": 2.3221628665924072, "learning_rate": 1.058853203129212e-06, "loss": 0.7518, "num_input_tokens_seen": 71442824, "step": 123045 }, { "epoch": 18.32737563300566, "grad_norm": 2.4824583530426025, "learning_rate": 1.0579177390723116e-06, "loss": 0.7134, "num_input_tokens_seen": 71445416, "step": 123050 }, { "epoch": 18.32812034554662, "grad_norm": 1.25301992893219, "learning_rate": 1.0569826794925602e-06, "loss": 0.541, "num_input_tokens_seen": 71448264, "step": 123055 }, { "epoch": 18.32886505808758, "grad_norm": 1.6580313444137573, "learning_rate": 1.0560480244057452e-06, "loss": 0.5812, "num_input_tokens_seen": 71451336, "step": 123060 }, { "epoch": 18.329609770628537, "grad_norm": 2.0657739639282227, "learning_rate": 1.0551137738276678e-06, "loss": 0.589, "num_input_tokens_seen": 71454184, "step": 123065 }, { "epoch": 18.330354483169497, "grad_norm": 1.687760353088379, "learning_rate": 1.0541799277741071e-06, "loss": 0.5098, "num_input_tokens_seen": 71457128, "step": 123070 }, { "epoch": 18.331099195710454, "grad_norm": 1.9663976430892944, "learning_rate": 1.0532464862608366e-06, "loss": 0.5549, "num_input_tokens_seen": 71460040, "step": 123075 }, { "epoch": 18.331843908251415, "grad_norm": 1.5380239486694336, "learning_rate": 1.0523134493036296e-06, "loss": 0.5264, "num_input_tokens_seen": 71463176, "step": 123080 }, { "epoch": 18.332588620792375, "grad_norm": 0.9899426698684692, "learning_rate": 1.051380816918243e-06, "loss": 0.5672, "num_input_tokens_seen": 71466120, "step": 123085 }, { "epoch": 18.333333333333332, "grad_norm": 2.5251574516296387, "learning_rate": 1.0504485891204452e-06, "loss": 0.6195, "num_input_tokens_seen": 71468904, "step": 123090 }, { "epoch": 18.334078045874293, "grad_norm": 1.4772918224334717, "learning_rate": 1.0495167659259703e-06, "loss": 0.6412, "num_input_tokens_seen": 71471624, "step": 123095 }, { "epoch": 18.334822758415253, "grad_norm": 2.1161482334136963, "learning_rate": 1.0485853473505724e-06, "loss": 0.7573, "num_input_tokens_seen": 71474408, "step": 123100 }, { "epoch": 18.33556747095621, "grad_norm": 1.5297654867172241, "learning_rate": 1.0476543334099781e-06, "loss": 0.7444, "num_input_tokens_seen": 71477480, "step": 123105 }, { "epoch": 18.33631218349717, "grad_norm": 5.883151531219482, "learning_rate": 1.0467237241199218e-06, "loss": 0.6035, "num_input_tokens_seen": 71480296, "step": 123110 }, { "epoch": 18.337056896038128, "grad_norm": 2.7963812351226807, "learning_rate": 1.0457935194961245e-06, "loss": 0.5635, "num_input_tokens_seen": 71483208, "step": 123115 }, { "epoch": 18.33780160857909, "grad_norm": 1.5301101207733154, "learning_rate": 1.044863719554298e-06, "loss": 0.5079, "num_input_tokens_seen": 71486280, "step": 123120 }, { "epoch": 18.33854632112005, "grad_norm": 1.1949820518493652, "learning_rate": 1.0439343243101558e-06, "loss": 0.4922, "num_input_tokens_seen": 71489224, "step": 123125 }, { "epoch": 18.339291033661006, "grad_norm": 1.450514554977417, "learning_rate": 1.04300533377939e-06, "loss": 0.6947, "num_input_tokens_seen": 71492264, "step": 123130 }, { "epoch": 18.340035746201966, "grad_norm": 1.0731477737426758, "learning_rate": 1.0420767479777022e-06, "loss": 0.4601, "num_input_tokens_seen": 71494920, "step": 123135 }, { "epoch": 18.340780458742927, "grad_norm": 1.02623450756073, "learning_rate": 1.0411485669207772e-06, "loss": 0.6155, "num_input_tokens_seen": 71497672, "step": 123140 }, { "epoch": 18.341525171283884, "grad_norm": 1.2785670757293701, "learning_rate": 1.0402207906242966e-06, "loss": 0.7318, "num_input_tokens_seen": 71500744, "step": 123145 }, { "epoch": 18.342269883824844, "grad_norm": 1.9047718048095703, "learning_rate": 1.0392934191039372e-06, "loss": 0.5838, "num_input_tokens_seen": 71503624, "step": 123150 }, { "epoch": 18.3430145963658, "grad_norm": 1.9897441864013672, "learning_rate": 1.0383664523753584e-06, "loss": 0.673, "num_input_tokens_seen": 71506472, "step": 123155 }, { "epoch": 18.343759308906762, "grad_norm": 1.2641355991363525, "learning_rate": 1.0374398904542283e-06, "loss": 0.6065, "num_input_tokens_seen": 71509576, "step": 123160 }, { "epoch": 18.344504021447722, "grad_norm": 1.7009096145629883, "learning_rate": 1.0365137333561925e-06, "loss": 0.618, "num_input_tokens_seen": 71512584, "step": 123165 }, { "epoch": 18.34524873398868, "grad_norm": 1.2314223051071167, "learning_rate": 1.0355879810969054e-06, "loss": 0.7018, "num_input_tokens_seen": 71515720, "step": 123170 }, { "epoch": 18.34599344652964, "grad_norm": 1.816238522529602, "learning_rate": 1.0346626336920019e-06, "loss": 0.563, "num_input_tokens_seen": 71518696, "step": 123175 }, { "epoch": 18.3467381590706, "grad_norm": 1.570320725440979, "learning_rate": 1.0337376911571161e-06, "loss": 0.8951, "num_input_tokens_seen": 71521640, "step": 123180 }, { "epoch": 18.347482871611557, "grad_norm": 0.9714375734329224, "learning_rate": 1.032813153507875e-06, "loss": 0.5801, "num_input_tokens_seen": 71524360, "step": 123185 }, { "epoch": 18.348227584152518, "grad_norm": 1.6952910423278809, "learning_rate": 1.0318890207598963e-06, "loss": 0.6947, "num_input_tokens_seen": 71527176, "step": 123190 }, { "epoch": 18.348972296693475, "grad_norm": 1.125491738319397, "learning_rate": 1.0309652929287926e-06, "loss": 0.6158, "num_input_tokens_seen": 71530440, "step": 123195 }, { "epoch": 18.349717009234435, "grad_norm": 1.3295048475265503, "learning_rate": 1.0300419700301684e-06, "loss": 0.4119, "num_input_tokens_seen": 71533448, "step": 123200 }, { "epoch": 18.350461721775396, "grad_norm": 1.8322049379348755, "learning_rate": 1.0291190520796246e-06, "loss": 0.4686, "num_input_tokens_seen": 71536168, "step": 123205 }, { "epoch": 18.351206434316353, "grad_norm": 1.6360507011413574, "learning_rate": 1.028196539092746e-06, "loss": 0.5819, "num_input_tokens_seen": 71538888, "step": 123210 }, { "epoch": 18.351951146857314, "grad_norm": 1.3618547916412354, "learning_rate": 1.027274431085129e-06, "loss": 0.5847, "num_input_tokens_seen": 71541736, "step": 123215 }, { "epoch": 18.35269585939827, "grad_norm": 1.3494545221328735, "learning_rate": 1.0263527280723411e-06, "loss": 0.6158, "num_input_tokens_seen": 71544648, "step": 123220 }, { "epoch": 18.35344057193923, "grad_norm": 1.44585382938385, "learning_rate": 1.025431430069962e-06, "loss": 0.5644, "num_input_tokens_seen": 71547368, "step": 123225 }, { "epoch": 18.35418528448019, "grad_norm": 1.2635912895202637, "learning_rate": 1.0245105370935536e-06, "loss": 0.6538, "num_input_tokens_seen": 71550408, "step": 123230 }, { "epoch": 18.35492999702115, "grad_norm": 0.9267440438270569, "learning_rate": 1.0235900491586652e-06, "loss": 0.6027, "num_input_tokens_seen": 71552968, "step": 123235 }, { "epoch": 18.35567470956211, "grad_norm": 2.2305679321289062, "learning_rate": 1.022669966280862e-06, "loss": 0.5576, "num_input_tokens_seen": 71555880, "step": 123240 }, { "epoch": 18.35641942210307, "grad_norm": 1.306233286857605, "learning_rate": 1.021750288475673e-06, "loss": 0.596, "num_input_tokens_seen": 71558824, "step": 123245 }, { "epoch": 18.357164134644027, "grad_norm": 2.8560869693756104, "learning_rate": 1.0208310157586497e-06, "loss": 0.7605, "num_input_tokens_seen": 71561448, "step": 123250 }, { "epoch": 18.357908847184987, "grad_norm": 1.0156148672103882, "learning_rate": 1.0199121481453106e-06, "loss": 0.5707, "num_input_tokens_seen": 71564040, "step": 123255 }, { "epoch": 18.358653559725944, "grad_norm": 2.2499144077301025, "learning_rate": 1.0189936856511873e-06, "loss": 0.7841, "num_input_tokens_seen": 71567208, "step": 123260 }, { "epoch": 18.359398272266905, "grad_norm": 1.2708244323730469, "learning_rate": 1.018075628291787e-06, "loss": 0.4946, "num_input_tokens_seen": 71569992, "step": 123265 }, { "epoch": 18.360142984807865, "grad_norm": 1.8081997632980347, "learning_rate": 1.0171579760826279e-06, "loss": 0.6421, "num_input_tokens_seen": 71572840, "step": 123270 }, { "epoch": 18.360887697348822, "grad_norm": 2.0682222843170166, "learning_rate": 1.0162407290392112e-06, "loss": 0.7471, "num_input_tokens_seen": 71575720, "step": 123275 }, { "epoch": 18.361632409889783, "grad_norm": 2.2083966732025146, "learning_rate": 1.0153238871770277e-06, "loss": 0.5633, "num_input_tokens_seen": 71578344, "step": 123280 }, { "epoch": 18.362377122430743, "grad_norm": 1.3297784328460693, "learning_rate": 1.014407450511573e-06, "loss": 0.52, "num_input_tokens_seen": 71581224, "step": 123285 }, { "epoch": 18.3631218349717, "grad_norm": 1.6730481386184692, "learning_rate": 1.013491419058324e-06, "loss": 0.814, "num_input_tokens_seen": 71584232, "step": 123290 }, { "epoch": 18.36386654751266, "grad_norm": 1.3732619285583496, "learning_rate": 1.0125757928327623e-06, "loss": 0.5335, "num_input_tokens_seen": 71587176, "step": 123295 }, { "epoch": 18.364611260053618, "grad_norm": 1.435957908630371, "learning_rate": 1.011660571850348e-06, "loss": 0.7008, "num_input_tokens_seen": 71590120, "step": 123300 }, { "epoch": 18.36535597259458, "grad_norm": 1.905828833580017, "learning_rate": 1.010745756126552e-06, "loss": 0.748, "num_input_tokens_seen": 71593256, "step": 123305 }, { "epoch": 18.36610068513554, "grad_norm": 1.2356549501419067, "learning_rate": 1.0098313456768233e-06, "loss": 0.6926, "num_input_tokens_seen": 71596200, "step": 123310 }, { "epoch": 18.366845397676496, "grad_norm": 1.6570804119110107, "learning_rate": 1.008917340516613e-06, "loss": 0.7016, "num_input_tokens_seen": 71598952, "step": 123315 }, { "epoch": 18.367590110217456, "grad_norm": 1.4047907590866089, "learning_rate": 1.008003740661359e-06, "loss": 0.6416, "num_input_tokens_seen": 71601928, "step": 123320 }, { "epoch": 18.368334822758417, "grad_norm": 1.8606747388839722, "learning_rate": 1.007090546126499e-06, "loss": 0.5499, "num_input_tokens_seen": 71605192, "step": 123325 }, { "epoch": 18.369079535299374, "grad_norm": 1.3426722288131714, "learning_rate": 1.0061777569274593e-06, "loss": 0.5395, "num_input_tokens_seen": 71608168, "step": 123330 }, { "epoch": 18.369824247840334, "grad_norm": 1.5388158559799194, "learning_rate": 1.0052653730796558e-06, "loss": 0.6146, "num_input_tokens_seen": 71611400, "step": 123335 }, { "epoch": 18.37056896038129, "grad_norm": 1.6160924434661865, "learning_rate": 1.004353394598509e-06, "loss": 0.5244, "num_input_tokens_seen": 71614568, "step": 123340 }, { "epoch": 18.371313672922252, "grad_norm": 0.4468226432800293, "learning_rate": 1.0034418214994235e-06, "loss": 0.5537, "num_input_tokens_seen": 71617352, "step": 123345 }, { "epoch": 18.372058385463212, "grad_norm": 1.0427045822143555, "learning_rate": 1.0025306537978007e-06, "loss": 0.478, "num_input_tokens_seen": 71620424, "step": 123350 }, { "epoch": 18.37280309800417, "grad_norm": 2.4827966690063477, "learning_rate": 1.001619891509034e-06, "loss": 0.6522, "num_input_tokens_seen": 71623336, "step": 123355 }, { "epoch": 18.37354781054513, "grad_norm": 1.5315980911254883, "learning_rate": 1.000709534648503e-06, "loss": 0.585, "num_input_tokens_seen": 71626312, "step": 123360 }, { "epoch": 18.37429252308609, "grad_norm": 2.880567789077759, "learning_rate": 9.997995832315977e-07, "loss": 0.6504, "num_input_tokens_seen": 71629192, "step": 123365 }, { "epoch": 18.375037235627047, "grad_norm": 3.012735605239868, "learning_rate": 9.988900372736808e-07, "loss": 0.4751, "num_input_tokens_seen": 71632200, "step": 123370 }, { "epoch": 18.375781948168008, "grad_norm": 0.963742196559906, "learning_rate": 9.979808967901267e-07, "loss": 0.5357, "num_input_tokens_seen": 71635080, "step": 123375 }, { "epoch": 18.376526660708965, "grad_norm": 1.371138572692871, "learning_rate": 9.97072161796292e-07, "loss": 0.5634, "num_input_tokens_seen": 71637896, "step": 123380 }, { "epoch": 18.377271373249926, "grad_norm": 2.3902504444122314, "learning_rate": 9.961638323075284e-07, "loss": 0.7212, "num_input_tokens_seen": 71640840, "step": 123385 }, { "epoch": 18.378016085790886, "grad_norm": 1.6433331966400146, "learning_rate": 9.952559083391765e-07, "loss": 0.5144, "num_input_tokens_seen": 71643720, "step": 123390 }, { "epoch": 18.378760798331843, "grad_norm": 1.356221079826355, "learning_rate": 9.943483899065798e-07, "loss": 0.5774, "num_input_tokens_seen": 71646632, "step": 123395 }, { "epoch": 18.379505510872804, "grad_norm": 2.978769540786743, "learning_rate": 9.9344127702507e-07, "loss": 0.8866, "num_input_tokens_seen": 71649800, "step": 123400 }, { "epoch": 18.38025022341376, "grad_norm": 2.7340941429138184, "learning_rate": 9.925345697099686e-07, "loss": 0.5319, "num_input_tokens_seen": 71652488, "step": 123405 }, { "epoch": 18.38099493595472, "grad_norm": 1.5229907035827637, "learning_rate": 9.916282679765965e-07, "loss": 0.4529, "num_input_tokens_seen": 71655336, "step": 123410 }, { "epoch": 18.38173964849568, "grad_norm": 1.8827247619628906, "learning_rate": 9.907223718402608e-07, "loss": 0.6488, "num_input_tokens_seen": 71658216, "step": 123415 }, { "epoch": 18.38248436103664, "grad_norm": 1.854346513748169, "learning_rate": 9.898168813162744e-07, "loss": 0.6621, "num_input_tokens_seen": 71661128, "step": 123420 }, { "epoch": 18.3832290735776, "grad_norm": 1.2247687578201294, "learning_rate": 9.889117964199252e-07, "loss": 0.5021, "num_input_tokens_seen": 71664232, "step": 123425 }, { "epoch": 18.38397378611856, "grad_norm": 1.4731417894363403, "learning_rate": 9.880071171665089e-07, "loss": 0.6088, "num_input_tokens_seen": 71667336, "step": 123430 }, { "epoch": 18.384718498659517, "grad_norm": 1.725016713142395, "learning_rate": 9.871028435713081e-07, "loss": 0.5357, "num_input_tokens_seen": 71670344, "step": 123435 }, { "epoch": 18.385463211200477, "grad_norm": 1.381127119064331, "learning_rate": 9.861989756495965e-07, "loss": 0.5245, "num_input_tokens_seen": 71673064, "step": 123440 }, { "epoch": 18.386207923741434, "grad_norm": 1.4820915460586548, "learning_rate": 9.852955134166481e-07, "loss": 0.4529, "num_input_tokens_seen": 71676296, "step": 123445 }, { "epoch": 18.386952636282395, "grad_norm": 1.1883821487426758, "learning_rate": 9.843924568877282e-07, "loss": 0.6507, "num_input_tokens_seen": 71679208, "step": 123450 }, { "epoch": 18.387697348823355, "grad_norm": 1.7779241800308228, "learning_rate": 9.834898060780861e-07, "loss": 0.6217, "num_input_tokens_seen": 71682344, "step": 123455 }, { "epoch": 18.388442061364312, "grad_norm": 1.3315434455871582, "learning_rate": 9.825875610029733e-07, "loss": 0.5872, "num_input_tokens_seen": 71685096, "step": 123460 }, { "epoch": 18.389186773905273, "grad_norm": 2.5185766220092773, "learning_rate": 9.81685721677636e-07, "loss": 0.7338, "num_input_tokens_seen": 71687560, "step": 123465 }, { "epoch": 18.389931486446233, "grad_norm": 1.6288845539093018, "learning_rate": 9.807842881173034e-07, "loss": 0.5311, "num_input_tokens_seen": 71690536, "step": 123470 }, { "epoch": 18.39067619898719, "grad_norm": 2.1728692054748535, "learning_rate": 9.79883260337211e-07, "loss": 0.7883, "num_input_tokens_seen": 71693096, "step": 123475 }, { "epoch": 18.39142091152815, "grad_norm": 1.4411259889602661, "learning_rate": 9.789826383525796e-07, "loss": 0.7157, "num_input_tokens_seen": 71696168, "step": 123480 }, { "epoch": 18.392165624069108, "grad_norm": 1.2073286771774292, "learning_rate": 9.780824221786195e-07, "loss": 0.4982, "num_input_tokens_seen": 71699016, "step": 123485 }, { "epoch": 18.39291033661007, "grad_norm": 1.5111663341522217, "learning_rate": 9.771826118305432e-07, "loss": 0.5995, "num_input_tokens_seen": 71702184, "step": 123490 }, { "epoch": 18.39365504915103, "grad_norm": 2.119741678237915, "learning_rate": 9.762832073235501e-07, "loss": 0.5636, "num_input_tokens_seen": 71705000, "step": 123495 }, { "epoch": 18.394399761691986, "grad_norm": 1.0639644861221313, "learning_rate": 9.75384208672836e-07, "loss": 0.5276, "num_input_tokens_seen": 71707848, "step": 123500 }, { "epoch": 18.395144474232946, "grad_norm": 1.5565868616104126, "learning_rate": 9.744856158935888e-07, "loss": 0.4767, "num_input_tokens_seen": 71710600, "step": 123505 }, { "epoch": 18.395889186773907, "grad_norm": 1.3699082136154175, "learning_rate": 9.735874290009884e-07, "loss": 0.6376, "num_input_tokens_seen": 71713672, "step": 123510 }, { "epoch": 18.396633899314864, "grad_norm": 2.081897258758545, "learning_rate": 9.72689648010211e-07, "loss": 0.6287, "num_input_tokens_seen": 71716520, "step": 123515 }, { "epoch": 18.397378611855824, "grad_norm": 1.0906915664672852, "learning_rate": 9.717922729364198e-07, "loss": 0.4397, "num_input_tokens_seen": 71719560, "step": 123520 }, { "epoch": 18.39812332439678, "grad_norm": 1.2588814496994019, "learning_rate": 9.708953037947804e-07, "loss": 0.4898, "num_input_tokens_seen": 71722408, "step": 123525 }, { "epoch": 18.398868036937742, "grad_norm": 3.743837833404541, "learning_rate": 9.699987406004364e-07, "loss": 0.7775, "num_input_tokens_seen": 71725096, "step": 123530 }, { "epoch": 18.399612749478703, "grad_norm": 3.751300811767578, "learning_rate": 9.691025833685446e-07, "loss": 0.6836, "num_input_tokens_seen": 71727912, "step": 123535 }, { "epoch": 18.40035746201966, "grad_norm": 1.6540943384170532, "learning_rate": 9.68206832114238e-07, "loss": 0.7455, "num_input_tokens_seen": 71730632, "step": 123540 }, { "epoch": 18.40110217456062, "grad_norm": 0.9680535793304443, "learning_rate": 9.673114868526568e-07, "loss": 0.4642, "num_input_tokens_seen": 71733288, "step": 123545 }, { "epoch": 18.401846887101577, "grad_norm": 0.8916699290275574, "learning_rate": 9.664165475989168e-07, "loss": 0.6684, "num_input_tokens_seen": 71736072, "step": 123550 }, { "epoch": 18.402591599642538, "grad_norm": 2.4020071029663086, "learning_rate": 9.655220143681476e-07, "loss": 0.7015, "num_input_tokens_seen": 71739240, "step": 123555 }, { "epoch": 18.403336312183498, "grad_norm": 2.025005578994751, "learning_rate": 9.646278871754539e-07, "loss": 0.5615, "num_input_tokens_seen": 71742024, "step": 123560 }, { "epoch": 18.404081024724455, "grad_norm": 1.4033201932907104, "learning_rate": 9.637341660359428e-07, "loss": 0.5409, "num_input_tokens_seen": 71744680, "step": 123565 }, { "epoch": 18.404825737265416, "grad_norm": 1.738638997077942, "learning_rate": 9.628408509647164e-07, "loss": 0.5796, "num_input_tokens_seen": 71747592, "step": 123570 }, { "epoch": 18.405570449806376, "grad_norm": 1.1995965242385864, "learning_rate": 9.619479419768596e-07, "loss": 0.4947, "num_input_tokens_seen": 71750664, "step": 123575 }, { "epoch": 18.406315162347333, "grad_norm": 0.6663806438446045, "learning_rate": 9.610554390874632e-07, "loss": 0.4943, "num_input_tokens_seen": 71753416, "step": 123580 }, { "epoch": 18.407059874888294, "grad_norm": 0.9832264184951782, "learning_rate": 9.60163342311604e-07, "loss": 0.6554, "num_input_tokens_seen": 71756648, "step": 123585 }, { "epoch": 18.40780458742925, "grad_norm": 1.6816178560256958, "learning_rate": 9.592716516643536e-07, "loss": 0.4939, "num_input_tokens_seen": 71759432, "step": 123590 }, { "epoch": 18.40854929997021, "grad_norm": 1.5415573120117188, "learning_rate": 9.583803671607743e-07, "loss": 0.6939, "num_input_tokens_seen": 71762088, "step": 123595 }, { "epoch": 18.40929401251117, "grad_norm": 1.8182666301727295, "learning_rate": 9.574894888159186e-07, "loss": 0.5269, "num_input_tokens_seen": 71765160, "step": 123600 }, { "epoch": 18.41003872505213, "grad_norm": 1.2498804330825806, "learning_rate": 9.565990166448463e-07, "loss": 0.7437, "num_input_tokens_seen": 71767912, "step": 123605 }, { "epoch": 18.41078343759309, "grad_norm": 1.2131177186965942, "learning_rate": 9.557089506625954e-07, "loss": 0.5938, "num_input_tokens_seen": 71771016, "step": 123610 }, { "epoch": 18.41152815013405, "grad_norm": 1.2356499433517456, "learning_rate": 9.54819290884207e-07, "loss": 0.6852, "num_input_tokens_seen": 71774184, "step": 123615 }, { "epoch": 18.412272862675007, "grad_norm": 1.3411496877670288, "learning_rate": 9.539300373247045e-07, "loss": 0.53, "num_input_tokens_seen": 71777256, "step": 123620 }, { "epoch": 18.413017575215967, "grad_norm": 1.6574084758758545, "learning_rate": 9.530411899991182e-07, "loss": 0.7468, "num_input_tokens_seen": 71780584, "step": 123625 }, { "epoch": 18.413762287756924, "grad_norm": 2.1906280517578125, "learning_rate": 9.521527489224552e-07, "loss": 0.5884, "num_input_tokens_seen": 71783560, "step": 123630 }, { "epoch": 18.414507000297885, "grad_norm": 1.4085721969604492, "learning_rate": 9.512647141097369e-07, "loss": 0.4399, "num_input_tokens_seen": 71786632, "step": 123635 }, { "epoch": 18.415251712838845, "grad_norm": 1.3584904670715332, "learning_rate": 9.503770855759569e-07, "loss": 0.5415, "num_input_tokens_seen": 71789736, "step": 123640 }, { "epoch": 18.415996425379802, "grad_norm": 0.6254015564918518, "learning_rate": 9.494898633361144e-07, "loss": 0.519, "num_input_tokens_seen": 71792584, "step": 123645 }, { "epoch": 18.416741137920763, "grad_norm": 0.7983991503715515, "learning_rate": 9.486030474051944e-07, "loss": 0.6003, "num_input_tokens_seen": 71795176, "step": 123650 }, { "epoch": 18.417485850461723, "grad_norm": 1.1592861413955688, "learning_rate": 9.477166377981822e-07, "loss": 0.5299, "num_input_tokens_seen": 71798344, "step": 123655 }, { "epoch": 18.41823056300268, "grad_norm": 0.859357476234436, "learning_rate": 9.468306345300548e-07, "loss": 0.4982, "num_input_tokens_seen": 71801064, "step": 123660 }, { "epoch": 18.41897527554364, "grad_norm": 3.0224320888519287, "learning_rate": 9.459450376157697e-07, "loss": 0.6636, "num_input_tokens_seen": 71804040, "step": 123665 }, { "epoch": 18.419719988084598, "grad_norm": 2.5006818771362305, "learning_rate": 9.450598470703037e-07, "loss": 0.528, "num_input_tokens_seen": 71806888, "step": 123670 }, { "epoch": 18.42046470062556, "grad_norm": 0.8973444700241089, "learning_rate": 9.441750629086004e-07, "loss": 0.5834, "num_input_tokens_seen": 71809800, "step": 123675 }, { "epoch": 18.42120941316652, "grad_norm": 1.257240653038025, "learning_rate": 9.432906851456064e-07, "loss": 0.5183, "num_input_tokens_seen": 71812808, "step": 123680 }, { "epoch": 18.421954125707476, "grad_norm": 1.5230330228805542, "learning_rate": 9.424067137962705e-07, "loss": 0.5197, "num_input_tokens_seen": 71815688, "step": 123685 }, { "epoch": 18.422698838248436, "grad_norm": 1.5306614637374878, "learning_rate": 9.4152314887552e-07, "loss": 0.5149, "num_input_tokens_seen": 71818440, "step": 123690 }, { "epoch": 18.423443550789397, "grad_norm": 1.9714405536651611, "learning_rate": 9.406399903982844e-07, "loss": 0.7298, "num_input_tokens_seen": 71821512, "step": 123695 }, { "epoch": 18.424188263330354, "grad_norm": 1.5620102882385254, "learning_rate": 9.397572383794823e-07, "loss": 0.8298, "num_input_tokens_seen": 71824520, "step": 123700 }, { "epoch": 18.424932975871315, "grad_norm": 1.1545621156692505, "learning_rate": 9.388748928340296e-07, "loss": 0.7446, "num_input_tokens_seen": 71827464, "step": 123705 }, { "epoch": 18.42567768841227, "grad_norm": 1.3768895864486694, "learning_rate": 9.379929537768339e-07, "loss": 0.6378, "num_input_tokens_seen": 71830184, "step": 123710 }, { "epoch": 18.426422400953232, "grad_norm": 1.6079198122024536, "learning_rate": 9.371114212227889e-07, "loss": 0.5539, "num_input_tokens_seen": 71832904, "step": 123715 }, { "epoch": 18.427167113494193, "grad_norm": 2.0133988857269287, "learning_rate": 9.362302951867907e-07, "loss": 0.5927, "num_input_tokens_seen": 71835912, "step": 123720 }, { "epoch": 18.42791182603515, "grad_norm": 1.793296217918396, "learning_rate": 9.353495756837222e-07, "loss": 0.7895, "num_input_tokens_seen": 71838792, "step": 123725 }, { "epoch": 18.42865653857611, "grad_norm": 1.0219146013259888, "learning_rate": 9.344692627284657e-07, "loss": 0.6918, "num_input_tokens_seen": 71841320, "step": 123730 }, { "epoch": 18.42940125111707, "grad_norm": 1.4015727043151855, "learning_rate": 9.335893563358899e-07, "loss": 0.5345, "num_input_tokens_seen": 71844200, "step": 123735 }, { "epoch": 18.430145963658028, "grad_norm": 1.4142918586730957, "learning_rate": 9.327098565208636e-07, "loss": 0.414, "num_input_tokens_seen": 71846984, "step": 123740 }, { "epoch": 18.430890676198988, "grad_norm": 1.9013988971710205, "learning_rate": 9.318307632982415e-07, "loss": 0.5959, "num_input_tokens_seen": 71849608, "step": 123745 }, { "epoch": 18.431635388739945, "grad_norm": 1.4752404689788818, "learning_rate": 9.309520766828811e-07, "loss": 0.611, "num_input_tokens_seen": 71852552, "step": 123750 }, { "epoch": 18.432380101280906, "grad_norm": 1.5445048809051514, "learning_rate": 9.300737966896206e-07, "loss": 0.5585, "num_input_tokens_seen": 71855880, "step": 123755 }, { "epoch": 18.433124813821866, "grad_norm": 1.1990301609039307, "learning_rate": 9.291959233332981e-07, "loss": 0.5106, "num_input_tokens_seen": 71858600, "step": 123760 }, { "epoch": 18.433869526362823, "grad_norm": 1.1906402111053467, "learning_rate": 9.28318456628749e-07, "loss": 0.5698, "num_input_tokens_seen": 71861608, "step": 123765 }, { "epoch": 18.434614238903784, "grad_norm": 1.2243537902832031, "learning_rate": 9.274413965907919e-07, "loss": 0.7108, "num_input_tokens_seen": 71864392, "step": 123770 }, { "epoch": 18.43535895144474, "grad_norm": 3.0051283836364746, "learning_rate": 9.265647432342455e-07, "loss": 0.6811, "num_input_tokens_seen": 71867208, "step": 123775 }, { "epoch": 18.4361036639857, "grad_norm": 2.0563437938690186, "learning_rate": 9.256884965739232e-07, "loss": 0.4338, "num_input_tokens_seen": 71870088, "step": 123780 }, { "epoch": 18.43684837652666, "grad_norm": 1.3016343116760254, "learning_rate": 9.248126566246267e-07, "loss": 0.693, "num_input_tokens_seen": 71872904, "step": 123785 }, { "epoch": 18.43759308906762, "grad_norm": 1.5296306610107422, "learning_rate": 9.239372234011473e-07, "loss": 0.6737, "num_input_tokens_seen": 71875656, "step": 123790 }, { "epoch": 18.43833780160858, "grad_norm": 1.3460158109664917, "learning_rate": 9.230621969182812e-07, "loss": 0.7258, "num_input_tokens_seen": 71878312, "step": 123795 }, { "epoch": 18.43908251414954, "grad_norm": 1.44941246509552, "learning_rate": 9.221875771908084e-07, "loss": 0.5549, "num_input_tokens_seen": 71881512, "step": 123800 }, { "epoch": 18.439827226690497, "grad_norm": 1.4474585056304932, "learning_rate": 9.213133642335031e-07, "loss": 0.5352, "num_input_tokens_seen": 71884424, "step": 123805 }, { "epoch": 18.440571939231457, "grad_norm": 1.8518329858779907, "learning_rate": 9.204395580611397e-07, "loss": 0.6407, "num_input_tokens_seen": 71887240, "step": 123810 }, { "epoch": 18.441316651772414, "grad_norm": 1.324591875076294, "learning_rate": 9.195661586884729e-07, "loss": 0.6312, "num_input_tokens_seen": 71890088, "step": 123815 }, { "epoch": 18.442061364313375, "grad_norm": 1.0552151203155518, "learning_rate": 9.186931661302634e-07, "loss": 0.6044, "num_input_tokens_seen": 71892968, "step": 123820 }, { "epoch": 18.442806076854335, "grad_norm": 2.236466407775879, "learning_rate": 9.178205804012546e-07, "loss": 0.5934, "num_input_tokens_seen": 71895752, "step": 123825 }, { "epoch": 18.443550789395292, "grad_norm": 2.328009605407715, "learning_rate": 9.16948401516196e-07, "loss": 0.7323, "num_input_tokens_seen": 71898888, "step": 123830 }, { "epoch": 18.444295501936253, "grad_norm": 1.6359220743179321, "learning_rate": 9.160766294898148e-07, "loss": 0.4398, "num_input_tokens_seen": 71901704, "step": 123835 }, { "epoch": 18.445040214477213, "grad_norm": 2.2898166179656982, "learning_rate": 9.152052643368408e-07, "loss": 0.4677, "num_input_tokens_seen": 71904520, "step": 123840 }, { "epoch": 18.44578492701817, "grad_norm": 1.6244434118270874, "learning_rate": 9.143343060719956e-07, "loss": 0.4307, "num_input_tokens_seen": 71907336, "step": 123845 }, { "epoch": 18.44652963955913, "grad_norm": 1.083196759223938, "learning_rate": 9.13463754709995e-07, "loss": 0.5921, "num_input_tokens_seen": 71910536, "step": 123850 }, { "epoch": 18.447274352100088, "grad_norm": 2.2729387283325195, "learning_rate": 9.125936102655414e-07, "loss": 0.487, "num_input_tokens_seen": 71913320, "step": 123855 }, { "epoch": 18.44801906464105, "grad_norm": 1.3062735795974731, "learning_rate": 9.117238727533367e-07, "loss": 0.5132, "num_input_tokens_seen": 71916552, "step": 123860 }, { "epoch": 18.44876377718201, "grad_norm": 1.4662647247314453, "learning_rate": 9.108545421880776e-07, "loss": 0.5973, "num_input_tokens_seen": 71919400, "step": 123865 }, { "epoch": 18.449508489722966, "grad_norm": 2.279726028442383, "learning_rate": 9.09985618584444e-07, "loss": 0.5647, "num_input_tokens_seen": 71922216, "step": 123870 }, { "epoch": 18.450253202263927, "grad_norm": 0.9198715686798096, "learning_rate": 9.091171019571215e-07, "loss": 0.529, "num_input_tokens_seen": 71925064, "step": 123875 }, { "epoch": 18.450997914804887, "grad_norm": 1.611026644706726, "learning_rate": 9.082489923207815e-07, "loss": 0.4731, "num_input_tokens_seen": 71928104, "step": 123880 }, { "epoch": 18.451742627345844, "grad_norm": 1.5063800811767578, "learning_rate": 9.073812896900874e-07, "loss": 0.7202, "num_input_tokens_seen": 71930920, "step": 123885 }, { "epoch": 18.452487339886805, "grad_norm": 3.131603479385376, "learning_rate": 9.065139940797024e-07, "loss": 0.7864, "num_input_tokens_seen": 71933768, "step": 123890 }, { "epoch": 18.45323205242776, "grad_norm": 1.4657615423202515, "learning_rate": 9.056471055042732e-07, "loss": 0.7472, "num_input_tokens_seen": 71936872, "step": 123895 }, { "epoch": 18.453976764968722, "grad_norm": 2.646648645401001, "learning_rate": 9.04780623978449e-07, "loss": 0.5076, "num_input_tokens_seen": 71939880, "step": 123900 }, { "epoch": 18.454721477509683, "grad_norm": 1.5807934999465942, "learning_rate": 9.039145495168655e-07, "loss": 0.5265, "num_input_tokens_seen": 71942664, "step": 123905 }, { "epoch": 18.45546619005064, "grad_norm": 1.9466304779052734, "learning_rate": 9.030488821341554e-07, "loss": 0.7643, "num_input_tokens_seen": 71945544, "step": 123910 }, { "epoch": 18.4562109025916, "grad_norm": 2.20778489112854, "learning_rate": 9.021836218449459e-07, "loss": 0.6942, "num_input_tokens_seen": 71948200, "step": 123915 }, { "epoch": 18.456955615132557, "grad_norm": 1.6125508546829224, "learning_rate": 9.01318768663853e-07, "loss": 0.6892, "num_input_tokens_seen": 71951240, "step": 123920 }, { "epoch": 18.457700327673518, "grad_norm": 1.1917096376419067, "learning_rate": 9.004543226054846e-07, "loss": 0.6817, "num_input_tokens_seen": 71953832, "step": 123925 }, { "epoch": 18.458445040214478, "grad_norm": 1.3620669841766357, "learning_rate": 8.995902836844455e-07, "loss": 0.5027, "num_input_tokens_seen": 71956456, "step": 123930 }, { "epoch": 18.459189752755435, "grad_norm": 1.0167670249938965, "learning_rate": 8.987266519153353e-07, "loss": 0.7318, "num_input_tokens_seen": 71959496, "step": 123935 }, { "epoch": 18.459934465296396, "grad_norm": 0.8271728754043579, "learning_rate": 8.978634273127424e-07, "loss": 0.4365, "num_input_tokens_seen": 71962472, "step": 123940 }, { "epoch": 18.460679177837356, "grad_norm": 1.4950053691864014, "learning_rate": 8.97000609891252e-07, "loss": 0.4683, "num_input_tokens_seen": 71965320, "step": 123945 }, { "epoch": 18.461423890378313, "grad_norm": 1.6106220483779907, "learning_rate": 8.961381996654361e-07, "loss": 0.5138, "num_input_tokens_seen": 71968040, "step": 123950 }, { "epoch": 18.462168602919274, "grad_norm": 1.0893588066101074, "learning_rate": 8.952761966498691e-07, "loss": 0.6122, "num_input_tokens_seen": 71971176, "step": 123955 }, { "epoch": 18.46291331546023, "grad_norm": 1.631401777267456, "learning_rate": 8.944146008591143e-07, "loss": 0.6015, "num_input_tokens_seen": 71973800, "step": 123960 }, { "epoch": 18.46365802800119, "grad_norm": 1.3592915534973145, "learning_rate": 8.93553412307721e-07, "loss": 0.4596, "num_input_tokens_seen": 71976616, "step": 123965 }, { "epoch": 18.464402740542152, "grad_norm": 1.6638485193252563, "learning_rate": 8.926926310102445e-07, "loss": 0.678, "num_input_tokens_seen": 71979656, "step": 123970 }, { "epoch": 18.46514745308311, "grad_norm": 1.3901482820510864, "learning_rate": 8.918322569812259e-07, "loss": 0.4977, "num_input_tokens_seen": 71982600, "step": 123975 }, { "epoch": 18.46589216562407, "grad_norm": 1.322270393371582, "learning_rate": 8.909722902351924e-07, "loss": 0.4894, "num_input_tokens_seen": 71985736, "step": 123980 }, { "epoch": 18.46663687816503, "grad_norm": 1.2450848817825317, "learning_rate": 8.901127307866852e-07, "loss": 0.6702, "num_input_tokens_seen": 71988712, "step": 123985 }, { "epoch": 18.467381590705987, "grad_norm": 3.1944286823272705, "learning_rate": 8.892535786502176e-07, "loss": 0.599, "num_input_tokens_seen": 71991720, "step": 123990 }, { "epoch": 18.468126303246947, "grad_norm": 1.3635990619659424, "learning_rate": 8.883948338403058e-07, "loss": 0.6908, "num_input_tokens_seen": 71994600, "step": 123995 }, { "epoch": 18.468871015787904, "grad_norm": 0.9082528948783875, "learning_rate": 8.87536496371455e-07, "loss": 0.4342, "num_input_tokens_seen": 71997480, "step": 124000 }, { "epoch": 18.469615728328865, "grad_norm": 0.8419618606567383, "learning_rate": 8.866785662581728e-07, "loss": 0.508, "num_input_tokens_seen": 72000520, "step": 124005 }, { "epoch": 18.470360440869825, "grad_norm": 3.152003049850464, "learning_rate": 8.858210435149422e-07, "loss": 0.6616, "num_input_tokens_seen": 72003208, "step": 124010 }, { "epoch": 18.471105153410782, "grad_norm": 0.9475085139274597, "learning_rate": 8.849639281562628e-07, "loss": 0.5907, "num_input_tokens_seen": 72006088, "step": 124015 }, { "epoch": 18.471849865951743, "grad_norm": 0.9081208109855652, "learning_rate": 8.841072201966033e-07, "loss": 0.4682, "num_input_tokens_seen": 72009128, "step": 124020 }, { "epoch": 18.472594578492703, "grad_norm": 0.9522401690483093, "learning_rate": 8.83250919650444e-07, "loss": 0.6124, "num_input_tokens_seen": 72011816, "step": 124025 }, { "epoch": 18.47333929103366, "grad_norm": 1.8779443502426147, "learning_rate": 8.823950265322484e-07, "loss": 0.5617, "num_input_tokens_seen": 72014536, "step": 124030 }, { "epoch": 18.47408400357462, "grad_norm": 2.1189143657684326, "learning_rate": 8.815395408564797e-07, "loss": 0.4524, "num_input_tokens_seen": 72017192, "step": 124035 }, { "epoch": 18.474828716115578, "grad_norm": 1.3258146047592163, "learning_rate": 8.806844626375848e-07, "loss": 0.486, "num_input_tokens_seen": 72019976, "step": 124040 }, { "epoch": 18.47557342865654, "grad_norm": 2.759166955947876, "learning_rate": 8.798297918900162e-07, "loss": 0.641, "num_input_tokens_seen": 72022600, "step": 124045 }, { "epoch": 18.4763181411975, "grad_norm": 2.062749147415161, "learning_rate": 8.789755286282065e-07, "loss": 0.692, "num_input_tokens_seen": 72025448, "step": 124050 }, { "epoch": 18.477062853738456, "grad_norm": 1.483554720878601, "learning_rate": 8.781216728665859e-07, "loss": 0.6174, "num_input_tokens_seen": 72028424, "step": 124055 }, { "epoch": 18.477807566279417, "grad_norm": 1.6346396207809448, "learning_rate": 8.772682246195873e-07, "loss": 0.4894, "num_input_tokens_seen": 72031400, "step": 124060 }, { "epoch": 18.478552278820374, "grad_norm": 1.546783447265625, "learning_rate": 8.764151839016216e-07, "loss": 0.5581, "num_input_tokens_seen": 72034216, "step": 124065 }, { "epoch": 18.479296991361334, "grad_norm": 1.4374396800994873, "learning_rate": 8.755625507271076e-07, "loss": 0.7346, "num_input_tokens_seen": 72037224, "step": 124070 }, { "epoch": 18.480041703902295, "grad_norm": 1.810483694076538, "learning_rate": 8.747103251104394e-07, "loss": 0.5655, "num_input_tokens_seen": 72040136, "step": 124075 }, { "epoch": 18.48078641644325, "grad_norm": 2.7564754486083984, "learning_rate": 8.738585070660249e-07, "loss": 0.593, "num_input_tokens_seen": 72042920, "step": 124080 }, { "epoch": 18.481531128984212, "grad_norm": 1.7214405536651611, "learning_rate": 8.730070966082499e-07, "loss": 0.7406, "num_input_tokens_seen": 72045864, "step": 124085 }, { "epoch": 18.482275841525173, "grad_norm": 1.982666015625, "learning_rate": 8.721560937514972e-07, "loss": 0.5069, "num_input_tokens_seen": 72048648, "step": 124090 }, { "epoch": 18.48302055406613, "grad_norm": 2.3482398986816406, "learning_rate": 8.71305498510147e-07, "loss": 0.7171, "num_input_tokens_seen": 72051624, "step": 124095 }, { "epoch": 18.48376526660709, "grad_norm": 1.670599341392517, "learning_rate": 8.70455310898563e-07, "loss": 0.6025, "num_input_tokens_seen": 72054472, "step": 124100 }, { "epoch": 18.484509979148047, "grad_norm": 1.1832406520843506, "learning_rate": 8.696055309311169e-07, "loss": 0.5198, "num_input_tokens_seen": 72057320, "step": 124105 }, { "epoch": 18.485254691689008, "grad_norm": 3.139275312423706, "learning_rate": 8.687561586221582e-07, "loss": 0.6173, "num_input_tokens_seen": 72060200, "step": 124110 }, { "epoch": 18.48599940422997, "grad_norm": 1.4500718116760254, "learning_rate": 8.679071939860394e-07, "loss": 0.5089, "num_input_tokens_seen": 72063048, "step": 124115 }, { "epoch": 18.486744116770925, "grad_norm": 3.701911211013794, "learning_rate": 8.67058637037102e-07, "loss": 0.5969, "num_input_tokens_seen": 72065960, "step": 124120 }, { "epoch": 18.487488829311886, "grad_norm": 2.1934688091278076, "learning_rate": 8.662104877896788e-07, "loss": 0.6125, "num_input_tokens_seen": 72068680, "step": 124125 }, { "epoch": 18.488233541852846, "grad_norm": 1.5041868686676025, "learning_rate": 8.653627462581027e-07, "loss": 0.579, "num_input_tokens_seen": 72071912, "step": 124130 }, { "epoch": 18.488978254393803, "grad_norm": 2.6506922245025635, "learning_rate": 8.645154124566929e-07, "loss": 0.6034, "num_input_tokens_seen": 72074696, "step": 124135 }, { "epoch": 18.489722966934764, "grad_norm": 1.7223877906799316, "learning_rate": 8.636684863997657e-07, "loss": 0.5999, "num_input_tokens_seen": 72077704, "step": 124140 }, { "epoch": 18.49046767947572, "grad_norm": 2.0818722248077393, "learning_rate": 8.628219681016264e-07, "loss": 0.7128, "num_input_tokens_seen": 72080648, "step": 124145 }, { "epoch": 18.49121239201668, "grad_norm": 1.7963436841964722, "learning_rate": 8.619758575765801e-07, "loss": 0.5923, "num_input_tokens_seen": 72083560, "step": 124150 }, { "epoch": 18.491957104557642, "grad_norm": 1.419598937034607, "learning_rate": 8.611301548389155e-07, "loss": 0.672, "num_input_tokens_seen": 72086472, "step": 124155 }, { "epoch": 18.4927018170986, "grad_norm": 2.6031227111816406, "learning_rate": 8.602848599029267e-07, "loss": 0.7319, "num_input_tokens_seen": 72089000, "step": 124160 }, { "epoch": 18.49344652963956, "grad_norm": 1.7841112613677979, "learning_rate": 8.59439972782894e-07, "loss": 0.6304, "num_input_tokens_seen": 72092072, "step": 124165 }, { "epoch": 18.49419124218052, "grad_norm": 2.122939348220825, "learning_rate": 8.585954934930806e-07, "loss": 0.6621, "num_input_tokens_seen": 72094696, "step": 124170 }, { "epoch": 18.494935954721477, "grad_norm": 0.9000409245491028, "learning_rate": 8.577514220477644e-07, "loss": 0.3886, "num_input_tokens_seen": 72097576, "step": 124175 }, { "epoch": 18.495680667262437, "grad_norm": 1.1989620923995972, "learning_rate": 8.569077584612006e-07, "loss": 0.6741, "num_input_tokens_seen": 72100616, "step": 124180 }, { "epoch": 18.496425379803394, "grad_norm": 2.7929954528808594, "learning_rate": 8.560645027476416e-07, "loss": 0.6039, "num_input_tokens_seen": 72103656, "step": 124185 }, { "epoch": 18.497170092344355, "grad_norm": 1.0978238582611084, "learning_rate": 8.552216549213316e-07, "loss": 0.5038, "num_input_tokens_seen": 72106856, "step": 124190 }, { "epoch": 18.497914804885315, "grad_norm": 0.7325338125228882, "learning_rate": 8.543792149965174e-07, "loss": 0.5755, "num_input_tokens_seen": 72109672, "step": 124195 }, { "epoch": 18.498659517426272, "grad_norm": 1.2664854526519775, "learning_rate": 8.535371829874239e-07, "loss": 0.4777, "num_input_tokens_seen": 72112392, "step": 124200 }, { "epoch": 18.499404229967233, "grad_norm": 2.095520257949829, "learning_rate": 8.52695558908273e-07, "loss": 0.6887, "num_input_tokens_seen": 72115112, "step": 124205 }, { "epoch": 18.500148942508194, "grad_norm": 0.8959783911705017, "learning_rate": 8.51854342773295e-07, "loss": 0.5731, "num_input_tokens_seen": 72118088, "step": 124210 }, { "epoch": 18.50089365504915, "grad_norm": 1.751715064048767, "learning_rate": 8.510135345966897e-07, "loss": 0.6519, "num_input_tokens_seen": 72121704, "step": 124215 }, { "epoch": 18.50163836759011, "grad_norm": 1.2709356546401978, "learning_rate": 8.501731343926706e-07, "loss": 0.5205, "num_input_tokens_seen": 72124744, "step": 124220 }, { "epoch": 18.502383080131068, "grad_norm": 1.3707928657531738, "learning_rate": 8.493331421754291e-07, "loss": 0.5797, "num_input_tokens_seen": 72127784, "step": 124225 }, { "epoch": 18.50312779267203, "grad_norm": 1.3140015602111816, "learning_rate": 8.484935579591596e-07, "loss": 0.6912, "num_input_tokens_seen": 72130568, "step": 124230 }, { "epoch": 18.50387250521299, "grad_norm": 1.991503357887268, "learning_rate": 8.476543817580451e-07, "loss": 0.4587, "num_input_tokens_seen": 72133448, "step": 124235 }, { "epoch": 18.504617217753946, "grad_norm": 1.8547638654708862, "learning_rate": 8.468156135862631e-07, "loss": 0.5826, "num_input_tokens_seen": 72136488, "step": 124240 }, { "epoch": 18.505361930294907, "grad_norm": 1.6756356954574585, "learning_rate": 8.45977253457983e-07, "loss": 0.5032, "num_input_tokens_seen": 72139208, "step": 124245 }, { "epoch": 18.506106642835867, "grad_norm": 1.9717962741851807, "learning_rate": 8.451393013873682e-07, "loss": 0.4676, "num_input_tokens_seen": 72141960, "step": 124250 }, { "epoch": 18.506851355376824, "grad_norm": 1.2118631601333618, "learning_rate": 8.443017573885769e-07, "loss": 0.5818, "num_input_tokens_seen": 72144648, "step": 124255 }, { "epoch": 18.507596067917785, "grad_norm": 0.851560115814209, "learning_rate": 8.434646214757536e-07, "loss": 0.4641, "num_input_tokens_seen": 72147816, "step": 124260 }, { "epoch": 18.50834078045874, "grad_norm": 1.8819786310195923, "learning_rate": 8.42627893663045e-07, "loss": 0.5333, "num_input_tokens_seen": 72150600, "step": 124265 }, { "epoch": 18.509085492999702, "grad_norm": 1.3090308904647827, "learning_rate": 8.417915739645815e-07, "loss": 0.5855, "num_input_tokens_seen": 72153352, "step": 124270 }, { "epoch": 18.509830205540663, "grad_norm": 1.0998646020889282, "learning_rate": 8.40955662394502e-07, "loss": 0.6198, "num_input_tokens_seen": 72156168, "step": 124275 }, { "epoch": 18.51057491808162, "grad_norm": 2.1976771354675293, "learning_rate": 8.401201589669227e-07, "loss": 0.5634, "num_input_tokens_seen": 72159016, "step": 124280 }, { "epoch": 18.51131963062258, "grad_norm": 1.9668889045715332, "learning_rate": 8.392850636959521e-07, "loss": 0.4047, "num_input_tokens_seen": 72161768, "step": 124285 }, { "epoch": 18.512064343163537, "grad_norm": 0.7276020646095276, "learning_rate": 8.384503765957091e-07, "loss": 0.4797, "num_input_tokens_seen": 72164680, "step": 124290 }, { "epoch": 18.512809055704498, "grad_norm": 1.2771683931350708, "learning_rate": 8.376160976802882e-07, "loss": 0.4965, "num_input_tokens_seen": 72167272, "step": 124295 }, { "epoch": 18.51355376824546, "grad_norm": 1.7729055881500244, "learning_rate": 8.367822269637892e-07, "loss": 0.378, "num_input_tokens_seen": 72170216, "step": 124300 }, { "epoch": 18.514298480786415, "grad_norm": 1.631921648979187, "learning_rate": 8.359487644602954e-07, "loss": 0.5606, "num_input_tokens_seen": 72173416, "step": 124305 }, { "epoch": 18.515043193327376, "grad_norm": 1.5991746187210083, "learning_rate": 8.351157101838842e-07, "loss": 0.6402, "num_input_tokens_seen": 72176264, "step": 124310 }, { "epoch": 18.515787905868336, "grad_norm": 1.057085633277893, "learning_rate": 8.34283064148636e-07, "loss": 0.5494, "num_input_tokens_seen": 72179272, "step": 124315 }, { "epoch": 18.516532618409293, "grad_norm": 1.3358185291290283, "learning_rate": 8.334508263686147e-07, "loss": 0.6119, "num_input_tokens_seen": 72182248, "step": 124320 }, { "epoch": 18.517277330950254, "grad_norm": 1.26786470413208, "learning_rate": 8.326189968578785e-07, "loss": 0.6156, "num_input_tokens_seen": 72185320, "step": 124325 }, { "epoch": 18.51802204349121, "grad_norm": 2.397512435913086, "learning_rate": 8.317875756304827e-07, "loss": 0.6501, "num_input_tokens_seen": 72188456, "step": 124330 }, { "epoch": 18.51876675603217, "grad_norm": 1.0228191614151, "learning_rate": 8.309565627004717e-07, "loss": 0.4489, "num_input_tokens_seen": 72190984, "step": 124335 }, { "epoch": 18.519511468573132, "grad_norm": 1.2814701795578003, "learning_rate": 8.301259580818843e-07, "loss": 0.5406, "num_input_tokens_seen": 72194024, "step": 124340 }, { "epoch": 18.52025618111409, "grad_norm": 1.1021509170532227, "learning_rate": 8.292957617887537e-07, "loss": 0.4326, "num_input_tokens_seen": 72196872, "step": 124345 }, { "epoch": 18.52100089365505, "grad_norm": 1.879249095916748, "learning_rate": 8.284659738351047e-07, "loss": 0.5672, "num_input_tokens_seen": 72199784, "step": 124350 }, { "epoch": 18.52174560619601, "grad_norm": 1.6871752738952637, "learning_rate": 8.276365942349595e-07, "loss": 0.6893, "num_input_tokens_seen": 72202536, "step": 124355 }, { "epoch": 18.522490318736967, "grad_norm": 1.5976207256317139, "learning_rate": 8.268076230023264e-07, "loss": 0.7468, "num_input_tokens_seen": 72205192, "step": 124360 }, { "epoch": 18.523235031277927, "grad_norm": 1.5555388927459717, "learning_rate": 8.259790601512052e-07, "loss": 0.511, "num_input_tokens_seen": 72208040, "step": 124365 }, { "epoch": 18.523979743818884, "grad_norm": 1.092803955078125, "learning_rate": 8.251509056956042e-07, "loss": 0.4084, "num_input_tokens_seen": 72210856, "step": 124370 }, { "epoch": 18.524724456359845, "grad_norm": 1.3138526678085327, "learning_rate": 8.243231596495066e-07, "loss": 0.7005, "num_input_tokens_seen": 72213768, "step": 124375 }, { "epoch": 18.525469168900806, "grad_norm": 0.8826163411140442, "learning_rate": 8.234958220268985e-07, "loss": 0.6582, "num_input_tokens_seen": 72216808, "step": 124380 }, { "epoch": 18.526213881441763, "grad_norm": 1.7747607231140137, "learning_rate": 8.22668892841752e-07, "loss": 0.4457, "num_input_tokens_seen": 72219368, "step": 124385 }, { "epoch": 18.526958593982723, "grad_norm": 3.26633620262146, "learning_rate": 8.218423721080476e-07, "loss": 0.5844, "num_input_tokens_seen": 72222024, "step": 124390 }, { "epoch": 18.527703306523684, "grad_norm": 1.9955217838287354, "learning_rate": 8.21016259839738e-07, "loss": 0.5504, "num_input_tokens_seen": 72224968, "step": 124395 }, { "epoch": 18.52844801906464, "grad_norm": 1.3428291082382202, "learning_rate": 8.201905560507872e-07, "loss": 0.693, "num_input_tokens_seen": 72228008, "step": 124400 }, { "epoch": 18.5291927316056, "grad_norm": 1.3262896537780762, "learning_rate": 8.193652607551422e-07, "loss": 0.6002, "num_input_tokens_seen": 72231112, "step": 124405 }, { "epoch": 18.529937444146558, "grad_norm": 1.3212840557098389, "learning_rate": 8.185403739667419e-07, "loss": 0.5304, "num_input_tokens_seen": 72234056, "step": 124410 }, { "epoch": 18.53068215668752, "grad_norm": 1.5909394025802612, "learning_rate": 8.177158956995279e-07, "loss": 0.5019, "num_input_tokens_seen": 72236968, "step": 124415 }, { "epoch": 18.53142686922848, "grad_norm": 2.2680437564849854, "learning_rate": 8.168918259674224e-07, "loss": 0.5414, "num_input_tokens_seen": 72239976, "step": 124420 }, { "epoch": 18.532171581769436, "grad_norm": 1.9758656024932861, "learning_rate": 8.16068164784356e-07, "loss": 0.5499, "num_input_tokens_seen": 72242760, "step": 124425 }, { "epoch": 18.532916294310397, "grad_norm": 1.50974702835083, "learning_rate": 8.152449121642342e-07, "loss": 0.5201, "num_input_tokens_seen": 72245608, "step": 124430 }, { "epoch": 18.533661006851354, "grad_norm": 2.819983959197998, "learning_rate": 8.144220681209708e-07, "loss": 0.623, "num_input_tokens_seen": 72248456, "step": 124435 }, { "epoch": 18.534405719392314, "grad_norm": 2.869779109954834, "learning_rate": 8.135996326684686e-07, "loss": 0.5337, "num_input_tokens_seen": 72251368, "step": 124440 }, { "epoch": 18.535150431933275, "grad_norm": 1.455849289894104, "learning_rate": 8.127776058206166e-07, "loss": 0.4551, "num_input_tokens_seen": 72254152, "step": 124445 }, { "epoch": 18.53589514447423, "grad_norm": 1.3206634521484375, "learning_rate": 8.119559875913036e-07, "loss": 0.3991, "num_input_tokens_seen": 72256840, "step": 124450 }, { "epoch": 18.536639857015192, "grad_norm": 2.6744437217712402, "learning_rate": 8.111347779944101e-07, "loss": 0.8473, "num_input_tokens_seen": 72259656, "step": 124455 }, { "epoch": 18.537384569556153, "grad_norm": 1.2139571905136108, "learning_rate": 8.103139770438112e-07, "loss": 0.5407, "num_input_tokens_seen": 72262600, "step": 124460 }, { "epoch": 18.53812928209711, "grad_norm": 1.1074774265289307, "learning_rate": 8.09493584753368e-07, "loss": 0.5437, "num_input_tokens_seen": 72265992, "step": 124465 }, { "epoch": 18.53887399463807, "grad_norm": 1.3049088716506958, "learning_rate": 8.0867360113695e-07, "loss": 0.7311, "num_input_tokens_seen": 72268776, "step": 124470 }, { "epoch": 18.539618707179027, "grad_norm": 1.1042064428329468, "learning_rate": 8.078540262084017e-07, "loss": 0.6772, "num_input_tokens_seen": 72271688, "step": 124475 }, { "epoch": 18.540363419719988, "grad_norm": 3.1469147205352783, "learning_rate": 8.07034859981573e-07, "loss": 0.5018, "num_input_tokens_seen": 72274600, "step": 124480 }, { "epoch": 18.54110813226095, "grad_norm": 1.9168697595596313, "learning_rate": 8.062161024703029e-07, "loss": 0.5377, "num_input_tokens_seen": 72277288, "step": 124485 }, { "epoch": 18.541852844801905, "grad_norm": 1.451035976409912, "learning_rate": 8.053977536884194e-07, "loss": 0.5932, "num_input_tokens_seen": 72280072, "step": 124490 }, { "epoch": 18.542597557342866, "grad_norm": 1.2082945108413696, "learning_rate": 8.045798136497529e-07, "loss": 0.5361, "num_input_tokens_seen": 72282920, "step": 124495 }, { "epoch": 18.543342269883826, "grad_norm": 1.0543124675750732, "learning_rate": 8.037622823681174e-07, "loss": 0.6518, "num_input_tokens_seen": 72285896, "step": 124500 }, { "epoch": 18.544086982424783, "grad_norm": 1.3892879486083984, "learning_rate": 8.029451598573267e-07, "loss": 0.6275, "num_input_tokens_seen": 72288488, "step": 124505 }, { "epoch": 18.544831694965744, "grad_norm": 0.6954704523086548, "learning_rate": 8.021284461311867e-07, "loss": 0.6056, "num_input_tokens_seen": 72291272, "step": 124510 }, { "epoch": 18.5455764075067, "grad_norm": 1.14508056640625, "learning_rate": 8.013121412034919e-07, "loss": 0.4981, "num_input_tokens_seen": 72293928, "step": 124515 }, { "epoch": 18.54632112004766, "grad_norm": 1.9083844423294067, "learning_rate": 8.004962450880338e-07, "loss": 0.793, "num_input_tokens_seen": 72296808, "step": 124520 }, { "epoch": 18.547065832588622, "grad_norm": 1.205430269241333, "learning_rate": 7.99680757798596e-07, "loss": 0.5205, "num_input_tokens_seen": 72299560, "step": 124525 }, { "epoch": 18.54781054512958, "grad_norm": 3.4182090759277344, "learning_rate": 7.988656793489563e-07, "loss": 0.6984, "num_input_tokens_seen": 72302440, "step": 124530 }, { "epoch": 18.54855525767054, "grad_norm": 1.5508354902267456, "learning_rate": 7.980510097528815e-07, "loss": 0.551, "num_input_tokens_seen": 72305256, "step": 124535 }, { "epoch": 18.5492999702115, "grad_norm": 1.2637112140655518, "learning_rate": 7.972367490241412e-07, "loss": 0.4944, "num_input_tokens_seen": 72307944, "step": 124540 }, { "epoch": 18.550044682752457, "grad_norm": 2.1956021785736084, "learning_rate": 7.964228971764826e-07, "loss": 0.5444, "num_input_tokens_seen": 72310760, "step": 124545 }, { "epoch": 18.550789395293418, "grad_norm": 3.475113868713379, "learning_rate": 7.956094542236642e-07, "loss": 0.6621, "num_input_tokens_seen": 72313704, "step": 124550 }, { "epoch": 18.551534107834375, "grad_norm": 1.1981699466705322, "learning_rate": 7.947964201794223e-07, "loss": 0.5984, "num_input_tokens_seen": 72316808, "step": 124555 }, { "epoch": 18.552278820375335, "grad_norm": 1.116449236869812, "learning_rate": 7.93983795057493e-07, "loss": 0.5659, "num_input_tokens_seen": 72319592, "step": 124560 }, { "epoch": 18.553023532916296, "grad_norm": 1.320115327835083, "learning_rate": 7.931715788716071e-07, "loss": 0.5986, "num_input_tokens_seen": 72322536, "step": 124565 }, { "epoch": 18.553768245457253, "grad_norm": 1.7467949390411377, "learning_rate": 7.923597716354841e-07, "loss": 0.4586, "num_input_tokens_seen": 72325448, "step": 124570 }, { "epoch": 18.554512957998213, "grad_norm": 1.2414891719818115, "learning_rate": 7.915483733628382e-07, "loss": 0.5723, "num_input_tokens_seen": 72328616, "step": 124575 }, { "epoch": 18.55525767053917, "grad_norm": 1.2928401231765747, "learning_rate": 7.907373840673804e-07, "loss": 0.6105, "num_input_tokens_seen": 72331528, "step": 124580 }, { "epoch": 18.55600238308013, "grad_norm": 1.7373499870300293, "learning_rate": 7.899268037628082e-07, "loss": 0.5817, "num_input_tokens_seen": 72334248, "step": 124585 }, { "epoch": 18.55674709562109, "grad_norm": 1.4946738481521606, "learning_rate": 7.891166324628163e-07, "loss": 0.7018, "num_input_tokens_seen": 72336936, "step": 124590 }, { "epoch": 18.557491808162048, "grad_norm": 2.4279699325561523, "learning_rate": 7.883068701810936e-07, "loss": 0.4451, "num_input_tokens_seen": 72339720, "step": 124595 }, { "epoch": 18.55823652070301, "grad_norm": 1.327417254447937, "learning_rate": 7.874975169313181e-07, "loss": 0.6248, "num_input_tokens_seen": 72342504, "step": 124600 }, { "epoch": 18.55898123324397, "grad_norm": 2.127501964569092, "learning_rate": 7.866885727271594e-07, "loss": 0.7383, "num_input_tokens_seen": 72345128, "step": 124605 }, { "epoch": 18.559725945784926, "grad_norm": 1.6022157669067383, "learning_rate": 7.858800375822928e-07, "loss": 0.5467, "num_input_tokens_seen": 72348392, "step": 124610 }, { "epoch": 18.560470658325887, "grad_norm": 1.0298242568969727, "learning_rate": 7.850719115103683e-07, "loss": 0.5191, "num_input_tokens_seen": 72351112, "step": 124615 }, { "epoch": 18.561215370866844, "grad_norm": 1.615875005722046, "learning_rate": 7.842641945250473e-07, "loss": 0.4062, "num_input_tokens_seen": 72353960, "step": 124620 }, { "epoch": 18.561960083407804, "grad_norm": 2.6355926990509033, "learning_rate": 7.834568866399688e-07, "loss": 0.7107, "num_input_tokens_seen": 72356680, "step": 124625 }, { "epoch": 18.562704795948765, "grad_norm": 2.243647813796997, "learning_rate": 7.826499878687749e-07, "loss": 0.7193, "num_input_tokens_seen": 72359400, "step": 124630 }, { "epoch": 18.56344950848972, "grad_norm": 1.1244080066680908, "learning_rate": 7.81843498225096e-07, "loss": 0.4969, "num_input_tokens_seen": 72362248, "step": 124635 }, { "epoch": 18.564194221030682, "grad_norm": 1.7949848175048828, "learning_rate": 7.810374177225549e-07, "loss": 0.6658, "num_input_tokens_seen": 72365000, "step": 124640 }, { "epoch": 18.564938933571643, "grad_norm": 2.3832526206970215, "learning_rate": 7.802317463747738e-07, "loss": 0.6005, "num_input_tokens_seen": 72367752, "step": 124645 }, { "epoch": 18.5656836461126, "grad_norm": 2.6558074951171875, "learning_rate": 7.794264841953613e-07, "loss": 0.5731, "num_input_tokens_seen": 72370824, "step": 124650 }, { "epoch": 18.56642835865356, "grad_norm": 1.4487112760543823, "learning_rate": 7.786216311979233e-07, "loss": 0.5655, "num_input_tokens_seen": 72373608, "step": 124655 }, { "epoch": 18.567173071194517, "grad_norm": 3.4524271488189697, "learning_rate": 7.778171873960516e-07, "loss": 0.5794, "num_input_tokens_seen": 72376552, "step": 124660 }, { "epoch": 18.567917783735478, "grad_norm": 1.2565770149230957, "learning_rate": 7.770131528033409e-07, "loss": 0.4786, "num_input_tokens_seen": 72379624, "step": 124665 }, { "epoch": 18.56866249627644, "grad_norm": 2.0162127017974854, "learning_rate": 7.762095274333747e-07, "loss": 0.4632, "num_input_tokens_seen": 72382472, "step": 124670 }, { "epoch": 18.569407208817395, "grad_norm": 0.6050774455070496, "learning_rate": 7.754063112997284e-07, "loss": 0.4868, "num_input_tokens_seen": 72385704, "step": 124675 }, { "epoch": 18.570151921358356, "grad_norm": 1.9522770643234253, "learning_rate": 7.746035044159688e-07, "loss": 0.6033, "num_input_tokens_seen": 72388200, "step": 124680 }, { "epoch": 18.570896633899316, "grad_norm": 3.0013577938079834, "learning_rate": 7.738011067956658e-07, "loss": 0.6616, "num_input_tokens_seen": 72390920, "step": 124685 }, { "epoch": 18.571641346440273, "grad_norm": 1.416077971458435, "learning_rate": 7.729991184523722e-07, "loss": 0.4563, "num_input_tokens_seen": 72393768, "step": 124690 }, { "epoch": 18.572386058981234, "grad_norm": 1.6129150390625, "learning_rate": 7.7219753939963e-07, "loss": 0.5858, "num_input_tokens_seen": 72396616, "step": 124695 }, { "epoch": 18.57313077152219, "grad_norm": 1.7846850156784058, "learning_rate": 7.713963696509896e-07, "loss": 0.5719, "num_input_tokens_seen": 72399784, "step": 124700 }, { "epoch": 18.57387548406315, "grad_norm": 3.102287769317627, "learning_rate": 7.705956092199818e-07, "loss": 0.6143, "num_input_tokens_seen": 72402920, "step": 124705 }, { "epoch": 18.574620196604112, "grad_norm": 1.4628371000289917, "learning_rate": 7.697952581201373e-07, "loss": 0.7287, "num_input_tokens_seen": 72405704, "step": 124710 }, { "epoch": 18.57536490914507, "grad_norm": 1.808858871459961, "learning_rate": 7.689953163649704e-07, "loss": 0.4901, "num_input_tokens_seen": 72408808, "step": 124715 }, { "epoch": 18.57610962168603, "grad_norm": 1.3515311479568481, "learning_rate": 7.681957839680065e-07, "loss": 0.6712, "num_input_tokens_seen": 72412168, "step": 124720 }, { "epoch": 18.57685433422699, "grad_norm": 2.0019521713256836, "learning_rate": 7.67396660942743e-07, "loss": 0.5663, "num_input_tokens_seen": 72414760, "step": 124725 }, { "epoch": 18.577599046767947, "grad_norm": 1.9915283918380737, "learning_rate": 7.66597947302683e-07, "loss": 0.6084, "num_input_tokens_seen": 72417640, "step": 124730 }, { "epoch": 18.578343759308908, "grad_norm": 1.6408140659332275, "learning_rate": 7.65799643061324e-07, "loss": 0.5285, "num_input_tokens_seen": 72420456, "step": 124735 }, { "epoch": 18.579088471849865, "grad_norm": 2.502591133117676, "learning_rate": 7.65001748232147e-07, "loss": 0.4983, "num_input_tokens_seen": 72423464, "step": 124740 }, { "epoch": 18.579833184390825, "grad_norm": 2.415086269378662, "learning_rate": 7.642042628286355e-07, "loss": 0.7997, "num_input_tokens_seen": 72426472, "step": 124745 }, { "epoch": 18.580577896931786, "grad_norm": 1.4249188899993896, "learning_rate": 7.634071868642595e-07, "loss": 0.7592, "num_input_tokens_seen": 72429320, "step": 124750 }, { "epoch": 18.581322609472743, "grad_norm": 1.3926998376846313, "learning_rate": 7.626105203524886e-07, "loss": 0.5133, "num_input_tokens_seen": 72432360, "step": 124755 }, { "epoch": 18.582067322013703, "grad_norm": 1.3570506572723389, "learning_rate": 7.61814263306776e-07, "loss": 0.6626, "num_input_tokens_seen": 72435112, "step": 124760 }, { "epoch": 18.582812034554664, "grad_norm": 1.1651445627212524, "learning_rate": 7.610184157405803e-07, "loss": 0.3701, "num_input_tokens_seen": 72438152, "step": 124765 }, { "epoch": 18.58355674709562, "grad_norm": 2.150142192840576, "learning_rate": 7.602229776673409e-07, "loss": 0.6655, "num_input_tokens_seen": 72440968, "step": 124770 }, { "epoch": 18.58430145963658, "grad_norm": 1.727100133895874, "learning_rate": 7.594279491004997e-07, "loss": 0.5942, "num_input_tokens_seen": 72443688, "step": 124775 }, { "epoch": 18.585046172177538, "grad_norm": 1.2837131023406982, "learning_rate": 7.586333300534876e-07, "loss": 0.4462, "num_input_tokens_seen": 72446600, "step": 124780 }, { "epoch": 18.5857908847185, "grad_norm": 1.5039056539535522, "learning_rate": 7.578391205397218e-07, "loss": 0.592, "num_input_tokens_seen": 72449960, "step": 124785 }, { "epoch": 18.58653559725946, "grad_norm": 1.3568061590194702, "learning_rate": 7.570453205726303e-07, "loss": 0.7401, "num_input_tokens_seen": 72452648, "step": 124790 }, { "epoch": 18.587280309800416, "grad_norm": 3.3958895206451416, "learning_rate": 7.562519301656162e-07, "loss": 0.7246, "num_input_tokens_seen": 72455624, "step": 124795 }, { "epoch": 18.588025022341377, "grad_norm": 3.402270555496216, "learning_rate": 7.554589493320885e-07, "loss": 0.6253, "num_input_tokens_seen": 72458312, "step": 124800 }, { "epoch": 18.588769734882334, "grad_norm": 1.522868037223816, "learning_rate": 7.54666378085439e-07, "loss": 0.6116, "num_input_tokens_seen": 72461000, "step": 124805 }, { "epoch": 18.589514447423294, "grad_norm": 1.1273945569992065, "learning_rate": 7.538742164390572e-07, "loss": 0.5, "num_input_tokens_seen": 72464040, "step": 124810 }, { "epoch": 18.590259159964255, "grad_norm": 2.00533127784729, "learning_rate": 7.530824644063295e-07, "loss": 0.5827, "num_input_tokens_seen": 72466888, "step": 124815 }, { "epoch": 18.591003872505212, "grad_norm": 1.6154190301895142, "learning_rate": 7.522911220006285e-07, "loss": 0.6087, "num_input_tokens_seen": 72470088, "step": 124820 }, { "epoch": 18.591748585046172, "grad_norm": 2.092580556869507, "learning_rate": 7.515001892353268e-07, "loss": 0.528, "num_input_tokens_seen": 72472840, "step": 124825 }, { "epoch": 18.592493297587133, "grad_norm": 1.4824680089950562, "learning_rate": 7.507096661237834e-07, "loss": 0.6816, "num_input_tokens_seen": 72475848, "step": 124830 }, { "epoch": 18.59323801012809, "grad_norm": 1.0759996175765991, "learning_rate": 7.499195526793567e-07, "loss": 0.6176, "num_input_tokens_seen": 72478728, "step": 124835 }, { "epoch": 18.59398272266905, "grad_norm": 2.628880023956299, "learning_rate": 7.491298489153919e-07, "loss": 0.7855, "num_input_tokens_seen": 72481800, "step": 124840 }, { "epoch": 18.594727435210007, "grad_norm": 1.4941105842590332, "learning_rate": 7.483405548452283e-07, "loss": 0.7462, "num_input_tokens_seen": 72484552, "step": 124845 }, { "epoch": 18.595472147750968, "grad_norm": 1.5949032306671143, "learning_rate": 7.47551670482205e-07, "loss": 0.5721, "num_input_tokens_seen": 72487176, "step": 124850 }, { "epoch": 18.59621686029193, "grad_norm": 1.4258906841278076, "learning_rate": 7.467631958396448e-07, "loss": 0.6225, "num_input_tokens_seen": 72490120, "step": 124855 }, { "epoch": 18.596961572832885, "grad_norm": 1.9210083484649658, "learning_rate": 7.459751309308733e-07, "loss": 0.4751, "num_input_tokens_seen": 72493064, "step": 124860 }, { "epoch": 18.597706285373846, "grad_norm": 3.021813154220581, "learning_rate": 7.451874757691991e-07, "loss": 0.696, "num_input_tokens_seen": 72496264, "step": 124865 }, { "epoch": 18.598450997914806, "grad_norm": 0.9954257607460022, "learning_rate": 7.444002303679309e-07, "loss": 0.4907, "num_input_tokens_seen": 72499336, "step": 124870 }, { "epoch": 18.599195710455763, "grad_norm": 1.0038046836853027, "learning_rate": 7.436133947403695e-07, "loss": 0.6549, "num_input_tokens_seen": 72502440, "step": 124875 }, { "epoch": 18.599940422996724, "grad_norm": 1.4504717588424683, "learning_rate": 7.428269688998068e-07, "loss": 0.6658, "num_input_tokens_seen": 72505192, "step": 124880 }, { "epoch": 18.60068513553768, "grad_norm": 2.0000059604644775, "learning_rate": 7.420409528595296e-07, "loss": 0.806, "num_input_tokens_seen": 72508264, "step": 124885 }, { "epoch": 18.60142984807864, "grad_norm": 1.3604588508605957, "learning_rate": 7.412553466328131e-07, "loss": 0.6841, "num_input_tokens_seen": 72511176, "step": 124890 }, { "epoch": 18.602174560619602, "grad_norm": 1.2430487871170044, "learning_rate": 7.404701502329331e-07, "loss": 0.5969, "num_input_tokens_seen": 72514248, "step": 124895 }, { "epoch": 18.60291927316056, "grad_norm": 2.1827428340911865, "learning_rate": 7.396853636731537e-07, "loss": 0.6431, "num_input_tokens_seen": 72517192, "step": 124900 }, { "epoch": 18.60366398570152, "grad_norm": 1.6497992277145386, "learning_rate": 7.389009869667341e-07, "loss": 0.7037, "num_input_tokens_seen": 72520008, "step": 124905 }, { "epoch": 18.60440869824248, "grad_norm": 2.2228593826293945, "learning_rate": 7.381170201269244e-07, "loss": 0.6975, "num_input_tokens_seen": 72522760, "step": 124910 }, { "epoch": 18.605153410783437, "grad_norm": 1.7794370651245117, "learning_rate": 7.373334631669698e-07, "loss": 0.5836, "num_input_tokens_seen": 72525672, "step": 124915 }, { "epoch": 18.605898123324398, "grad_norm": 1.4879390001296997, "learning_rate": 7.365503161001013e-07, "loss": 0.504, "num_input_tokens_seen": 72528712, "step": 124920 }, { "epoch": 18.606642835865355, "grad_norm": 1.4521085023880005, "learning_rate": 7.357675789395613e-07, "loss": 0.6563, "num_input_tokens_seen": 72531432, "step": 124925 }, { "epoch": 18.607387548406315, "grad_norm": 2.7496087551116943, "learning_rate": 7.349852516985639e-07, "loss": 0.6581, "num_input_tokens_seen": 72534568, "step": 124930 }, { "epoch": 18.608132260947276, "grad_norm": 1.2820003032684326, "learning_rate": 7.342033343903293e-07, "loss": 0.5626, "num_input_tokens_seen": 72538024, "step": 124935 }, { "epoch": 18.608876973488233, "grad_norm": 1.827376365661621, "learning_rate": 7.33421827028069e-07, "loss": 0.6351, "num_input_tokens_seen": 72540808, "step": 124940 }, { "epoch": 18.609621686029193, "grad_norm": 0.6973927021026611, "learning_rate": 7.326407296249782e-07, "loss": 0.5171, "num_input_tokens_seen": 72543656, "step": 124945 }, { "epoch": 18.61036639857015, "grad_norm": 1.0405503511428833, "learning_rate": 7.318600421942628e-07, "loss": 0.4581, "num_input_tokens_seen": 72546312, "step": 124950 }, { "epoch": 18.61111111111111, "grad_norm": 1.6811230182647705, "learning_rate": 7.310797647491041e-07, "loss": 0.6667, "num_input_tokens_seen": 72549032, "step": 124955 }, { "epoch": 18.61185582365207, "grad_norm": 1.1025772094726562, "learning_rate": 7.302998973026887e-07, "loss": 0.5034, "num_input_tokens_seen": 72551848, "step": 124960 }, { "epoch": 18.61260053619303, "grad_norm": 1.3768138885498047, "learning_rate": 7.295204398681893e-07, "loss": 0.4945, "num_input_tokens_seen": 72554696, "step": 124965 }, { "epoch": 18.61334524873399, "grad_norm": 2.0713319778442383, "learning_rate": 7.287413924587733e-07, "loss": 0.6841, "num_input_tokens_seen": 72557832, "step": 124970 }, { "epoch": 18.61408996127495, "grad_norm": 1.2768210172653198, "learning_rate": 7.279627550876051e-07, "loss": 0.5675, "num_input_tokens_seen": 72561064, "step": 124975 }, { "epoch": 18.614834673815906, "grad_norm": 1.87924063205719, "learning_rate": 7.27184527767838e-07, "loss": 0.6495, "num_input_tokens_seen": 72563816, "step": 124980 }, { "epoch": 18.615579386356867, "grad_norm": 2.800236225128174, "learning_rate": 7.264067105126199e-07, "loss": 0.5933, "num_input_tokens_seen": 72567144, "step": 124985 }, { "epoch": 18.616324098897824, "grad_norm": 1.5224933624267578, "learning_rate": 7.256293033350847e-07, "loss": 0.5347, "num_input_tokens_seen": 72570024, "step": 124990 }, { "epoch": 18.617068811438784, "grad_norm": 1.8529084920883179, "learning_rate": 7.248523062483748e-07, "loss": 0.5389, "num_input_tokens_seen": 72572680, "step": 124995 }, { "epoch": 18.617813523979745, "grad_norm": 1.4849995374679565, "learning_rate": 7.2407571926561e-07, "loss": 0.6702, "num_input_tokens_seen": 72575432, "step": 125000 }, { "epoch": 18.618558236520702, "grad_norm": 1.8976861238479614, "learning_rate": 7.232995423999162e-07, "loss": 0.5868, "num_input_tokens_seen": 72578376, "step": 125005 }, { "epoch": 18.619302949061662, "grad_norm": 2.6479005813598633, "learning_rate": 7.225237756644021e-07, "loss": 0.5393, "num_input_tokens_seen": 72581416, "step": 125010 }, { "epoch": 18.620047661602623, "grad_norm": 2.5480775833129883, "learning_rate": 7.217484190721712e-07, "loss": 0.6918, "num_input_tokens_seen": 72584424, "step": 125015 }, { "epoch": 18.62079237414358, "grad_norm": 1.713582992553711, "learning_rate": 7.209734726363299e-07, "loss": 0.5901, "num_input_tokens_seen": 72587432, "step": 125020 }, { "epoch": 18.62153708668454, "grad_norm": 1.4231150150299072, "learning_rate": 7.201989363699618e-07, "loss": 0.5851, "num_input_tokens_seen": 72590184, "step": 125025 }, { "epoch": 18.622281799225497, "grad_norm": 1.8755725622177124, "learning_rate": 7.194248102861594e-07, "loss": 0.6297, "num_input_tokens_seen": 72593096, "step": 125030 }, { "epoch": 18.623026511766458, "grad_norm": 2.173520565032959, "learning_rate": 7.186510943979957e-07, "loss": 0.7121, "num_input_tokens_seen": 72595912, "step": 125035 }, { "epoch": 18.62377122430742, "grad_norm": 1.2457072734832764, "learning_rate": 7.178777887185434e-07, "loss": 0.6834, "num_input_tokens_seen": 72598760, "step": 125040 }, { "epoch": 18.624515936848375, "grad_norm": 1.3827916383743286, "learning_rate": 7.17104893260867e-07, "loss": 0.5469, "num_input_tokens_seen": 72601448, "step": 125045 }, { "epoch": 18.625260649389336, "grad_norm": 1.454246163368225, "learning_rate": 7.16332408038023e-07, "loss": 0.5865, "num_input_tokens_seen": 72604200, "step": 125050 }, { "epoch": 18.626005361930297, "grad_norm": 1.1024072170257568, "learning_rate": 7.155603330630617e-07, "loss": 0.7016, "num_input_tokens_seen": 72608168, "step": 125055 }, { "epoch": 18.626750074471254, "grad_norm": 2.213935613632202, "learning_rate": 7.147886683490256e-07, "loss": 0.6288, "num_input_tokens_seen": 72610984, "step": 125060 }, { "epoch": 18.627494787012214, "grad_norm": 2.3818626403808594, "learning_rate": 7.140174139089545e-07, "loss": 0.6189, "num_input_tokens_seen": 72613832, "step": 125065 }, { "epoch": 18.62823949955317, "grad_norm": 1.1267735958099365, "learning_rate": 7.132465697558737e-07, "loss": 0.6645, "num_input_tokens_seen": 72617000, "step": 125070 }, { "epoch": 18.62898421209413, "grad_norm": 1.7116789817810059, "learning_rate": 7.124761359028121e-07, "loss": 0.5676, "num_input_tokens_seen": 72619560, "step": 125075 }, { "epoch": 18.629728924635092, "grad_norm": 2.124957323074341, "learning_rate": 7.117061123627783e-07, "loss": 0.5415, "num_input_tokens_seen": 72622536, "step": 125080 }, { "epoch": 18.63047363717605, "grad_norm": 1.2114378213882446, "learning_rate": 7.109364991487872e-07, "loss": 0.5502, "num_input_tokens_seen": 72625416, "step": 125085 }, { "epoch": 18.63121834971701, "grad_norm": 1.4847272634506226, "learning_rate": 7.101672962738365e-07, "loss": 0.5439, "num_input_tokens_seen": 72628168, "step": 125090 }, { "epoch": 18.631963062257967, "grad_norm": 1.8898684978485107, "learning_rate": 7.093985037509188e-07, "loss": 0.5977, "num_input_tokens_seen": 72631176, "step": 125095 }, { "epoch": 18.632707774798927, "grad_norm": 1.494181752204895, "learning_rate": 7.086301215930291e-07, "loss": 0.4979, "num_input_tokens_seen": 72634024, "step": 125100 }, { "epoch": 18.633452487339888, "grad_norm": 1.1472446918487549, "learning_rate": 7.078621498131461e-07, "loss": 0.6254, "num_input_tokens_seen": 72636904, "step": 125105 }, { "epoch": 18.634197199880845, "grad_norm": 1.5899162292480469, "learning_rate": 7.070945884242397e-07, "loss": 0.6543, "num_input_tokens_seen": 72639976, "step": 125110 }, { "epoch": 18.634941912421805, "grad_norm": 1.0138431787490845, "learning_rate": 7.063274374392803e-07, "loss": 0.6084, "num_input_tokens_seen": 72642888, "step": 125115 }, { "epoch": 18.635686624962766, "grad_norm": 1.169480800628662, "learning_rate": 7.055606968712297e-07, "loss": 0.4898, "num_input_tokens_seen": 72645800, "step": 125120 }, { "epoch": 18.636431337503723, "grad_norm": 2.083831310272217, "learning_rate": 7.047943667330386e-07, "loss": 0.75, "num_input_tokens_seen": 72648584, "step": 125125 }, { "epoch": 18.637176050044683, "grad_norm": 1.5540776252746582, "learning_rate": 7.040284470376523e-07, "loss": 0.6765, "num_input_tokens_seen": 72651496, "step": 125130 }, { "epoch": 18.63792076258564, "grad_norm": 1.9219692945480347, "learning_rate": 7.032629377980133e-07, "loss": 0.6672, "num_input_tokens_seen": 72654472, "step": 125135 }, { "epoch": 18.6386654751266, "grad_norm": 1.460829496383667, "learning_rate": 7.024978390270526e-07, "loss": 0.7433, "num_input_tokens_seen": 72657416, "step": 125140 }, { "epoch": 18.63941018766756, "grad_norm": 0.9808641672134399, "learning_rate": 7.017331507376962e-07, "loss": 0.5095, "num_input_tokens_seen": 72660424, "step": 125145 }, { "epoch": 18.64015490020852, "grad_norm": 1.3058483600616455, "learning_rate": 7.009688729428615e-07, "loss": 0.5868, "num_input_tokens_seen": 72663080, "step": 125150 }, { "epoch": 18.64089961274948, "grad_norm": 2.217992067337036, "learning_rate": 7.002050056554632e-07, "loss": 0.842, "num_input_tokens_seen": 72666216, "step": 125155 }, { "epoch": 18.64164432529044, "grad_norm": 2.0053744316101074, "learning_rate": 6.994415488884021e-07, "loss": 0.6043, "num_input_tokens_seen": 72669032, "step": 125160 }, { "epoch": 18.642389037831396, "grad_norm": 2.0118408203125, "learning_rate": 6.986785026545789e-07, "loss": 0.4738, "num_input_tokens_seen": 72671592, "step": 125165 }, { "epoch": 18.643133750372357, "grad_norm": 1.2494746446609497, "learning_rate": 6.979158669668862e-07, "loss": 0.5841, "num_input_tokens_seen": 72674472, "step": 125170 }, { "epoch": 18.643878462913314, "grad_norm": 1.4996660947799683, "learning_rate": 6.971536418382052e-07, "loss": 0.6904, "num_input_tokens_seen": 72677448, "step": 125175 }, { "epoch": 18.644623175454274, "grad_norm": 2.097607135772705, "learning_rate": 6.963918272814119e-07, "loss": 0.5399, "num_input_tokens_seen": 72680136, "step": 125180 }, { "epoch": 18.645367887995235, "grad_norm": 2.2236993312835693, "learning_rate": 6.956304233093736e-07, "loss": 0.4607, "num_input_tokens_seen": 72683272, "step": 125185 }, { "epoch": 18.646112600536192, "grad_norm": 1.6899181604385376, "learning_rate": 6.948694299349634e-07, "loss": 0.6643, "num_input_tokens_seen": 72686440, "step": 125190 }, { "epoch": 18.646857313077152, "grad_norm": 1.8822752237319946, "learning_rate": 6.941088471710266e-07, "loss": 0.4681, "num_input_tokens_seen": 72689000, "step": 125195 }, { "epoch": 18.647602025618113, "grad_norm": 1.5207170248031616, "learning_rate": 6.933486750304197e-07, "loss": 0.6208, "num_input_tokens_seen": 72691816, "step": 125200 }, { "epoch": 18.64834673815907, "grad_norm": 1.72395658493042, "learning_rate": 6.92588913525985e-07, "loss": 0.5205, "num_input_tokens_seen": 72694888, "step": 125205 }, { "epoch": 18.64909145070003, "grad_norm": 1.5229761600494385, "learning_rate": 6.918295626705512e-07, "loss": 0.4778, "num_input_tokens_seen": 72697896, "step": 125210 }, { "epoch": 18.649836163240987, "grad_norm": 2.2850987911224365, "learning_rate": 6.910706224769553e-07, "loss": 0.4569, "num_input_tokens_seen": 72700872, "step": 125215 }, { "epoch": 18.650580875781948, "grad_norm": 1.9318525791168213, "learning_rate": 6.903120929580092e-07, "loss": 0.5649, "num_input_tokens_seen": 72703912, "step": 125220 }, { "epoch": 18.65132558832291, "grad_norm": 1.3673129081726074, "learning_rate": 6.895539741265389e-07, "loss": 0.4604, "num_input_tokens_seen": 72706824, "step": 125225 }, { "epoch": 18.652070300863866, "grad_norm": 1.890376329421997, "learning_rate": 6.887962659953423e-07, "loss": 0.5744, "num_input_tokens_seen": 72709576, "step": 125230 }, { "epoch": 18.652815013404826, "grad_norm": 0.7899904251098633, "learning_rate": 6.88038968577226e-07, "loss": 0.5163, "num_input_tokens_seen": 72712456, "step": 125235 }, { "epoch": 18.653559725945787, "grad_norm": 1.3383585214614868, "learning_rate": 6.872820818849823e-07, "loss": 0.6637, "num_input_tokens_seen": 72715304, "step": 125240 }, { "epoch": 18.654304438486744, "grad_norm": 2.501786470413208, "learning_rate": 6.865256059313985e-07, "loss": 0.774, "num_input_tokens_seen": 72718120, "step": 125245 }, { "epoch": 18.655049151027704, "grad_norm": 3.4796831607818604, "learning_rate": 6.857695407292503e-07, "loss": 0.5661, "num_input_tokens_seen": 72720872, "step": 125250 }, { "epoch": 18.65579386356866, "grad_norm": 3.9434549808502197, "learning_rate": 6.850138862913136e-07, "loss": 0.5704, "num_input_tokens_seen": 72723656, "step": 125255 }, { "epoch": 18.65653857610962, "grad_norm": 2.3333752155303955, "learning_rate": 6.842586426303588e-07, "loss": 0.5538, "num_input_tokens_seen": 72726504, "step": 125260 }, { "epoch": 18.657283288650582, "grad_norm": 1.6450600624084473, "learning_rate": 6.835038097591367e-07, "loss": 0.7589, "num_input_tokens_seen": 72729832, "step": 125265 }, { "epoch": 18.65802800119154, "grad_norm": 1.4422435760498047, "learning_rate": 6.82749387690404e-07, "loss": 0.6144, "num_input_tokens_seen": 72732584, "step": 125270 }, { "epoch": 18.6587727137325, "grad_norm": 1.2822171449661255, "learning_rate": 6.819953764369058e-07, "loss": 0.591, "num_input_tokens_seen": 72735208, "step": 125275 }, { "epoch": 18.65951742627346, "grad_norm": 1.0068498849868774, "learning_rate": 6.812417760113821e-07, "loss": 0.5809, "num_input_tokens_seen": 72738152, "step": 125280 }, { "epoch": 18.660262138814417, "grad_norm": 1.2931337356567383, "learning_rate": 6.804885864265587e-07, "loss": 0.5462, "num_input_tokens_seen": 72740936, "step": 125285 }, { "epoch": 18.661006851355378, "grad_norm": 1.101853609085083, "learning_rate": 6.79735807695167e-07, "loss": 0.4766, "num_input_tokens_seen": 72743752, "step": 125290 }, { "epoch": 18.661751563896335, "grad_norm": 1.18036687374115, "learning_rate": 6.789834398299194e-07, "loss": 0.5695, "num_input_tokens_seen": 72746472, "step": 125295 }, { "epoch": 18.662496276437295, "grad_norm": 1.2730711698532104, "learning_rate": 6.782314828435249e-07, "loss": 0.5959, "num_input_tokens_seen": 72749320, "step": 125300 }, { "epoch": 18.663240988978256, "grad_norm": 2.0382983684539795, "learning_rate": 6.774799367486956e-07, "loss": 0.7598, "num_input_tokens_seen": 72752488, "step": 125305 }, { "epoch": 18.663985701519213, "grad_norm": 2.2250711917877197, "learning_rate": 6.767288015581186e-07, "loss": 0.7022, "num_input_tokens_seen": 72755240, "step": 125310 }, { "epoch": 18.664730414060173, "grad_norm": 2.092355966567993, "learning_rate": 6.759780772844892e-07, "loss": 0.6852, "num_input_tokens_seen": 72758152, "step": 125315 }, { "epoch": 18.66547512660113, "grad_norm": 2.0513172149658203, "learning_rate": 6.752277639404863e-07, "loss": 0.6627, "num_input_tokens_seen": 72761128, "step": 125320 }, { "epoch": 18.66621983914209, "grad_norm": 2.2281711101531982, "learning_rate": 6.744778615387914e-07, "loss": 0.5826, "num_input_tokens_seen": 72763816, "step": 125325 }, { "epoch": 18.66696455168305, "grad_norm": 1.325305461883545, "learning_rate": 6.737283700920666e-07, "loss": 0.5052, "num_input_tokens_seen": 72766376, "step": 125330 }, { "epoch": 18.66770926422401, "grad_norm": 2.395170211791992, "learning_rate": 6.729792896129767e-07, "loss": 0.6627, "num_input_tokens_seen": 72769160, "step": 125335 }, { "epoch": 18.66845397676497, "grad_norm": 1.690546989440918, "learning_rate": 6.722306201141781e-07, "loss": 0.7998, "num_input_tokens_seen": 72771944, "step": 125340 }, { "epoch": 18.66919868930593, "grad_norm": 1.1014350652694702, "learning_rate": 6.714823616083165e-07, "loss": 0.6377, "num_input_tokens_seen": 72774760, "step": 125345 }, { "epoch": 18.669943401846886, "grad_norm": 1.4673497676849365, "learning_rate": 6.707345141080345e-07, "loss": 0.5338, "num_input_tokens_seen": 72777480, "step": 125350 }, { "epoch": 18.670688114387847, "grad_norm": 2.583833932876587, "learning_rate": 6.699870776259637e-07, "loss": 0.6725, "num_input_tokens_seen": 72780328, "step": 125355 }, { "epoch": 18.671432826928804, "grad_norm": 1.1636648178100586, "learning_rate": 6.692400521747355e-07, "loss": 0.6413, "num_input_tokens_seen": 72783048, "step": 125360 }, { "epoch": 18.672177539469764, "grad_norm": 0.9255695343017578, "learning_rate": 6.684934377669705e-07, "loss": 0.5136, "num_input_tokens_seen": 72786056, "step": 125365 }, { "epoch": 18.672922252010725, "grad_norm": 2.097991704940796, "learning_rate": 6.67747234415278e-07, "loss": 0.4022, "num_input_tokens_seen": 72788744, "step": 125370 }, { "epoch": 18.673666964551682, "grad_norm": 1.3835179805755615, "learning_rate": 6.670014421322618e-07, "loss": 0.435, "num_input_tokens_seen": 72791560, "step": 125375 }, { "epoch": 18.674411677092642, "grad_norm": 1.2578470706939697, "learning_rate": 6.662560609305285e-07, "loss": 0.5948, "num_input_tokens_seen": 72794600, "step": 125380 }, { "epoch": 18.675156389633603, "grad_norm": 2.059467315673828, "learning_rate": 6.655110908226681e-07, "loss": 0.6466, "num_input_tokens_seen": 72797416, "step": 125385 }, { "epoch": 18.67590110217456, "grad_norm": 0.9277536869049072, "learning_rate": 6.647665318212621e-07, "loss": 0.5539, "num_input_tokens_seen": 72800296, "step": 125390 }, { "epoch": 18.67664581471552, "grad_norm": 0.5975915789604187, "learning_rate": 6.640223839388948e-07, "loss": 0.6818, "num_input_tokens_seen": 72803144, "step": 125395 }, { "epoch": 18.677390527256478, "grad_norm": 0.9571354389190674, "learning_rate": 6.632786471881342e-07, "loss": 0.4982, "num_input_tokens_seen": 72806184, "step": 125400 }, { "epoch": 18.678135239797438, "grad_norm": 2.7284557819366455, "learning_rate": 6.625353215815478e-07, "loss": 0.678, "num_input_tokens_seen": 72809032, "step": 125405 }, { "epoch": 18.6788799523384, "grad_norm": 2.865927219390869, "learning_rate": 6.617924071316894e-07, "loss": 0.671, "num_input_tokens_seen": 72811816, "step": 125410 }, { "epoch": 18.679624664879356, "grad_norm": 1.321597695350647, "learning_rate": 6.610499038511131e-07, "loss": 0.3488, "num_input_tokens_seen": 72814568, "step": 125415 }, { "epoch": 18.680369377420316, "grad_norm": 1.5176033973693848, "learning_rate": 6.603078117523615e-07, "loss": 0.4316, "num_input_tokens_seen": 72817192, "step": 125420 }, { "epoch": 18.681114089961277, "grad_norm": 1.5349273681640625, "learning_rate": 6.595661308479717e-07, "loss": 0.4282, "num_input_tokens_seen": 72819880, "step": 125425 }, { "epoch": 18.681858802502234, "grad_norm": 1.7198405265808105, "learning_rate": 6.588248611504755e-07, "loss": 0.5933, "num_input_tokens_seen": 72822760, "step": 125430 }, { "epoch": 18.682603515043194, "grad_norm": 0.9015238285064697, "learning_rate": 6.580840026723934e-07, "loss": 0.6881, "num_input_tokens_seen": 72825704, "step": 125435 }, { "epoch": 18.68334822758415, "grad_norm": 1.7624084949493408, "learning_rate": 6.573435554262403e-07, "loss": 0.6385, "num_input_tokens_seen": 72828520, "step": 125440 }, { "epoch": 18.68409294012511, "grad_norm": 1.4805350303649902, "learning_rate": 6.566035194245257e-07, "loss": 0.5437, "num_input_tokens_seen": 72831208, "step": 125445 }, { "epoch": 18.684837652666072, "grad_norm": 1.9800609350204468, "learning_rate": 6.558638946797563e-07, "loss": 0.5462, "num_input_tokens_seen": 72834152, "step": 125450 }, { "epoch": 18.68558236520703, "grad_norm": 1.4422487020492554, "learning_rate": 6.551246812044248e-07, "loss": 0.7824, "num_input_tokens_seen": 72837096, "step": 125455 }, { "epoch": 18.68632707774799, "grad_norm": 2.502650499343872, "learning_rate": 6.543858790110158e-07, "loss": 0.4003, "num_input_tokens_seen": 72839976, "step": 125460 }, { "epoch": 18.687071790288947, "grad_norm": 2.300229072570801, "learning_rate": 6.536474881120164e-07, "loss": 0.6711, "num_input_tokens_seen": 72842664, "step": 125465 }, { "epoch": 18.687816502829907, "grad_norm": 1.2002038955688477, "learning_rate": 6.529095085198944e-07, "loss": 0.4589, "num_input_tokens_seen": 72845832, "step": 125470 }, { "epoch": 18.688561215370868, "grad_norm": 1.08457350730896, "learning_rate": 6.521719402471233e-07, "loss": 0.4972, "num_input_tokens_seen": 72848648, "step": 125475 }, { "epoch": 18.689305927911825, "grad_norm": 3.5971851348876953, "learning_rate": 6.514347833061596e-07, "loss": 0.4645, "num_input_tokens_seen": 72851624, "step": 125480 }, { "epoch": 18.690050640452785, "grad_norm": 2.1497206687927246, "learning_rate": 6.506980377094601e-07, "loss": 0.5198, "num_input_tokens_seen": 72854504, "step": 125485 }, { "epoch": 18.690795352993746, "grad_norm": 1.7521896362304688, "learning_rate": 6.499617034694705e-07, "loss": 0.6153, "num_input_tokens_seen": 72857544, "step": 125490 }, { "epoch": 18.691540065534703, "grad_norm": 2.5413668155670166, "learning_rate": 6.492257805986279e-07, "loss": 0.598, "num_input_tokens_seen": 72860840, "step": 125495 }, { "epoch": 18.692284778075663, "grad_norm": 1.3497405052185059, "learning_rate": 6.48490269109367e-07, "loss": 0.5071, "num_input_tokens_seen": 72863944, "step": 125500 }, { "epoch": 18.69302949061662, "grad_norm": 0.9971739053726196, "learning_rate": 6.477551690141165e-07, "loss": 0.6604, "num_input_tokens_seen": 72866600, "step": 125505 }, { "epoch": 18.69377420315758, "grad_norm": 2.0838701725006104, "learning_rate": 6.470204803252888e-07, "loss": 0.4928, "num_input_tokens_seen": 72869352, "step": 125510 }, { "epoch": 18.69451891569854, "grad_norm": 0.9132327437400818, "learning_rate": 6.462862030552991e-07, "loss": 0.5174, "num_input_tokens_seen": 72872168, "step": 125515 }, { "epoch": 18.6952636282395, "grad_norm": 1.953072428703308, "learning_rate": 6.455523372165512e-07, "loss": 0.5085, "num_input_tokens_seen": 72874952, "step": 125520 }, { "epoch": 18.69600834078046, "grad_norm": 1.3541141748428345, "learning_rate": 6.448188828214435e-07, "loss": 0.491, "num_input_tokens_seen": 72878280, "step": 125525 }, { "epoch": 18.69675305332142, "grad_norm": 1.4328093528747559, "learning_rate": 6.44085839882369e-07, "loss": 0.6085, "num_input_tokens_seen": 72881480, "step": 125530 }, { "epoch": 18.697497765862376, "grad_norm": 1.273848533630371, "learning_rate": 6.433532084117122e-07, "loss": 0.7082, "num_input_tokens_seen": 72884456, "step": 125535 }, { "epoch": 18.698242478403337, "grad_norm": 2.726571798324585, "learning_rate": 6.426209884218437e-07, "loss": 0.6133, "num_input_tokens_seen": 72887560, "step": 125540 }, { "epoch": 18.698987190944294, "grad_norm": 1.3113120794296265, "learning_rate": 6.418891799251397e-07, "loss": 0.6324, "num_input_tokens_seen": 72890376, "step": 125545 }, { "epoch": 18.699731903485254, "grad_norm": 1.1242536306381226, "learning_rate": 6.411577829339599e-07, "loss": 0.4731, "num_input_tokens_seen": 72893288, "step": 125550 }, { "epoch": 18.700476616026215, "grad_norm": 1.0372164249420166, "learning_rate": 6.404267974606637e-07, "loss": 0.5395, "num_input_tokens_seen": 72896168, "step": 125555 }, { "epoch": 18.701221328567172, "grad_norm": 1.5975247621536255, "learning_rate": 6.396962235175968e-07, "loss": 0.5486, "num_input_tokens_seen": 72899432, "step": 125560 }, { "epoch": 18.701966041108133, "grad_norm": 2.195699453353882, "learning_rate": 6.38966061117105e-07, "loss": 0.6673, "num_input_tokens_seen": 72902344, "step": 125565 }, { "epoch": 18.702710753649093, "grad_norm": 0.9297757744789124, "learning_rate": 6.382363102715255e-07, "loss": 0.6392, "num_input_tokens_seen": 72905128, "step": 125570 }, { "epoch": 18.70345546619005, "grad_norm": 2.565169095993042, "learning_rate": 6.375069709931792e-07, "loss": 0.7391, "num_input_tokens_seen": 72907784, "step": 125575 }, { "epoch": 18.70420017873101, "grad_norm": 1.465775728225708, "learning_rate": 6.367780432943948e-07, "loss": 0.744, "num_input_tokens_seen": 72910536, "step": 125580 }, { "epoch": 18.704944891271968, "grad_norm": 2.663822889328003, "learning_rate": 6.360495271874794e-07, "loss": 0.8048, "num_input_tokens_seen": 72913256, "step": 125585 }, { "epoch": 18.705689603812928, "grad_norm": 1.4098814725875854, "learning_rate": 6.353214226847482e-07, "loss": 0.5695, "num_input_tokens_seen": 72916328, "step": 125590 }, { "epoch": 18.70643431635389, "grad_norm": 1.288681983947754, "learning_rate": 6.345937297984966e-07, "loss": 0.6352, "num_input_tokens_seen": 72919432, "step": 125595 }, { "epoch": 18.707179028894846, "grad_norm": 1.7622568607330322, "learning_rate": 6.338664485410206e-07, "loss": 0.5648, "num_input_tokens_seen": 72922344, "step": 125600 }, { "epoch": 18.707923741435806, "grad_norm": 1.3419054746627808, "learning_rate": 6.331395789246048e-07, "loss": 0.5915, "num_input_tokens_seen": 72925320, "step": 125605 }, { "epoch": 18.708668453976763, "grad_norm": 2.3079700469970703, "learning_rate": 6.324131209615336e-07, "loss": 0.6892, "num_input_tokens_seen": 72928264, "step": 125610 }, { "epoch": 18.709413166517724, "grad_norm": 1.4124774932861328, "learning_rate": 6.316870746640751e-07, "loss": 0.5226, "num_input_tokens_seen": 72930728, "step": 125615 }, { "epoch": 18.710157879058684, "grad_norm": 1.6507965326309204, "learning_rate": 6.309614400444946e-07, "loss": 0.522, "num_input_tokens_seen": 72933672, "step": 125620 }, { "epoch": 18.71090259159964, "grad_norm": 1.0515480041503906, "learning_rate": 6.302362171150572e-07, "loss": 0.5601, "num_input_tokens_seen": 72936552, "step": 125625 }, { "epoch": 18.7116473041406, "grad_norm": 1.1461141109466553, "learning_rate": 6.295114058880059e-07, "loss": 0.6489, "num_input_tokens_seen": 72939944, "step": 125630 }, { "epoch": 18.712392016681562, "grad_norm": 1.4927550554275513, "learning_rate": 6.287870063755946e-07, "loss": 0.5356, "num_input_tokens_seen": 72942632, "step": 125635 }, { "epoch": 18.71313672922252, "grad_norm": 1.2695657014846802, "learning_rate": 6.280630185900555e-07, "loss": 0.6605, "num_input_tokens_seen": 72945672, "step": 125640 }, { "epoch": 18.71388144176348, "grad_norm": 1.8083257675170898, "learning_rate": 6.273394425436202e-07, "loss": 0.628, "num_input_tokens_seen": 72948520, "step": 125645 }, { "epoch": 18.714626154304437, "grad_norm": 1.014614462852478, "learning_rate": 6.26616278248518e-07, "loss": 0.6474, "num_input_tokens_seen": 72951400, "step": 125650 }, { "epoch": 18.715370866845397, "grad_norm": 1.4281727075576782, "learning_rate": 6.258935257169557e-07, "loss": 0.704, "num_input_tokens_seen": 72954408, "step": 125655 }, { "epoch": 18.716115579386358, "grad_norm": 0.9793062806129456, "learning_rate": 6.251711849611513e-07, "loss": 0.631, "num_input_tokens_seen": 72957608, "step": 125660 }, { "epoch": 18.716860291927315, "grad_norm": 1.8002647161483765, "learning_rate": 6.244492559933063e-07, "loss": 0.6116, "num_input_tokens_seen": 72960328, "step": 125665 }, { "epoch": 18.717605004468275, "grad_norm": 2.3036460876464844, "learning_rate": 6.237277388256191e-07, "loss": 0.5723, "num_input_tokens_seen": 72963304, "step": 125670 }, { "epoch": 18.718349717009236, "grad_norm": 1.0345535278320312, "learning_rate": 6.230066334702744e-07, "loss": 0.4784, "num_input_tokens_seen": 72966056, "step": 125675 }, { "epoch": 18.719094429550193, "grad_norm": 1.232603669166565, "learning_rate": 6.22285939939457e-07, "loss": 0.5254, "num_input_tokens_seen": 72968744, "step": 125680 }, { "epoch": 18.719839142091153, "grad_norm": 1.755842924118042, "learning_rate": 6.215656582453433e-07, "loss": 0.6751, "num_input_tokens_seen": 72971720, "step": 125685 }, { "epoch": 18.72058385463211, "grad_norm": 1.618899941444397, "learning_rate": 6.208457884001012e-07, "loss": 0.7008, "num_input_tokens_seen": 72974600, "step": 125690 }, { "epoch": 18.72132856717307, "grad_norm": 1.409326434135437, "learning_rate": 6.201263304158905e-07, "loss": 0.5093, "num_input_tokens_seen": 72977320, "step": 125695 }, { "epoch": 18.72207327971403, "grad_norm": 1.5526095628738403, "learning_rate": 6.194072843048681e-07, "loss": 0.5419, "num_input_tokens_seen": 72980200, "step": 125700 }, { "epoch": 18.72281799225499, "grad_norm": 1.9051347970962524, "learning_rate": 6.1868865007918e-07, "loss": 0.5356, "num_input_tokens_seen": 72983208, "step": 125705 }, { "epoch": 18.72356270479595, "grad_norm": 2.299032211303711, "learning_rate": 6.179704277509662e-07, "loss": 0.6964, "num_input_tokens_seen": 72986056, "step": 125710 }, { "epoch": 18.72430741733691, "grad_norm": 1.7062991857528687, "learning_rate": 6.172526173323617e-07, "loss": 0.6154, "num_input_tokens_seen": 72989000, "step": 125715 }, { "epoch": 18.725052129877866, "grad_norm": 1.7735249996185303, "learning_rate": 6.165352188354928e-07, "loss": 0.3789, "num_input_tokens_seen": 72991976, "step": 125720 }, { "epoch": 18.725796842418827, "grad_norm": 3.3014934062957764, "learning_rate": 6.158182322724804e-07, "loss": 0.5005, "num_input_tokens_seen": 72995080, "step": 125725 }, { "epoch": 18.726541554959784, "grad_norm": 1.2976343631744385, "learning_rate": 6.151016576554341e-07, "loss": 0.5281, "num_input_tokens_seen": 72998088, "step": 125730 }, { "epoch": 18.727286267500745, "grad_norm": 2.041156530380249, "learning_rate": 6.143854949964611e-07, "loss": 0.5525, "num_input_tokens_seen": 73000744, "step": 125735 }, { "epoch": 18.728030980041705, "grad_norm": 1.4882140159606934, "learning_rate": 6.136697443076628e-07, "loss": 0.5611, "num_input_tokens_seen": 73003368, "step": 125740 }, { "epoch": 18.728775692582662, "grad_norm": 1.2484228610992432, "learning_rate": 6.129544056011266e-07, "loss": 0.4753, "num_input_tokens_seen": 73006248, "step": 125745 }, { "epoch": 18.729520405123623, "grad_norm": 1.544656753540039, "learning_rate": 6.122394788889402e-07, "loss": 0.4936, "num_input_tokens_seen": 73009032, "step": 125750 }, { "epoch": 18.730265117664583, "grad_norm": 2.042664051055908, "learning_rate": 6.115249641831828e-07, "loss": 0.6064, "num_input_tokens_seen": 73011624, "step": 125755 }, { "epoch": 18.73100983020554, "grad_norm": 1.4105298519134521, "learning_rate": 6.108108614959224e-07, "loss": 0.557, "num_input_tokens_seen": 73014440, "step": 125760 }, { "epoch": 18.7317545427465, "grad_norm": 1.3217449188232422, "learning_rate": 6.100971708392272e-07, "loss": 0.5689, "num_input_tokens_seen": 73017704, "step": 125765 }, { "epoch": 18.732499255287458, "grad_norm": 4.037950038909912, "learning_rate": 6.093838922251488e-07, "loss": 0.4662, "num_input_tokens_seen": 73020296, "step": 125770 }, { "epoch": 18.733243967828418, "grad_norm": 1.2890734672546387, "learning_rate": 6.086710256657413e-07, "loss": 0.652, "num_input_tokens_seen": 73023176, "step": 125775 }, { "epoch": 18.73398868036938, "grad_norm": 1.5917675495147705, "learning_rate": 6.079585711730451e-07, "loss": 0.7515, "num_input_tokens_seen": 73025992, "step": 125780 }, { "epoch": 18.734733392910336, "grad_norm": 2.8503000736236572, "learning_rate": 6.072465287591005e-07, "loss": 0.445, "num_input_tokens_seen": 73028968, "step": 125785 }, { "epoch": 18.735478105451296, "grad_norm": 1.1211165189743042, "learning_rate": 6.065348984359314e-07, "loss": 0.4979, "num_input_tokens_seen": 73031784, "step": 125790 }, { "epoch": 18.736222817992257, "grad_norm": 2.4392380714416504, "learning_rate": 6.058236802155643e-07, "loss": 0.4457, "num_input_tokens_seen": 73034728, "step": 125795 }, { "epoch": 18.736967530533214, "grad_norm": 1.5305171012878418, "learning_rate": 6.051128741100115e-07, "loss": 0.8437, "num_input_tokens_seen": 73037928, "step": 125800 }, { "epoch": 18.737712243074174, "grad_norm": 1.3477753400802612, "learning_rate": 6.044024801312831e-07, "loss": 0.6059, "num_input_tokens_seen": 73040744, "step": 125805 }, { "epoch": 18.73845695561513, "grad_norm": 1.776593565940857, "learning_rate": 6.036924982913805e-07, "loss": 0.5599, "num_input_tokens_seen": 73043848, "step": 125810 }, { "epoch": 18.739201668156092, "grad_norm": 2.1526808738708496, "learning_rate": 6.029829286022998e-07, "loss": 0.6133, "num_input_tokens_seen": 73046472, "step": 125815 }, { "epoch": 18.739946380697052, "grad_norm": 1.0649882555007935, "learning_rate": 6.022737710760256e-07, "loss": 0.6423, "num_input_tokens_seen": 73049512, "step": 125820 }, { "epoch": 18.74069109323801, "grad_norm": 2.581563711166382, "learning_rate": 6.015650257245348e-07, "loss": 0.6862, "num_input_tokens_seen": 73052520, "step": 125825 }, { "epoch": 18.74143580577897, "grad_norm": 1.4386186599731445, "learning_rate": 6.008566925598119e-07, "loss": 0.4893, "num_input_tokens_seen": 73055240, "step": 125830 }, { "epoch": 18.742180518319927, "grad_norm": 1.6345524787902832, "learning_rate": 6.001487715938142e-07, "loss": 0.5542, "num_input_tokens_seen": 73058152, "step": 125835 }, { "epoch": 18.742925230860887, "grad_norm": 2.2226741313934326, "learning_rate": 5.994412628385043e-07, "loss": 0.6261, "num_input_tokens_seen": 73061096, "step": 125840 }, { "epoch": 18.743669943401848, "grad_norm": 2.401901960372925, "learning_rate": 5.987341663058338e-07, "loss": 0.4956, "num_input_tokens_seen": 73063720, "step": 125845 }, { "epoch": 18.744414655942805, "grad_norm": 1.2005902528762817, "learning_rate": 5.980274820077514e-07, "loss": 0.5026, "num_input_tokens_seen": 73066728, "step": 125850 }, { "epoch": 18.745159368483765, "grad_norm": 3.534635543823242, "learning_rate": 5.97321209956192e-07, "loss": 0.5699, "num_input_tokens_seen": 73069512, "step": 125855 }, { "epoch": 18.745904081024726, "grad_norm": 2.3137221336364746, "learning_rate": 5.966153501630877e-07, "loss": 0.6091, "num_input_tokens_seen": 73072328, "step": 125860 }, { "epoch": 18.746648793565683, "grad_norm": 1.9571205377578735, "learning_rate": 5.95909902640368e-07, "loss": 0.4719, "num_input_tokens_seen": 73075176, "step": 125865 }, { "epoch": 18.747393506106643, "grad_norm": 2.290621042251587, "learning_rate": 5.952048673999427e-07, "loss": 0.6144, "num_input_tokens_seen": 73077928, "step": 125870 }, { "epoch": 18.7481382186476, "grad_norm": 1.3720804452896118, "learning_rate": 5.945002444537329e-07, "loss": 0.7425, "num_input_tokens_seen": 73081032, "step": 125875 }, { "epoch": 18.74888293118856, "grad_norm": 2.209074020385742, "learning_rate": 5.937960338136317e-07, "loss": 0.5687, "num_input_tokens_seen": 73084136, "step": 125880 }, { "epoch": 18.74962764372952, "grad_norm": 1.277598261833191, "learning_rate": 5.930922354915436e-07, "loss": 0.559, "num_input_tokens_seen": 73087112, "step": 125885 }, { "epoch": 18.75037235627048, "grad_norm": 1.6558079719543457, "learning_rate": 5.923888494993562e-07, "loss": 0.4835, "num_input_tokens_seen": 73089992, "step": 125890 }, { "epoch": 18.75111706881144, "grad_norm": 2.326082229614258, "learning_rate": 5.916858758489519e-07, "loss": 0.5657, "num_input_tokens_seen": 73092808, "step": 125895 }, { "epoch": 18.7518617813524, "grad_norm": 1.9141225814819336, "learning_rate": 5.90983314552207e-07, "loss": 0.6904, "num_input_tokens_seen": 73095784, "step": 125900 }, { "epoch": 18.752606493893357, "grad_norm": 2.10280179977417, "learning_rate": 5.902811656209927e-07, "loss": 0.5366, "num_input_tokens_seen": 73098824, "step": 125905 }, { "epoch": 18.753351206434317, "grad_norm": 1.15151846408844, "learning_rate": 5.895794290671691e-07, "loss": 0.4507, "num_input_tokens_seen": 73101320, "step": 125910 }, { "epoch": 18.754095918975274, "grad_norm": 1.0857690572738647, "learning_rate": 5.888781049025877e-07, "loss": 0.7458, "num_input_tokens_seen": 73104104, "step": 125915 }, { "epoch": 18.754840631516235, "grad_norm": 1.0211883783340454, "learning_rate": 5.881771931391028e-07, "loss": 0.4088, "num_input_tokens_seen": 73106984, "step": 125920 }, { "epoch": 18.755585344057195, "grad_norm": 2.6459720134735107, "learning_rate": 5.874766937885523e-07, "loss": 0.5666, "num_input_tokens_seen": 73110696, "step": 125925 }, { "epoch": 18.756330056598152, "grad_norm": 1.124220848083496, "learning_rate": 5.867766068627739e-07, "loss": 0.3788, "num_input_tokens_seen": 73113704, "step": 125930 }, { "epoch": 18.757074769139113, "grad_norm": 1.697325587272644, "learning_rate": 5.860769323735887e-07, "loss": 0.4196, "num_input_tokens_seen": 73116584, "step": 125935 }, { "epoch": 18.757819481680073, "grad_norm": 1.9903416633605957, "learning_rate": 5.853776703328207e-07, "loss": 0.7025, "num_input_tokens_seen": 73119240, "step": 125940 }, { "epoch": 18.75856419422103, "grad_norm": 2.5649185180664062, "learning_rate": 5.846788207522852e-07, "loss": 0.6939, "num_input_tokens_seen": 73122280, "step": 125945 }, { "epoch": 18.75930890676199, "grad_norm": 1.5541691780090332, "learning_rate": 5.83980383643784e-07, "loss": 0.5881, "num_input_tokens_seen": 73125128, "step": 125950 }, { "epoch": 18.760053619302948, "grad_norm": 1.1784566640853882, "learning_rate": 5.832823590191216e-07, "loss": 0.5686, "num_input_tokens_seen": 73128072, "step": 125955 }, { "epoch": 18.760798331843908, "grad_norm": 1.2618956565856934, "learning_rate": 5.825847468900858e-07, "loss": 0.4917, "num_input_tokens_seen": 73131048, "step": 125960 }, { "epoch": 18.76154304438487, "grad_norm": 1.1004220247268677, "learning_rate": 5.81887547268467e-07, "loss": 0.497, "num_input_tokens_seen": 73133768, "step": 125965 }, { "epoch": 18.762287756925826, "grad_norm": 2.3043551445007324, "learning_rate": 5.811907601660393e-07, "loss": 0.6518, "num_input_tokens_seen": 73136904, "step": 125970 }, { "epoch": 18.763032469466786, "grad_norm": 2.3545796871185303, "learning_rate": 5.804943855945738e-07, "loss": 0.6233, "num_input_tokens_seen": 73139944, "step": 125975 }, { "epoch": 18.763777182007743, "grad_norm": 1.870138168334961, "learning_rate": 5.797984235658388e-07, "loss": 0.5345, "num_input_tokens_seen": 73142984, "step": 125980 }, { "epoch": 18.764521894548704, "grad_norm": 1.830440640449524, "learning_rate": 5.791028740915888e-07, "loss": 0.6066, "num_input_tokens_seen": 73145864, "step": 125985 }, { "epoch": 18.765266607089664, "grad_norm": 1.3248262405395508, "learning_rate": 5.784077371835756e-07, "loss": 0.8008, "num_input_tokens_seen": 73148584, "step": 125990 }, { "epoch": 18.76601131963062, "grad_norm": 0.5411946773529053, "learning_rate": 5.777130128535396e-07, "loss": 0.3685, "num_input_tokens_seen": 73151400, "step": 125995 }, { "epoch": 18.766756032171582, "grad_norm": 1.6663682460784912, "learning_rate": 5.770187011132244e-07, "loss": 0.5125, "num_input_tokens_seen": 73154152, "step": 126000 }, { "epoch": 18.767500744712542, "grad_norm": 2.600348711013794, "learning_rate": 5.763248019743539e-07, "loss": 0.5647, "num_input_tokens_seen": 73156808, "step": 126005 }, { "epoch": 18.7682454572535, "grad_norm": 1.481675148010254, "learning_rate": 5.756313154486547e-07, "loss": 0.5962, "num_input_tokens_seen": 73159688, "step": 126010 }, { "epoch": 18.76899016979446, "grad_norm": 1.614025354385376, "learning_rate": 5.7493824154784e-07, "loss": 0.7199, "num_input_tokens_seen": 73162376, "step": 126015 }, { "epoch": 18.769734882335417, "grad_norm": 1.4541295766830444, "learning_rate": 5.742455802836166e-07, "loss": 0.5142, "num_input_tokens_seen": 73165032, "step": 126020 }, { "epoch": 18.770479594876377, "grad_norm": 3.3222148418426514, "learning_rate": 5.735533316676922e-07, "loss": 0.6982, "num_input_tokens_seen": 73167816, "step": 126025 }, { "epoch": 18.771224307417338, "grad_norm": 1.7750917673110962, "learning_rate": 5.728614957117573e-07, "loss": 0.6607, "num_input_tokens_seen": 73170504, "step": 126030 }, { "epoch": 18.771969019958295, "grad_norm": 1.4006153345108032, "learning_rate": 5.721700724274997e-07, "loss": 0.5551, "num_input_tokens_seen": 73173128, "step": 126035 }, { "epoch": 18.772713732499255, "grad_norm": 2.122307062149048, "learning_rate": 5.714790618266019e-07, "loss": 0.724, "num_input_tokens_seen": 73175880, "step": 126040 }, { "epoch": 18.773458445040216, "grad_norm": 2.341456890106201, "learning_rate": 5.707884639207406e-07, "loss": 0.5672, "num_input_tokens_seen": 73178760, "step": 126045 }, { "epoch": 18.774203157581173, "grad_norm": 1.3179571628570557, "learning_rate": 5.700982787215759e-07, "loss": 0.6356, "num_input_tokens_seen": 73181512, "step": 126050 }, { "epoch": 18.774947870122134, "grad_norm": 1.0190664529800415, "learning_rate": 5.694085062407705e-07, "loss": 0.5085, "num_input_tokens_seen": 73184648, "step": 126055 }, { "epoch": 18.77569258266309, "grad_norm": 3.819793701171875, "learning_rate": 5.687191464899821e-07, "loss": 0.6491, "num_input_tokens_seen": 73187784, "step": 126060 }, { "epoch": 18.77643729520405, "grad_norm": 1.8133735656738281, "learning_rate": 5.680301994808485e-07, "loss": 0.7054, "num_input_tokens_seen": 73190760, "step": 126065 }, { "epoch": 18.77718200774501, "grad_norm": 1.4864264726638794, "learning_rate": 5.673416652250158e-07, "loss": 0.5774, "num_input_tokens_seen": 73193416, "step": 126070 }, { "epoch": 18.77792672028597, "grad_norm": 1.6993672847747803, "learning_rate": 5.666535437341108e-07, "loss": 0.4805, "num_input_tokens_seen": 73196424, "step": 126075 }, { "epoch": 18.77867143282693, "grad_norm": 0.7892978191375732, "learning_rate": 5.659658350197661e-07, "loss": 0.6182, "num_input_tokens_seen": 73199240, "step": 126080 }, { "epoch": 18.77941614536789, "grad_norm": 1.2360135316848755, "learning_rate": 5.652785390935889e-07, "loss": 0.5596, "num_input_tokens_seen": 73202056, "step": 126085 }, { "epoch": 18.780160857908847, "grad_norm": 2.8839783668518066, "learning_rate": 5.645916559672004e-07, "loss": 0.5548, "num_input_tokens_seen": 73205096, "step": 126090 }, { "epoch": 18.780905570449807, "grad_norm": 1.741491436958313, "learning_rate": 5.639051856522026e-07, "loss": 0.608, "num_input_tokens_seen": 73207624, "step": 126095 }, { "epoch": 18.781650282990764, "grad_norm": 1.3040378093719482, "learning_rate": 5.63219128160189e-07, "loss": 0.4775, "num_input_tokens_seen": 73210504, "step": 126100 }, { "epoch": 18.782394995531725, "grad_norm": 1.441557765007019, "learning_rate": 5.625334835027502e-07, "loss": 0.4911, "num_input_tokens_seen": 73213256, "step": 126105 }, { "epoch": 18.783139708072685, "grad_norm": 1.39152193069458, "learning_rate": 5.618482516914714e-07, "loss": 0.6504, "num_input_tokens_seen": 73216680, "step": 126110 }, { "epoch": 18.783884420613642, "grad_norm": 3.3045575618743896, "learning_rate": 5.611634327379295e-07, "loss": 0.5972, "num_input_tokens_seen": 73219848, "step": 126115 }, { "epoch": 18.784629133154603, "grad_norm": 1.1798704862594604, "learning_rate": 5.60479026653693e-07, "loss": 0.6238, "num_input_tokens_seen": 73222856, "step": 126120 }, { "epoch": 18.785373845695563, "grad_norm": 1.2325798273086548, "learning_rate": 5.59795033450325e-07, "loss": 0.5024, "num_input_tokens_seen": 73225608, "step": 126125 }, { "epoch": 18.78611855823652, "grad_norm": 2.1744213104248047, "learning_rate": 5.591114531393771e-07, "loss": 0.5595, "num_input_tokens_seen": 73228712, "step": 126130 }, { "epoch": 18.78686327077748, "grad_norm": 0.8210430145263672, "learning_rate": 5.584282857324014e-07, "loss": 0.4574, "num_input_tokens_seen": 73231560, "step": 126135 }, { "epoch": 18.787607983318438, "grad_norm": 2.8275039196014404, "learning_rate": 5.577455312409413e-07, "loss": 0.4951, "num_input_tokens_seen": 73234760, "step": 126140 }, { "epoch": 18.7883526958594, "grad_norm": 1.004967451095581, "learning_rate": 5.570631896765239e-07, "loss": 0.4298, "num_input_tokens_seen": 73237544, "step": 126145 }, { "epoch": 18.78909740840036, "grad_norm": 1.0177795886993408, "learning_rate": 5.563812610506841e-07, "loss": 0.692, "num_input_tokens_seen": 73240520, "step": 126150 }, { "epoch": 18.789842120941316, "grad_norm": 4.6248555183410645, "learning_rate": 5.556997453749379e-07, "loss": 0.5905, "num_input_tokens_seen": 73243560, "step": 126155 }, { "epoch": 18.790586833482276, "grad_norm": 1.4089407920837402, "learning_rate": 5.550186426608039e-07, "loss": 0.6608, "num_input_tokens_seen": 73246536, "step": 126160 }, { "epoch": 18.791331546023237, "grad_norm": 2.7400310039520264, "learning_rate": 5.543379529197839e-07, "loss": 0.5704, "num_input_tokens_seen": 73249640, "step": 126165 }, { "epoch": 18.792076258564194, "grad_norm": 1.6891838312149048, "learning_rate": 5.536576761633772e-07, "loss": 0.6251, "num_input_tokens_seen": 73252712, "step": 126170 }, { "epoch": 18.792820971105154, "grad_norm": 1.8597618341445923, "learning_rate": 5.529778124030799e-07, "loss": 0.5566, "num_input_tokens_seen": 73255656, "step": 126175 }, { "epoch": 18.79356568364611, "grad_norm": 0.9178826212882996, "learning_rate": 5.522983616503746e-07, "loss": 0.8351, "num_input_tokens_seen": 73258440, "step": 126180 }, { "epoch": 18.794310396187072, "grad_norm": 1.483245611190796, "learning_rate": 5.51619323916741e-07, "loss": 0.7402, "num_input_tokens_seen": 73261448, "step": 126185 }, { "epoch": 18.795055108728032, "grad_norm": 1.1370587348937988, "learning_rate": 5.509406992136479e-07, "loss": 0.6206, "num_input_tokens_seen": 73264232, "step": 126190 }, { "epoch": 18.79579982126899, "grad_norm": 2.724207878112793, "learning_rate": 5.502624875525664e-07, "loss": 0.6388, "num_input_tokens_seen": 73266856, "step": 126195 }, { "epoch": 18.79654453380995, "grad_norm": 1.2851324081420898, "learning_rate": 5.495846889449485e-07, "loss": 0.623, "num_input_tokens_seen": 73269544, "step": 126200 }, { "epoch": 18.797289246350907, "grad_norm": 1.4378771781921387, "learning_rate": 5.48907303402249e-07, "loss": 0.6153, "num_input_tokens_seen": 73272488, "step": 126205 }, { "epoch": 18.798033958891867, "grad_norm": 1.3157581090927124, "learning_rate": 5.48230330935906e-07, "loss": 0.5507, "num_input_tokens_seen": 73275016, "step": 126210 }, { "epoch": 18.798778671432828, "grad_norm": 1.5045666694641113, "learning_rate": 5.475537715573631e-07, "loss": 0.6085, "num_input_tokens_seen": 73278120, "step": 126215 }, { "epoch": 18.799523383973785, "grad_norm": 1.314628005027771, "learning_rate": 5.468776252780472e-07, "loss": 0.5888, "num_input_tokens_seen": 73281224, "step": 126220 }, { "epoch": 18.800268096514746, "grad_norm": 0.8365334868431091, "learning_rate": 5.46201892109377e-07, "loss": 0.6841, "num_input_tokens_seen": 73284232, "step": 126225 }, { "epoch": 18.801012809055706, "grad_norm": 1.6906588077545166, "learning_rate": 5.455265720627767e-07, "loss": 0.6193, "num_input_tokens_seen": 73286984, "step": 126230 }, { "epoch": 18.801757521596663, "grad_norm": 1.8811371326446533, "learning_rate": 5.448516651496482e-07, "loss": 0.6245, "num_input_tokens_seen": 73289768, "step": 126235 }, { "epoch": 18.802502234137624, "grad_norm": 1.624198317527771, "learning_rate": 5.441771713813992e-07, "loss": 0.6488, "num_input_tokens_seen": 73292712, "step": 126240 }, { "epoch": 18.80324694667858, "grad_norm": 0.8771525025367737, "learning_rate": 5.435030907694149e-07, "loss": 0.5785, "num_input_tokens_seen": 73295720, "step": 126245 }, { "epoch": 18.80399165921954, "grad_norm": 1.7586864233016968, "learning_rate": 5.428294233250947e-07, "loss": 0.4929, "num_input_tokens_seen": 73298568, "step": 126250 }, { "epoch": 18.8047363717605, "grad_norm": 2.3898682594299316, "learning_rate": 5.421561690598126e-07, "loss": 0.6504, "num_input_tokens_seen": 73301320, "step": 126255 }, { "epoch": 18.80548108430146, "grad_norm": 1.7507935762405396, "learning_rate": 5.414833279849429e-07, "loss": 0.8207, "num_input_tokens_seen": 73304104, "step": 126260 }, { "epoch": 18.80622579684242, "grad_norm": 2.8642232418060303, "learning_rate": 5.408109001118544e-07, "loss": 0.6007, "num_input_tokens_seen": 73306952, "step": 126265 }, { "epoch": 18.80697050938338, "grad_norm": 0.7008928656578064, "learning_rate": 5.401388854519046e-07, "loss": 0.6199, "num_input_tokens_seen": 73309800, "step": 126270 }, { "epoch": 18.807715221924337, "grad_norm": 1.6126465797424316, "learning_rate": 5.394672840164511e-07, "loss": 0.6503, "num_input_tokens_seen": 73312808, "step": 126275 }, { "epoch": 18.808459934465297, "grad_norm": 1.788811445236206, "learning_rate": 5.387960958168375e-07, "loss": 0.5739, "num_input_tokens_seen": 73315752, "step": 126280 }, { "epoch": 18.809204647006254, "grad_norm": 2.9187510013580322, "learning_rate": 5.381253208644021e-07, "loss": 0.6689, "num_input_tokens_seen": 73318760, "step": 126285 }, { "epoch": 18.809949359547215, "grad_norm": 1.251587152481079, "learning_rate": 5.374549591704747e-07, "loss": 0.5815, "num_input_tokens_seen": 73321576, "step": 126290 }, { "epoch": 18.810694072088175, "grad_norm": 1.6400467157363892, "learning_rate": 5.367850107463879e-07, "loss": 0.596, "num_input_tokens_seen": 73324744, "step": 126295 }, { "epoch": 18.811438784629132, "grad_norm": 0.7605485320091248, "learning_rate": 5.36115475603452e-07, "loss": 0.4958, "num_input_tokens_seen": 73327560, "step": 126300 }, { "epoch": 18.812183497170093, "grad_norm": 1.6869689226150513, "learning_rate": 5.354463537529831e-07, "loss": 0.7215, "num_input_tokens_seen": 73330600, "step": 126305 }, { "epoch": 18.812928209711053, "grad_norm": 2.1821842193603516, "learning_rate": 5.347776452062831e-07, "loss": 0.5093, "num_input_tokens_seen": 73333320, "step": 126310 }, { "epoch": 18.81367292225201, "grad_norm": 1.457680344581604, "learning_rate": 5.341093499746485e-07, "loss": 0.5045, "num_input_tokens_seen": 73336616, "step": 126315 }, { "epoch": 18.81441763479297, "grad_norm": 1.3293993473052979, "learning_rate": 5.334414680693705e-07, "loss": 0.6406, "num_input_tokens_seen": 73339496, "step": 126320 }, { "epoch": 18.815162347333928, "grad_norm": 1.7803797721862793, "learning_rate": 5.327739995017316e-07, "loss": 0.5199, "num_input_tokens_seen": 73342248, "step": 126325 }, { "epoch": 18.81590705987489, "grad_norm": 1.8704413175582886, "learning_rate": 5.32106944283009e-07, "loss": 0.4892, "num_input_tokens_seen": 73344872, "step": 126330 }, { "epoch": 18.81665177241585, "grad_norm": 1.220691204071045, "learning_rate": 5.31440302424474e-07, "loss": 0.6038, "num_input_tokens_seen": 73347976, "step": 126335 }, { "epoch": 18.817396484956806, "grad_norm": 1.6921474933624268, "learning_rate": 5.307740739373818e-07, "loss": 0.6809, "num_input_tokens_seen": 73351016, "step": 126340 }, { "epoch": 18.818141197497766, "grad_norm": 1.3062434196472168, "learning_rate": 5.301082588329953e-07, "loss": 0.7284, "num_input_tokens_seen": 73354248, "step": 126345 }, { "epoch": 18.818885910038723, "grad_norm": 1.5612396001815796, "learning_rate": 5.294428571225585e-07, "loss": 0.5616, "num_input_tokens_seen": 73357224, "step": 126350 }, { "epoch": 18.819630622579684, "grad_norm": 1.08145010471344, "learning_rate": 5.287778688173151e-07, "loss": 0.6629, "num_input_tokens_seen": 73360104, "step": 126355 }, { "epoch": 18.820375335120644, "grad_norm": 1.904167890548706, "learning_rate": 5.281132939284977e-07, "loss": 0.589, "num_input_tokens_seen": 73362984, "step": 126360 }, { "epoch": 18.8211200476616, "grad_norm": 1.21565580368042, "learning_rate": 5.274491324673309e-07, "loss": 0.5986, "num_input_tokens_seen": 73365992, "step": 126365 }, { "epoch": 18.821864760202562, "grad_norm": 1.0813539028167725, "learning_rate": 5.267853844450416e-07, "loss": 0.5147, "num_input_tokens_seen": 73368744, "step": 126370 }, { "epoch": 18.822609472743522, "grad_norm": 1.6967387199401855, "learning_rate": 5.261220498728403e-07, "loss": 0.5074, "num_input_tokens_seen": 73371656, "step": 126375 }, { "epoch": 18.82335418528448, "grad_norm": 1.342260479927063, "learning_rate": 5.254591287619348e-07, "loss": 0.5503, "num_input_tokens_seen": 73374792, "step": 126380 }, { "epoch": 18.82409889782544, "grad_norm": 1.9790595769882202, "learning_rate": 5.247966211235161e-07, "loss": 0.7241, "num_input_tokens_seen": 73378024, "step": 126385 }, { "epoch": 18.824843610366397, "grad_norm": 1.8152821063995361, "learning_rate": 5.241345269687864e-07, "loss": 0.5861, "num_input_tokens_seen": 73381192, "step": 126390 }, { "epoch": 18.825588322907358, "grad_norm": 1.311015009880066, "learning_rate": 5.234728463089284e-07, "loss": 0.5159, "num_input_tokens_seen": 73384104, "step": 126395 }, { "epoch": 18.826333035448318, "grad_norm": 3.315390110015869, "learning_rate": 5.228115791551191e-07, "loss": 0.6209, "num_input_tokens_seen": 73387080, "step": 126400 }, { "epoch": 18.827077747989275, "grad_norm": 0.9465765357017517, "learning_rate": 5.221507255185304e-07, "loss": 0.4815, "num_input_tokens_seen": 73389928, "step": 126405 }, { "epoch": 18.827822460530236, "grad_norm": 0.6675618290901184, "learning_rate": 5.214902854103282e-07, "loss": 0.5128, "num_input_tokens_seen": 73392488, "step": 126410 }, { "epoch": 18.828567173071196, "grad_norm": 1.231310486793518, "learning_rate": 5.208302588416647e-07, "loss": 0.7245, "num_input_tokens_seen": 73395240, "step": 126415 }, { "epoch": 18.829311885612153, "grad_norm": 2.7259700298309326, "learning_rate": 5.201706458236977e-07, "loss": 0.7331, "num_input_tokens_seen": 73398216, "step": 126420 }, { "epoch": 18.830056598153114, "grad_norm": 1.3723368644714355, "learning_rate": 5.195114463675682e-07, "loss": 0.6707, "num_input_tokens_seen": 73401128, "step": 126425 }, { "epoch": 18.83080131069407, "grad_norm": 1.7141891717910767, "learning_rate": 5.188526604844118e-07, "loss": 0.611, "num_input_tokens_seen": 73404328, "step": 126430 }, { "epoch": 18.83154602323503, "grad_norm": 2.5547823905944824, "learning_rate": 5.181942881853585e-07, "loss": 0.3937, "num_input_tokens_seen": 73406952, "step": 126435 }, { "epoch": 18.83229073577599, "grad_norm": 1.248071551322937, "learning_rate": 5.1753632948153e-07, "loss": 0.6272, "num_input_tokens_seen": 73410152, "step": 126440 }, { "epoch": 18.83303544831695, "grad_norm": 1.4054772853851318, "learning_rate": 5.168787843840423e-07, "loss": 0.6675, "num_input_tokens_seen": 73412744, "step": 126445 }, { "epoch": 18.83378016085791, "grad_norm": 1.4914463758468628, "learning_rate": 5.162216529040004e-07, "loss": 0.477, "num_input_tokens_seen": 73415688, "step": 126450 }, { "epoch": 18.83452487339887, "grad_norm": 1.6705702543258667, "learning_rate": 5.155649350525149e-07, "loss": 0.6182, "num_input_tokens_seen": 73418440, "step": 126455 }, { "epoch": 18.835269585939827, "grad_norm": 1.8341652154922485, "learning_rate": 5.149086308406742e-07, "loss": 0.6289, "num_input_tokens_seen": 73421288, "step": 126460 }, { "epoch": 18.836014298480787, "grad_norm": 1.4309736490249634, "learning_rate": 5.142527402795638e-07, "loss": 0.6958, "num_input_tokens_seen": 73424264, "step": 126465 }, { "epoch": 18.836759011021744, "grad_norm": 2.186316967010498, "learning_rate": 5.135972633802694e-07, "loss": 0.6038, "num_input_tokens_seen": 73427272, "step": 126470 }, { "epoch": 18.837503723562705, "grad_norm": 1.991206407546997, "learning_rate": 5.129422001538597e-07, "loss": 0.6247, "num_input_tokens_seen": 73430472, "step": 126475 }, { "epoch": 18.838248436103665, "grad_norm": 2.5528554916381836, "learning_rate": 5.122875506114067e-07, "loss": 0.6452, "num_input_tokens_seen": 73433672, "step": 126480 }, { "epoch": 18.838993148644622, "grad_norm": 1.2351664304733276, "learning_rate": 5.116333147639651e-07, "loss": 0.7138, "num_input_tokens_seen": 73436552, "step": 126485 }, { "epoch": 18.839737861185583, "grad_norm": 1.9086753129959106, "learning_rate": 5.109794926225903e-07, "loss": 0.5731, "num_input_tokens_seen": 73439272, "step": 126490 }, { "epoch": 18.84048257372654, "grad_norm": 0.9386258721351624, "learning_rate": 5.103260841983287e-07, "loss": 0.4647, "num_input_tokens_seen": 73442216, "step": 126495 }, { "epoch": 18.8412272862675, "grad_norm": 1.1597998142242432, "learning_rate": 5.096730895022189e-07, "loss": 0.684, "num_input_tokens_seen": 73445160, "step": 126500 }, { "epoch": 18.84197199880846, "grad_norm": 2.7804291248321533, "learning_rate": 5.090205085452909e-07, "loss": 0.4204, "num_input_tokens_seen": 73448072, "step": 126505 }, { "epoch": 18.842716711349418, "grad_norm": 1.1756058931350708, "learning_rate": 5.083683413385665e-07, "loss": 0.4553, "num_input_tokens_seen": 73451144, "step": 126510 }, { "epoch": 18.84346142389038, "grad_norm": 1.16720449924469, "learning_rate": 5.077165878930701e-07, "loss": 0.5563, "num_input_tokens_seen": 73454056, "step": 126515 }, { "epoch": 18.84420613643134, "grad_norm": 1.2476494312286377, "learning_rate": 5.070652482198069e-07, "loss": 0.6411, "num_input_tokens_seen": 73457064, "step": 126520 }, { "epoch": 18.844950848972296, "grad_norm": 2.2719080448150635, "learning_rate": 5.064143223297845e-07, "loss": 0.6853, "num_input_tokens_seen": 73459816, "step": 126525 }, { "epoch": 18.845695561513256, "grad_norm": 1.31930673122406, "learning_rate": 5.057638102339945e-07, "loss": 0.6593, "num_input_tokens_seen": 73462248, "step": 126530 }, { "epoch": 18.846440274054213, "grad_norm": 2.1490328311920166, "learning_rate": 5.051137119434362e-07, "loss": 0.5316, "num_input_tokens_seen": 73465160, "step": 126535 }, { "epoch": 18.847184986595174, "grad_norm": 2.3212246894836426, "learning_rate": 5.044640274690815e-07, "loss": 0.5937, "num_input_tokens_seen": 73468072, "step": 126540 }, { "epoch": 18.847929699136134, "grad_norm": 2.2307629585266113, "learning_rate": 5.038147568219131e-07, "loss": 0.4365, "num_input_tokens_seen": 73471176, "step": 126545 }, { "epoch": 18.84867441167709, "grad_norm": 2.387244701385498, "learning_rate": 5.031659000128974e-07, "loss": 0.7878, "num_input_tokens_seen": 73473896, "step": 126550 }, { "epoch": 18.849419124218052, "grad_norm": 1.676695704460144, "learning_rate": 5.02517457052995e-07, "loss": 0.6352, "num_input_tokens_seen": 73476968, "step": 126555 }, { "epoch": 18.850163836759013, "grad_norm": 3.3399791717529297, "learning_rate": 5.018694279531638e-07, "loss": 0.6742, "num_input_tokens_seen": 73480168, "step": 126560 }, { "epoch": 18.85090854929997, "grad_norm": 1.4469183683395386, "learning_rate": 5.012218127243478e-07, "loss": 0.5738, "num_input_tokens_seen": 73483016, "step": 126565 }, { "epoch": 18.85165326184093, "grad_norm": 1.751834750175476, "learning_rate": 5.005746113774912e-07, "loss": 0.5063, "num_input_tokens_seen": 73485800, "step": 126570 }, { "epoch": 18.852397974381887, "grad_norm": 1.4077017307281494, "learning_rate": 4.999278239235267e-07, "loss": 0.4556, "num_input_tokens_seen": 73488616, "step": 126575 }, { "epoch": 18.853142686922848, "grad_norm": 1.4492779970169067, "learning_rate": 4.992814503733817e-07, "loss": 0.7168, "num_input_tokens_seen": 73491560, "step": 126580 }, { "epoch": 18.853887399463808, "grad_norm": 1.6504615545272827, "learning_rate": 4.986354907379726e-07, "loss": 0.534, "num_input_tokens_seen": 73494504, "step": 126585 }, { "epoch": 18.854632112004765, "grad_norm": 2.268186092376709, "learning_rate": 4.979899450282155e-07, "loss": 0.537, "num_input_tokens_seen": 73497672, "step": 126590 }, { "epoch": 18.855376824545726, "grad_norm": 1.000109314918518, "learning_rate": 4.973448132550157e-07, "loss": 0.4111, "num_input_tokens_seen": 73500552, "step": 126595 }, { "epoch": 18.856121537086686, "grad_norm": 1.1580890417099, "learning_rate": 4.967000954292728e-07, "loss": 0.6075, "num_input_tokens_seen": 73503592, "step": 126600 }, { "epoch": 18.856866249627643, "grad_norm": 1.662936806678772, "learning_rate": 4.96055791561878e-07, "loss": 0.4455, "num_input_tokens_seen": 73506664, "step": 126605 }, { "epoch": 18.857610962168604, "grad_norm": 2.1710901260375977, "learning_rate": 4.954119016637115e-07, "loss": 0.7415, "num_input_tokens_seen": 73509480, "step": 126610 }, { "epoch": 18.85835567470956, "grad_norm": 2.0250062942504883, "learning_rate": 4.94768425745662e-07, "loss": 0.4817, "num_input_tokens_seen": 73512392, "step": 126615 }, { "epoch": 18.85910038725052, "grad_norm": 1.590040922164917, "learning_rate": 4.9412536381859e-07, "loss": 0.5971, "num_input_tokens_seen": 73515208, "step": 126620 }, { "epoch": 18.85984509979148, "grad_norm": 1.8091925382614136, "learning_rate": 4.934827158933647e-07, "loss": 0.5185, "num_input_tokens_seen": 73518376, "step": 126625 }, { "epoch": 18.86058981233244, "grad_norm": 3.284522533416748, "learning_rate": 4.928404819808413e-07, "loss": 0.7932, "num_input_tokens_seen": 73521096, "step": 126630 }, { "epoch": 18.8613345248734, "grad_norm": 1.915434718132019, "learning_rate": 4.921986620918723e-07, "loss": 0.5282, "num_input_tokens_seen": 73524008, "step": 126635 }, { "epoch": 18.86207923741436, "grad_norm": 1.5742911100387573, "learning_rate": 4.915572562372961e-07, "loss": 0.6796, "num_input_tokens_seen": 73526792, "step": 126640 }, { "epoch": 18.862823949955317, "grad_norm": 2.5433781147003174, "learning_rate": 4.909162644279486e-07, "loss": 0.4263, "num_input_tokens_seen": 73529352, "step": 126645 }, { "epoch": 18.863568662496277, "grad_norm": 1.2062363624572754, "learning_rate": 4.902756866746627e-07, "loss": 0.4068, "num_input_tokens_seen": 73532392, "step": 126650 }, { "epoch": 18.864313375037234, "grad_norm": 1.2919280529022217, "learning_rate": 4.896355229882576e-07, "loss": 0.5701, "num_input_tokens_seen": 73535400, "step": 126655 }, { "epoch": 18.865058087578195, "grad_norm": 1.1074142456054688, "learning_rate": 4.889957733795525e-07, "loss": 0.6167, "num_input_tokens_seen": 73538536, "step": 126660 }, { "epoch": 18.865802800119155, "grad_norm": 1.4403913021087646, "learning_rate": 4.883564378593497e-07, "loss": 0.6664, "num_input_tokens_seen": 73541288, "step": 126665 }, { "epoch": 18.866547512660112, "grad_norm": 1.3382093906402588, "learning_rate": 4.877175164384518e-07, "loss": 0.6127, "num_input_tokens_seen": 73544232, "step": 126670 }, { "epoch": 18.867292225201073, "grad_norm": 1.59366774559021, "learning_rate": 4.870790091276555e-07, "loss": 0.4946, "num_input_tokens_seen": 73547176, "step": 126675 }, { "epoch": 18.868036937742033, "grad_norm": 0.9531731605529785, "learning_rate": 4.864409159377415e-07, "loss": 0.4532, "num_input_tokens_seen": 73550056, "step": 126680 }, { "epoch": 18.86878165028299, "grad_norm": 1.4619247913360596, "learning_rate": 4.858032368794979e-07, "loss": 0.5712, "num_input_tokens_seen": 73553096, "step": 126685 }, { "epoch": 18.86952636282395, "grad_norm": 1.1661796569824219, "learning_rate": 4.851659719636915e-07, "loss": 0.6331, "num_input_tokens_seen": 73556200, "step": 126690 }, { "epoch": 18.870271075364908, "grad_norm": 1.3862985372543335, "learning_rate": 4.845291212010883e-07, "loss": 0.7718, "num_input_tokens_seen": 73558824, "step": 126695 }, { "epoch": 18.87101578790587, "grad_norm": 1.0853376388549805, "learning_rate": 4.838926846024522e-07, "loss": 0.6456, "num_input_tokens_seen": 73561512, "step": 126700 }, { "epoch": 18.87176050044683, "grad_norm": 1.8345669507980347, "learning_rate": 4.832566621785329e-07, "loss": 0.8754, "num_input_tokens_seen": 73564648, "step": 126705 }, { "epoch": 18.872505212987786, "grad_norm": 0.9920920133590698, "learning_rate": 4.826210539400744e-07, "loss": 0.5327, "num_input_tokens_seen": 73567368, "step": 126710 }, { "epoch": 18.873249925528746, "grad_norm": 1.5767163038253784, "learning_rate": 4.819858598978127e-07, "loss": 0.6836, "num_input_tokens_seen": 73570216, "step": 126715 }, { "epoch": 18.873994638069703, "grad_norm": 1.1503936052322388, "learning_rate": 4.81351080062481e-07, "loss": 0.4334, "num_input_tokens_seen": 73572968, "step": 126720 }, { "epoch": 18.874739350610664, "grad_norm": 0.9823482036590576, "learning_rate": 4.807167144448039e-07, "loss": 0.4845, "num_input_tokens_seen": 73576008, "step": 126725 }, { "epoch": 18.875484063151625, "grad_norm": 1.8280105590820312, "learning_rate": 4.800827630554977e-07, "loss": 0.6913, "num_input_tokens_seen": 73578856, "step": 126730 }, { "epoch": 18.87622877569258, "grad_norm": 1.351155400276184, "learning_rate": 4.794492259052708e-07, "loss": 0.4446, "num_input_tokens_seen": 73581704, "step": 126735 }, { "epoch": 18.876973488233542, "grad_norm": 2.6941514015197754, "learning_rate": 4.788161030048282e-07, "loss": 0.533, "num_input_tokens_seen": 73584776, "step": 126740 }, { "epoch": 18.877718200774503, "grad_norm": 1.280462384223938, "learning_rate": 4.781833943648672e-07, "loss": 0.5179, "num_input_tokens_seen": 73588008, "step": 126745 }, { "epoch": 18.87846291331546, "grad_norm": 1.908739447593689, "learning_rate": 4.775510999960736e-07, "loss": 0.5163, "num_input_tokens_seen": 73591048, "step": 126750 }, { "epoch": 18.87920762585642, "grad_norm": 1.3472578525543213, "learning_rate": 4.769192199091305e-07, "loss": 0.6781, "num_input_tokens_seen": 73594248, "step": 126755 }, { "epoch": 18.879952338397377, "grad_norm": 1.274165391921997, "learning_rate": 4.7628775411471536e-07, "loss": 0.4321, "num_input_tokens_seen": 73597096, "step": 126760 }, { "epoch": 18.880697050938338, "grad_norm": 1.7367231845855713, "learning_rate": 4.7565670262349207e-07, "loss": 0.8886, "num_input_tokens_seen": 73600168, "step": 126765 }, { "epoch": 18.881441763479298, "grad_norm": 1.809473991394043, "learning_rate": 4.750260654461214e-07, "loss": 0.6412, "num_input_tokens_seen": 73603144, "step": 126770 }, { "epoch": 18.882186476020255, "grad_norm": 1.3781325817108154, "learning_rate": 4.743958425932615e-07, "loss": 0.5562, "num_input_tokens_seen": 73606248, "step": 126775 }, { "epoch": 18.882931188561216, "grad_norm": 1.4825631380081177, "learning_rate": 4.737660340755595e-07, "loss": 0.5083, "num_input_tokens_seen": 73609256, "step": 126780 }, { "epoch": 18.883675901102176, "grad_norm": 1.0083041191101074, "learning_rate": 4.731366399036485e-07, "loss": 0.6441, "num_input_tokens_seen": 73611816, "step": 126785 }, { "epoch": 18.884420613643133, "grad_norm": 1.5786668062210083, "learning_rate": 4.7250766008816726e-07, "loss": 0.6442, "num_input_tokens_seen": 73614696, "step": 126790 }, { "epoch": 18.885165326184094, "grad_norm": 3.344818115234375, "learning_rate": 4.7187909463974054e-07, "loss": 0.6968, "num_input_tokens_seen": 73617736, "step": 126795 }, { "epoch": 18.88591003872505, "grad_norm": 1.3637959957122803, "learning_rate": 4.712509435689877e-07, "loss": 0.5586, "num_input_tokens_seen": 73620616, "step": 126800 }, { "epoch": 18.88665475126601, "grad_norm": 1.1285278797149658, "learning_rate": 4.706232068865196e-07, "loss": 0.7147, "num_input_tokens_seen": 73624040, "step": 126805 }, { "epoch": 18.88739946380697, "grad_norm": 1.428229570388794, "learning_rate": 4.6999588460294177e-07, "loss": 0.5264, "num_input_tokens_seen": 73627176, "step": 126810 }, { "epoch": 18.88814417634793, "grad_norm": 0.8625401258468628, "learning_rate": 4.6936897672885117e-07, "loss": 0.4679, "num_input_tokens_seen": 73630280, "step": 126815 }, { "epoch": 18.88888888888889, "grad_norm": 3.7627246379852295, "learning_rate": 4.6874248327484494e-07, "loss": 0.6215, "num_input_tokens_seen": 73632968, "step": 126820 }, { "epoch": 18.88963360142985, "grad_norm": 2.015610694885254, "learning_rate": 4.681164042514979e-07, "loss": 0.5931, "num_input_tokens_seen": 73636008, "step": 126825 }, { "epoch": 18.890378313970807, "grad_norm": 2.7209219932556152, "learning_rate": 4.674907396693934e-07, "loss": 0.649, "num_input_tokens_seen": 73638664, "step": 126830 }, { "epoch": 18.891123026511767, "grad_norm": 1.3643966913223267, "learning_rate": 4.668654895390978e-07, "loss": 0.6209, "num_input_tokens_seen": 73641608, "step": 126835 }, { "epoch": 18.891867739052724, "grad_norm": 3.2971200942993164, "learning_rate": 4.66240653871175e-07, "loss": 0.6317, "num_input_tokens_seen": 73644392, "step": 126840 }, { "epoch": 18.892612451593685, "grad_norm": 1.494426965713501, "learning_rate": 4.6561623267618037e-07, "loss": 0.5878, "num_input_tokens_seen": 73647400, "step": 126845 }, { "epoch": 18.893357164134645, "grad_norm": 1.6657568216323853, "learning_rate": 4.6499222596466386e-07, "loss": 0.5474, "num_input_tokens_seen": 73650152, "step": 126850 }, { "epoch": 18.894101876675602, "grad_norm": 1.0851941108703613, "learning_rate": 4.6436863374716976e-07, "loss": 0.5529, "num_input_tokens_seen": 73653032, "step": 126855 }, { "epoch": 18.894846589216563, "grad_norm": 0.9173110723495483, "learning_rate": 4.6374545603423134e-07, "loss": 0.5682, "num_input_tokens_seen": 73656264, "step": 126860 }, { "epoch": 18.89559130175752, "grad_norm": 1.5891844034194946, "learning_rate": 4.6312269283637357e-07, "loss": 0.6853, "num_input_tokens_seen": 73658952, "step": 126865 }, { "epoch": 18.89633601429848, "grad_norm": 2.142033338546753, "learning_rate": 4.6250034416411845e-07, "loss": 0.4701, "num_input_tokens_seen": 73661896, "step": 126870 }, { "epoch": 18.89708072683944, "grad_norm": 1.313827633857727, "learning_rate": 4.618784100279827e-07, "loss": 0.4761, "num_input_tokens_seen": 73664648, "step": 126875 }, { "epoch": 18.897825439380398, "grad_norm": 1.2995816469192505, "learning_rate": 4.6125689043847453e-07, "loss": 0.5278, "num_input_tokens_seen": 73667368, "step": 126880 }, { "epoch": 18.89857015192136, "grad_norm": 2.073629856109619, "learning_rate": 4.606357854060855e-07, "loss": 0.6122, "num_input_tokens_seen": 73670056, "step": 126885 }, { "epoch": 18.89931486446232, "grad_norm": 1.2317101955413818, "learning_rate": 4.6001509494131846e-07, "loss": 0.3746, "num_input_tokens_seen": 73672744, "step": 126890 }, { "epoch": 18.900059577003276, "grad_norm": 1.5480188131332397, "learning_rate": 4.5939481905465655e-07, "loss": 0.6238, "num_input_tokens_seen": 73675496, "step": 126895 }, { "epoch": 18.900804289544237, "grad_norm": 1.474937915802002, "learning_rate": 4.5877495775657476e-07, "loss": 0.6868, "num_input_tokens_seen": 73678600, "step": 126900 }, { "epoch": 18.901549002085194, "grad_norm": 0.8405341506004333, "learning_rate": 4.5815551105754804e-07, "loss": 0.4228, "num_input_tokens_seen": 73681224, "step": 126905 }, { "epoch": 18.902293714626154, "grad_norm": 1.7489362955093384, "learning_rate": 4.575364789680375e-07, "loss": 0.4831, "num_input_tokens_seen": 73684328, "step": 126910 }, { "epoch": 18.903038427167115, "grad_norm": 1.6110602617263794, "learning_rate": 4.5691786149850977e-07, "loss": 0.4975, "num_input_tokens_seen": 73686920, "step": 126915 }, { "epoch": 18.90378313970807, "grad_norm": 2.636401891708374, "learning_rate": 4.562996586594037e-07, "loss": 0.6906, "num_input_tokens_seen": 73689704, "step": 126920 }, { "epoch": 18.904527852249032, "grad_norm": 0.998770534992218, "learning_rate": 4.5568187046117484e-07, "loss": 0.607, "num_input_tokens_seen": 73692552, "step": 126925 }, { "epoch": 18.905272564789993, "grad_norm": 2.101623296737671, "learning_rate": 4.550644969142537e-07, "loss": 0.6279, "num_input_tokens_seen": 73695624, "step": 126930 }, { "epoch": 18.90601727733095, "grad_norm": 0.7515569925308228, "learning_rate": 4.544475380290708e-07, "loss": 0.5884, "num_input_tokens_seen": 73698280, "step": 126935 }, { "epoch": 18.90676198987191, "grad_norm": 1.2609593868255615, "learning_rate": 4.538309938160512e-07, "loss": 0.6911, "num_input_tokens_seen": 73701064, "step": 126940 }, { "epoch": 18.907506702412867, "grad_norm": 1.2551367282867432, "learning_rate": 4.532148642856088e-07, "loss": 0.5945, "num_input_tokens_seen": 73703880, "step": 126945 }, { "epoch": 18.908251414953828, "grad_norm": 1.5113606452941895, "learning_rate": 4.5259914944815184e-07, "loss": 0.6137, "num_input_tokens_seen": 73706792, "step": 126950 }, { "epoch": 18.908996127494788, "grad_norm": 2.0850114822387695, "learning_rate": 4.519838493140832e-07, "loss": 0.7439, "num_input_tokens_seen": 73709672, "step": 126955 }, { "epoch": 18.909740840035745, "grad_norm": 2.520566463470459, "learning_rate": 4.513689638938001e-07, "loss": 0.5995, "num_input_tokens_seen": 73712616, "step": 126960 }, { "epoch": 18.910485552576706, "grad_norm": 1.9899072647094727, "learning_rate": 4.507544931976887e-07, "loss": 0.6025, "num_input_tokens_seen": 73715496, "step": 126965 }, { "epoch": 18.911230265117666, "grad_norm": 1.3196386098861694, "learning_rate": 4.501404372361295e-07, "loss": 0.4353, "num_input_tokens_seen": 73718248, "step": 126970 }, { "epoch": 18.911974977658623, "grad_norm": 2.0110888481140137, "learning_rate": 4.495267960194921e-07, "loss": 0.5173, "num_input_tokens_seen": 73721320, "step": 126975 }, { "epoch": 18.912719690199584, "grad_norm": 0.3985820710659027, "learning_rate": 4.4891356955815145e-07, "loss": 0.4354, "num_input_tokens_seen": 73724136, "step": 126980 }, { "epoch": 18.91346440274054, "grad_norm": 1.8620117902755737, "learning_rate": 4.483007578624632e-07, "loss": 0.5675, "num_input_tokens_seen": 73726952, "step": 126985 }, { "epoch": 18.9142091152815, "grad_norm": 1.7174595594406128, "learning_rate": 4.476883609427773e-07, "loss": 0.6441, "num_input_tokens_seen": 73730184, "step": 126990 }, { "epoch": 18.914953827822462, "grad_norm": 1.6463730335235596, "learning_rate": 4.4707637880944675e-07, "loss": 0.6407, "num_input_tokens_seen": 73733224, "step": 126995 }, { "epoch": 18.91569854036342, "grad_norm": 1.1019020080566406, "learning_rate": 4.4646481147280206e-07, "loss": 0.6338, "num_input_tokens_seen": 73736040, "step": 127000 }, { "epoch": 18.91644325290438, "grad_norm": 2.4091079235076904, "learning_rate": 4.458536589431822e-07, "loss": 0.3944, "num_input_tokens_seen": 73739016, "step": 127005 }, { "epoch": 18.917187965445336, "grad_norm": 1.9670276641845703, "learning_rate": 4.4524292123090673e-07, "loss": 0.6277, "num_input_tokens_seen": 73741672, "step": 127010 }, { "epoch": 18.917932677986297, "grad_norm": 2.560594320297241, "learning_rate": 4.4463259834630066e-07, "loss": 0.8064, "num_input_tokens_seen": 73744904, "step": 127015 }, { "epoch": 18.918677390527257, "grad_norm": 1.1970821619033813, "learning_rate": 4.440226902996669e-07, "loss": 0.6588, "num_input_tokens_seen": 73747560, "step": 127020 }, { "epoch": 18.919422103068214, "grad_norm": 2.1073825359344482, "learning_rate": 4.4341319710131115e-07, "loss": 0.494, "num_input_tokens_seen": 73750472, "step": 127025 }, { "epoch": 18.920166815609175, "grad_norm": 1.8344347476959229, "learning_rate": 4.428041187615306e-07, "loss": 0.4862, "num_input_tokens_seen": 73753128, "step": 127030 }, { "epoch": 18.920911528150135, "grad_norm": 1.1123021841049194, "learning_rate": 4.421954552906199e-07, "loss": 0.4867, "num_input_tokens_seen": 73756296, "step": 127035 }, { "epoch": 18.921656240691092, "grad_norm": 1.5259273052215576, "learning_rate": 4.415872066988541e-07, "loss": 0.7982, "num_input_tokens_seen": 73760552, "step": 127040 }, { "epoch": 18.922400953232053, "grad_norm": 1.5938514471054077, "learning_rate": 4.4097937299651115e-07, "loss": 0.5291, "num_input_tokens_seen": 73763528, "step": 127045 }, { "epoch": 18.92314566577301, "grad_norm": 1.3388687372207642, "learning_rate": 4.4037195419386336e-07, "loss": 0.7194, "num_input_tokens_seen": 73766600, "step": 127050 }, { "epoch": 18.92389037831397, "grad_norm": 0.9627029299736023, "learning_rate": 4.3976495030116915e-07, "loss": 0.4561, "num_input_tokens_seen": 73769416, "step": 127055 }, { "epoch": 18.92463509085493, "grad_norm": 1.6937352418899536, "learning_rate": 4.3915836132868426e-07, "loss": 0.604, "num_input_tokens_seen": 73772232, "step": 127060 }, { "epoch": 18.925379803395888, "grad_norm": 1.3137239217758179, "learning_rate": 4.3855218728665883e-07, "loss": 0.4269, "num_input_tokens_seen": 73775272, "step": 127065 }, { "epoch": 18.92612451593685, "grad_norm": 1.9915798902511597, "learning_rate": 4.3794642818532905e-07, "loss": 0.939, "num_input_tokens_seen": 73778376, "step": 127070 }, { "epoch": 18.92686922847781, "grad_norm": 3.3956189155578613, "learning_rate": 4.3734108403493125e-07, "loss": 0.7247, "num_input_tokens_seen": 73781320, "step": 127075 }, { "epoch": 18.927613941018766, "grad_norm": 1.2867761850357056, "learning_rate": 4.3673615484568776e-07, "loss": 0.4656, "num_input_tokens_seen": 73784136, "step": 127080 }, { "epoch": 18.928358653559727, "grad_norm": 2.3597185611724854, "learning_rate": 4.3613164062782653e-07, "loss": 0.6007, "num_input_tokens_seen": 73787272, "step": 127085 }, { "epoch": 18.929103366100684, "grad_norm": 1.3174046277999878, "learning_rate": 4.3552754139155327e-07, "loss": 0.4468, "num_input_tokens_seen": 73790088, "step": 127090 }, { "epoch": 18.929848078641644, "grad_norm": 1.2470251321792603, "learning_rate": 4.3492385714707927e-07, "loss": 0.5369, "num_input_tokens_seen": 73792904, "step": 127095 }, { "epoch": 18.930592791182605, "grad_norm": 1.558193564414978, "learning_rate": 4.343205879045964e-07, "loss": 0.5856, "num_input_tokens_seen": 73795976, "step": 127100 }, { "epoch": 18.93133750372356, "grad_norm": 1.1996443271636963, "learning_rate": 4.3371773367429924e-07, "loss": 0.6432, "num_input_tokens_seen": 73799080, "step": 127105 }, { "epoch": 18.932082216264522, "grad_norm": 1.2013931274414062, "learning_rate": 4.331152944663769e-07, "loss": 0.6152, "num_input_tokens_seen": 73801992, "step": 127110 }, { "epoch": 18.932826928805483, "grad_norm": 2.2124409675598145, "learning_rate": 4.3251327029099897e-07, "loss": 0.5983, "num_input_tokens_seen": 73804872, "step": 127115 }, { "epoch": 18.93357164134644, "grad_norm": 2.3286828994750977, "learning_rate": 4.319116611583407e-07, "loss": 0.7176, "num_input_tokens_seen": 73807720, "step": 127120 }, { "epoch": 18.9343163538874, "grad_norm": 1.1570547819137573, "learning_rate": 4.3131046707856613e-07, "loss": 0.5847, "num_input_tokens_seen": 73810568, "step": 127125 }, { "epoch": 18.935061066428357, "grad_norm": 1.947076439857483, "learning_rate": 4.307096880618311e-07, "loss": 0.4217, "num_input_tokens_seen": 73813352, "step": 127130 }, { "epoch": 18.935805778969318, "grad_norm": 1.818388819694519, "learning_rate": 4.30109324118283e-07, "loss": 0.7563, "num_input_tokens_seen": 73816456, "step": 127135 }, { "epoch": 18.93655049151028, "grad_norm": 1.1755677461624146, "learning_rate": 4.295093752580664e-07, "loss": 0.6459, "num_input_tokens_seen": 73819656, "step": 127140 }, { "epoch": 18.937295204051235, "grad_norm": 1.5272079706192017, "learning_rate": 4.289098414913206e-07, "loss": 0.63, "num_input_tokens_seen": 73822440, "step": 127145 }, { "epoch": 18.938039916592196, "grad_norm": 2.0647499561309814, "learning_rate": 4.283107228281652e-07, "loss": 0.4238, "num_input_tokens_seen": 73825256, "step": 127150 }, { "epoch": 18.938784629133156, "grad_norm": 1.6552798748016357, "learning_rate": 4.277120192787282e-07, "loss": 0.505, "num_input_tokens_seen": 73828008, "step": 127155 }, { "epoch": 18.939529341674113, "grad_norm": 1.8449374437332153, "learning_rate": 4.271137308531237e-07, "loss": 0.4636, "num_input_tokens_seen": 73831080, "step": 127160 }, { "epoch": 18.940274054215074, "grad_norm": 3.5367987155914307, "learning_rate": 4.265158575614575e-07, "loss": 0.7256, "num_input_tokens_seen": 73834344, "step": 127165 }, { "epoch": 18.94101876675603, "grad_norm": 1.5908513069152832, "learning_rate": 4.259183994138299e-07, "loss": 0.5672, "num_input_tokens_seen": 73837352, "step": 127170 }, { "epoch": 18.94176347929699, "grad_norm": 1.2384867668151855, "learning_rate": 4.2532135642033565e-07, "loss": 0.5651, "num_input_tokens_seen": 73840616, "step": 127175 }, { "epoch": 18.942508191837952, "grad_norm": 3.3571927547454834, "learning_rate": 4.2472472859105827e-07, "loss": 0.4816, "num_input_tokens_seen": 73843240, "step": 127180 }, { "epoch": 18.94325290437891, "grad_norm": 1.52632737159729, "learning_rate": 4.241285159360814e-07, "loss": 0.4598, "num_input_tokens_seen": 73846280, "step": 127185 }, { "epoch": 18.94399761691987, "grad_norm": 1.9427722692489624, "learning_rate": 4.235327184654747e-07, "loss": 0.4786, "num_input_tokens_seen": 73849320, "step": 127190 }, { "epoch": 18.94474232946083, "grad_norm": 1.1364792585372925, "learning_rate": 4.229373361893024e-07, "loss": 0.4903, "num_input_tokens_seen": 73852200, "step": 127195 }, { "epoch": 18.945487042001787, "grad_norm": 2.179849624633789, "learning_rate": 4.223423691176287e-07, "loss": 0.5235, "num_input_tokens_seen": 73855368, "step": 127200 }, { "epoch": 18.946231754542747, "grad_norm": 1.7720352411270142, "learning_rate": 4.2174781726049826e-07, "loss": 0.6294, "num_input_tokens_seen": 73858184, "step": 127205 }, { "epoch": 18.946976467083704, "grad_norm": 1.8482639789581299, "learning_rate": 4.2115368062796147e-07, "loss": 0.4606, "num_input_tokens_seen": 73861000, "step": 127210 }, { "epoch": 18.947721179624665, "grad_norm": 1.5773186683654785, "learning_rate": 4.205599592300491e-07, "loss": 0.4686, "num_input_tokens_seen": 73863848, "step": 127215 }, { "epoch": 18.948465892165625, "grad_norm": 1.421385645866394, "learning_rate": 4.199666530767948e-07, "loss": 0.6391, "num_input_tokens_seen": 73866728, "step": 127220 }, { "epoch": 18.949210604706582, "grad_norm": 1.7308905124664307, "learning_rate": 4.19373762178224e-07, "loss": 0.3652, "num_input_tokens_seen": 73869384, "step": 127225 }, { "epoch": 18.949955317247543, "grad_norm": 1.1374098062515259, "learning_rate": 4.187812865443508e-07, "loss": 0.3192, "num_input_tokens_seen": 73872168, "step": 127230 }, { "epoch": 18.9507000297885, "grad_norm": 1.9605917930603027, "learning_rate": 4.1818922618518386e-07, "loss": 0.5518, "num_input_tokens_seen": 73874856, "step": 127235 }, { "epoch": 18.95144474232946, "grad_norm": 1.1300932168960571, "learning_rate": 4.175975811107263e-07, "loss": 0.4834, "num_input_tokens_seen": 73877960, "step": 127240 }, { "epoch": 18.95218945487042, "grad_norm": 3.862468957901001, "learning_rate": 4.17006351330973e-07, "loss": 0.6273, "num_input_tokens_seen": 73880808, "step": 127245 }, { "epoch": 18.952934167411378, "grad_norm": 1.8501176834106445, "learning_rate": 4.164155368559103e-07, "loss": 0.5112, "num_input_tokens_seen": 73883880, "step": 127250 }, { "epoch": 18.95367887995234, "grad_norm": 1.255698323249817, "learning_rate": 4.1582513769552467e-07, "loss": 0.5645, "num_input_tokens_seen": 73886792, "step": 127255 }, { "epoch": 18.9544235924933, "grad_norm": 1.595740795135498, "learning_rate": 4.1523515385978317e-07, "loss": 0.6333, "num_input_tokens_seen": 73889352, "step": 127260 }, { "epoch": 18.955168305034256, "grad_norm": 2.1786110401153564, "learning_rate": 4.1464558535866117e-07, "loss": 0.739, "num_input_tokens_seen": 73892296, "step": 127265 }, { "epoch": 18.955913017575217, "grad_norm": 2.4209659099578857, "learning_rate": 4.140564322021145e-07, "loss": 0.5538, "num_input_tokens_seen": 73894888, "step": 127270 }, { "epoch": 18.956657730116174, "grad_norm": 1.112375020980835, "learning_rate": 4.1346769440009094e-07, "loss": 0.5395, "num_input_tokens_seen": 73897704, "step": 127275 }, { "epoch": 18.957402442657134, "grad_norm": 1.1906083822250366, "learning_rate": 4.12879371962549e-07, "loss": 0.5805, "num_input_tokens_seen": 73900680, "step": 127280 }, { "epoch": 18.958147155198095, "grad_norm": 2.465532064437866, "learning_rate": 4.1229146489941416e-07, "loss": 0.6175, "num_input_tokens_seen": 73903688, "step": 127285 }, { "epoch": 18.95889186773905, "grad_norm": 2.2616488933563232, "learning_rate": 4.1170397322063125e-07, "loss": 0.6047, "num_input_tokens_seen": 73906280, "step": 127290 }, { "epoch": 18.959636580280012, "grad_norm": 1.4915049076080322, "learning_rate": 4.111168969361173e-07, "loss": 0.7353, "num_input_tokens_seen": 73909288, "step": 127295 }, { "epoch": 18.960381292820973, "grad_norm": 0.8699467182159424, "learning_rate": 4.1053023605579223e-07, "loss": 0.4739, "num_input_tokens_seen": 73912200, "step": 127300 }, { "epoch": 18.96112600536193, "grad_norm": 1.7447810173034668, "learning_rate": 4.0994399058956743e-07, "loss": 0.551, "num_input_tokens_seen": 73914760, "step": 127305 }, { "epoch": 18.96187071790289, "grad_norm": 1.1263643503189087, "learning_rate": 4.0935816054734343e-07, "loss": 0.5323, "num_input_tokens_seen": 73917576, "step": 127310 }, { "epoch": 18.962615430443847, "grad_norm": 2.4770941734313965, "learning_rate": 4.0877274593902335e-07, "loss": 0.6097, "num_input_tokens_seen": 73920456, "step": 127315 }, { "epoch": 18.963360142984808, "grad_norm": 3.427732229232788, "learning_rate": 4.0818774677449377e-07, "loss": 0.7833, "num_input_tokens_seen": 73923720, "step": 127320 }, { "epoch": 18.96410485552577, "grad_norm": 2.3023505210876465, "learning_rate": 4.0760316306363844e-07, "loss": 0.6825, "num_input_tokens_seen": 73926504, "step": 127325 }, { "epoch": 18.964849568066725, "grad_norm": 2.45995831489563, "learning_rate": 4.0701899481633277e-07, "loss": 0.6733, "num_input_tokens_seen": 73929384, "step": 127330 }, { "epoch": 18.965594280607686, "grad_norm": 1.2823190689086914, "learning_rate": 4.0643524204244665e-07, "loss": 0.5367, "num_input_tokens_seen": 73932328, "step": 127335 }, { "epoch": 18.966338993148646, "grad_norm": 1.0505341291427612, "learning_rate": 4.0585190475184166e-07, "loss": 0.664, "num_input_tokens_seen": 73935176, "step": 127340 }, { "epoch": 18.967083705689603, "grad_norm": 1.8826675415039062, "learning_rate": 4.05268982954371e-07, "loss": 0.5705, "num_input_tokens_seen": 73937896, "step": 127345 }, { "epoch": 18.967828418230564, "grad_norm": 1.546701192855835, "learning_rate": 4.0468647665988513e-07, "loss": 0.4946, "num_input_tokens_seen": 73941064, "step": 127350 }, { "epoch": 18.96857313077152, "grad_norm": 1.3254857063293457, "learning_rate": 4.041043858782234e-07, "loss": 0.5816, "num_input_tokens_seen": 73943880, "step": 127355 }, { "epoch": 18.96931784331248, "grad_norm": 2.2332658767700195, "learning_rate": 4.0352271061921966e-07, "loss": 0.7836, "num_input_tokens_seen": 73946728, "step": 127360 }, { "epoch": 18.970062555853442, "grad_norm": 1.3921148777008057, "learning_rate": 4.0294145089270205e-07, "loss": 0.6636, "num_input_tokens_seen": 73949672, "step": 127365 }, { "epoch": 18.9708072683944, "grad_norm": 2.697066307067871, "learning_rate": 4.0236060670848783e-07, "loss": 0.8131, "num_input_tokens_seen": 73952264, "step": 127370 }, { "epoch": 18.97155198093536, "grad_norm": 1.1372346878051758, "learning_rate": 4.0178017807639136e-07, "loss": 0.7238, "num_input_tokens_seen": 73955240, "step": 127375 }, { "epoch": 18.972296693476316, "grad_norm": 2.577636957168579, "learning_rate": 4.012001650062186e-07, "loss": 0.7865, "num_input_tokens_seen": 73958120, "step": 127380 }, { "epoch": 18.973041406017277, "grad_norm": 2.2044687271118164, "learning_rate": 4.0062056750776734e-07, "loss": 0.6212, "num_input_tokens_seen": 73961128, "step": 127385 }, { "epoch": 18.973786118558237, "grad_norm": 1.987457036972046, "learning_rate": 4.000413855908297e-07, "loss": 0.7051, "num_input_tokens_seen": 73964040, "step": 127390 }, { "epoch": 18.974530831099194, "grad_norm": 1.88448166847229, "learning_rate": 3.9946261926519233e-07, "loss": 0.6937, "num_input_tokens_seen": 73966888, "step": 127395 }, { "epoch": 18.975275543640155, "grad_norm": 2.088984251022339, "learning_rate": 3.9888426854063075e-07, "loss": 0.5351, "num_input_tokens_seen": 73969960, "step": 127400 }, { "epoch": 18.976020256181116, "grad_norm": 1.3880672454833984, "learning_rate": 3.9830633342691494e-07, "loss": 0.5806, "num_input_tokens_seen": 73972648, "step": 127405 }, { "epoch": 18.976764968722073, "grad_norm": 1.0998835563659668, "learning_rate": 3.9772881393380923e-07, "loss": 0.7113, "num_input_tokens_seen": 73975496, "step": 127410 }, { "epoch": 18.977509681263033, "grad_norm": 1.6703617572784424, "learning_rate": 3.9715171007107256e-07, "loss": 0.5795, "num_input_tokens_seen": 73978440, "step": 127415 }, { "epoch": 18.97825439380399, "grad_norm": 1.661102056503296, "learning_rate": 3.9657502184844983e-07, "loss": 0.7776, "num_input_tokens_seen": 73981224, "step": 127420 }, { "epoch": 18.97899910634495, "grad_norm": 1.3012222051620483, "learning_rate": 3.959987492756889e-07, "loss": 0.5359, "num_input_tokens_seen": 73984456, "step": 127425 }, { "epoch": 18.97974381888591, "grad_norm": 1.2850079536437988, "learning_rate": 3.9542289236252363e-07, "loss": 0.6175, "num_input_tokens_seen": 73987304, "step": 127430 }, { "epoch": 18.980488531426868, "grad_norm": 1.528256893157959, "learning_rate": 3.948474511186767e-07, "loss": 0.5637, "num_input_tokens_seen": 73990056, "step": 127435 }, { "epoch": 18.98123324396783, "grad_norm": 2.115814208984375, "learning_rate": 3.9427242555387935e-07, "loss": 0.5918, "num_input_tokens_seen": 73993416, "step": 127440 }, { "epoch": 18.98197795650879, "grad_norm": 2.3449478149414062, "learning_rate": 3.936978156778376e-07, "loss": 0.5886, "num_input_tokens_seen": 73996104, "step": 127445 }, { "epoch": 18.982722669049746, "grad_norm": 1.3724600076675415, "learning_rate": 3.9312362150026594e-07, "loss": 0.5881, "num_input_tokens_seen": 73998600, "step": 127450 }, { "epoch": 18.983467381590707, "grad_norm": 1.2334742546081543, "learning_rate": 3.925498430308594e-07, "loss": 0.4329, "num_input_tokens_seen": 74001256, "step": 127455 }, { "epoch": 18.984212094131664, "grad_norm": 1.2442965507507324, "learning_rate": 3.91976480279313e-07, "loss": 0.5578, "num_input_tokens_seen": 74004104, "step": 127460 }, { "epoch": 18.984956806672624, "grad_norm": 1.7570993900299072, "learning_rate": 3.914035332553162e-07, "loss": 0.6159, "num_input_tokens_seen": 74007208, "step": 127465 }, { "epoch": 18.985701519213585, "grad_norm": 3.0449676513671875, "learning_rate": 3.9083100196854183e-07, "loss": 0.757, "num_input_tokens_seen": 74009864, "step": 127470 }, { "epoch": 18.98644623175454, "grad_norm": 1.4257009029388428, "learning_rate": 3.9025888642866827e-07, "loss": 0.5958, "num_input_tokens_seen": 74013224, "step": 127475 }, { "epoch": 18.987190944295502, "grad_norm": 1.5877041816711426, "learning_rate": 3.896871866453572e-07, "loss": 0.5593, "num_input_tokens_seen": 74016264, "step": 127480 }, { "epoch": 18.987935656836463, "grad_norm": 1.370444893836975, "learning_rate": 3.891159026282704e-07, "loss": 0.534, "num_input_tokens_seen": 74019496, "step": 127485 }, { "epoch": 18.98868036937742, "grad_norm": 1.7157307863235474, "learning_rate": 3.885450343870556e-07, "loss": 0.4392, "num_input_tokens_seen": 74022120, "step": 127490 }, { "epoch": 18.98942508191838, "grad_norm": 1.1105877161026, "learning_rate": 3.8797458193135793e-07, "loss": 0.532, "num_input_tokens_seen": 74024936, "step": 127495 }, { "epoch": 18.990169794459337, "grad_norm": 1.8360049724578857, "learning_rate": 3.8740454527081693e-07, "loss": 0.5109, "num_input_tokens_seen": 74028008, "step": 127500 }, { "epoch": 18.990914507000298, "grad_norm": 2.7484426498413086, "learning_rate": 3.8683492441506097e-07, "loss": 0.6589, "num_input_tokens_seen": 74030888, "step": 127505 }, { "epoch": 18.99165921954126, "grad_norm": 1.4255434274673462, "learning_rate": 3.862657193737129e-07, "loss": 0.6239, "num_input_tokens_seen": 74033480, "step": 127510 }, { "epoch": 18.992403932082215, "grad_norm": 0.9562949538230896, "learning_rate": 3.856969301563873e-07, "loss": 0.5657, "num_input_tokens_seen": 74036424, "step": 127515 }, { "epoch": 18.993148644623176, "grad_norm": 2.1116220951080322, "learning_rate": 3.8512855677269586e-07, "loss": 0.5334, "num_input_tokens_seen": 74039464, "step": 127520 }, { "epoch": 18.993893357164133, "grad_norm": 2.508087158203125, "learning_rate": 3.845605992322393e-07, "loss": 0.5083, "num_input_tokens_seen": 74042056, "step": 127525 }, { "epoch": 18.994638069705093, "grad_norm": 1.5813437700271606, "learning_rate": 3.8399305754461546e-07, "loss": 0.4647, "num_input_tokens_seen": 74045096, "step": 127530 }, { "epoch": 18.995382782246054, "grad_norm": 1.124250888824463, "learning_rate": 3.834259317194083e-07, "loss": 0.8414, "num_input_tokens_seen": 74048168, "step": 127535 }, { "epoch": 18.99612749478701, "grad_norm": 1.0368744134902954, "learning_rate": 3.828592217662047e-07, "loss": 0.3954, "num_input_tokens_seen": 74050984, "step": 127540 }, { "epoch": 18.99687220732797, "grad_norm": 3.052643060684204, "learning_rate": 3.8229292769457193e-07, "loss": 0.6009, "num_input_tokens_seen": 74054344, "step": 127545 }, { "epoch": 18.997616919868932, "grad_norm": 1.943865180015564, "learning_rate": 3.8172704951408013e-07, "loss": 0.5511, "num_input_tokens_seen": 74057288, "step": 127550 }, { "epoch": 18.99836163240989, "grad_norm": 1.4988020658493042, "learning_rate": 3.811615872342883e-07, "loss": 0.6922, "num_input_tokens_seen": 74060264, "step": 127555 }, { "epoch": 18.99910634495085, "grad_norm": 1.7663640975952148, "learning_rate": 3.805965408647527e-07, "loss": 0.6475, "num_input_tokens_seen": 74063336, "step": 127560 }, { "epoch": 18.999851057491806, "grad_norm": 1.9296519756317139, "learning_rate": 3.8003191041501575e-07, "loss": 0.6381, "num_input_tokens_seen": 74065864, "step": 127565 }, { "epoch": 19.0, "eval_loss": 0.6566820740699768, "eval_runtime": 47.0018, "eval_samples_per_second": 63.487, "eval_steps_per_second": 15.872, "num_input_tokens_seen": 74065984, "step": 127566 }, { "epoch": 19.000595770032767, "grad_norm": 0.964998185634613, "learning_rate": 3.794676958946142e-07, "loss": 0.6151, "num_input_tokens_seen": 74068512, "step": 127570 }, { "epoch": 19.001340482573728, "grad_norm": 1.0474038124084473, "learning_rate": 3.7890389731308486e-07, "loss": 0.5453, "num_input_tokens_seen": 74071232, "step": 127575 }, { "epoch": 19.002085195114685, "grad_norm": 1.7019760608673096, "learning_rate": 3.783405146799479e-07, "loss": 0.37, "num_input_tokens_seen": 74074112, "step": 127580 }, { "epoch": 19.002829907655645, "grad_norm": 2.115605354309082, "learning_rate": 3.777775480047263e-07, "loss": 0.8458, "num_input_tokens_seen": 74076992, "step": 127585 }, { "epoch": 19.003574620196606, "grad_norm": 1.7703161239624023, "learning_rate": 3.772149972969291e-07, "loss": 0.7344, "num_input_tokens_seen": 74080160, "step": 127590 }, { "epoch": 19.004319332737563, "grad_norm": 0.9486633539199829, "learning_rate": 3.76652862566057e-07, "loss": 0.5996, "num_input_tokens_seen": 74083264, "step": 127595 }, { "epoch": 19.005064045278523, "grad_norm": 3.7532505989074707, "learning_rate": 3.7609114382160803e-07, "loss": 0.5702, "num_input_tokens_seen": 74086176, "step": 127600 }, { "epoch": 19.00580875781948, "grad_norm": 1.1033985614776611, "learning_rate": 3.7552984107307177e-07, "loss": 0.5577, "num_input_tokens_seen": 74089184, "step": 127605 }, { "epoch": 19.00655347036044, "grad_norm": 1.7806150913238525, "learning_rate": 3.7496895432993505e-07, "loss": 0.5929, "num_input_tokens_seen": 74092096, "step": 127610 }, { "epoch": 19.0072981829014, "grad_norm": 1.5374473333358765, "learning_rate": 3.7440848360166813e-07, "loss": 0.5921, "num_input_tokens_seen": 74094976, "step": 127615 }, { "epoch": 19.008042895442358, "grad_norm": 1.1380373239517212, "learning_rate": 3.738484288977412e-07, "loss": 0.4582, "num_input_tokens_seen": 74097856, "step": 127620 }, { "epoch": 19.00878760798332, "grad_norm": 1.2855030298233032, "learning_rate": 3.7328879022761886e-07, "loss": 0.6222, "num_input_tokens_seen": 74100768, "step": 127625 }, { "epoch": 19.00953232052428, "grad_norm": 0.837874174118042, "learning_rate": 3.7272956760075197e-07, "loss": 0.4701, "num_input_tokens_seen": 74103488, "step": 127630 }, { "epoch": 19.010277033065236, "grad_norm": 2.131864070892334, "learning_rate": 3.7217076102658845e-07, "loss": 0.5461, "num_input_tokens_seen": 74106400, "step": 127635 }, { "epoch": 19.011021745606197, "grad_norm": 1.3229916095733643, "learning_rate": 3.7161237051456796e-07, "loss": 0.4842, "num_input_tokens_seen": 74109184, "step": 127640 }, { "epoch": 19.011766458147154, "grad_norm": 2.1634905338287354, "learning_rate": 3.710543960741275e-07, "loss": 0.4966, "num_input_tokens_seen": 74112096, "step": 127645 }, { "epoch": 19.012511170688114, "grad_norm": 2.2304134368896484, "learning_rate": 3.7049683771468723e-07, "loss": 0.5661, "num_input_tokens_seen": 74115040, "step": 127650 }, { "epoch": 19.013255883229075, "grad_norm": 1.3570666313171387, "learning_rate": 3.6993969544567575e-07, "loss": 0.5597, "num_input_tokens_seen": 74118176, "step": 127655 }, { "epoch": 19.01400059577003, "grad_norm": 1.7770053148269653, "learning_rate": 3.693829692764966e-07, "loss": 0.5453, "num_input_tokens_seen": 74120800, "step": 127660 }, { "epoch": 19.014745308310992, "grad_norm": 1.8243069648742676, "learning_rate": 3.68826659216559e-07, "loss": 0.5491, "num_input_tokens_seen": 74123680, "step": 127665 }, { "epoch": 19.015490020851953, "grad_norm": 4.137773513793945, "learning_rate": 3.682707652752637e-07, "loss": 0.6084, "num_input_tokens_seen": 74126528, "step": 127670 }, { "epoch": 19.01623473339291, "grad_norm": 3.2349767684936523, "learning_rate": 3.677152874619949e-07, "loss": 0.5681, "num_input_tokens_seen": 74129280, "step": 127675 }, { "epoch": 19.01697944593387, "grad_norm": 1.6623011827468872, "learning_rate": 3.671602257861451e-07, "loss": 0.5698, "num_input_tokens_seen": 74132384, "step": 127680 }, { "epoch": 19.017724158474827, "grad_norm": 2.2530605792999268, "learning_rate": 3.666055802570845e-07, "loss": 0.5377, "num_input_tokens_seen": 74134976, "step": 127685 }, { "epoch": 19.018468871015788, "grad_norm": 2.765760660171509, "learning_rate": 3.6605135088418895e-07, "loss": 0.6644, "num_input_tokens_seen": 74138208, "step": 127690 }, { "epoch": 19.01921358355675, "grad_norm": 2.3368396759033203, "learning_rate": 3.654975376768205e-07, "loss": 0.4824, "num_input_tokens_seen": 74141088, "step": 127695 }, { "epoch": 19.019958296097705, "grad_norm": 1.17275071144104, "learning_rate": 3.649441406443327e-07, "loss": 0.7956, "num_input_tokens_seen": 74143936, "step": 127700 }, { "epoch": 19.020703008638666, "grad_norm": 1.239554524421692, "learning_rate": 3.643911597960736e-07, "loss": 0.6134, "num_input_tokens_seen": 74146688, "step": 127705 }, { "epoch": 19.021447721179623, "grad_norm": 1.7620184421539307, "learning_rate": 3.6383859514138864e-07, "loss": 0.6223, "num_input_tokens_seen": 74149472, "step": 127710 }, { "epoch": 19.022192433720583, "grad_norm": 1.6048215627670288, "learning_rate": 3.6328644668961187e-07, "loss": 0.6946, "num_input_tokens_seen": 74152512, "step": 127715 }, { "epoch": 19.022937146261544, "grad_norm": 1.1032075881958008, "learning_rate": 3.6273471445006923e-07, "loss": 0.4972, "num_input_tokens_seen": 74155488, "step": 127720 }, { "epoch": 19.0236818588025, "grad_norm": 1.9413118362426758, "learning_rate": 3.621833984320838e-07, "loss": 0.4367, "num_input_tokens_seen": 74158240, "step": 127725 }, { "epoch": 19.02442657134346, "grad_norm": 1.9638649225234985, "learning_rate": 3.616324986449676e-07, "loss": 0.5235, "num_input_tokens_seen": 74160992, "step": 127730 }, { "epoch": 19.025171283884422, "grad_norm": 1.4732073545455933, "learning_rate": 3.6108201509803263e-07, "loss": 0.6121, "num_input_tokens_seen": 74164192, "step": 127735 }, { "epoch": 19.02591599642538, "grad_norm": 0.9290209412574768, "learning_rate": 3.605319478005714e-07, "loss": 0.4598, "num_input_tokens_seen": 74167104, "step": 127740 }, { "epoch": 19.02666070896634, "grad_norm": 1.2222676277160645, "learning_rate": 3.599822967618849e-07, "loss": 0.6018, "num_input_tokens_seen": 74169888, "step": 127745 }, { "epoch": 19.027405421507297, "grad_norm": 2.1293179988861084, "learning_rate": 3.594330619912517e-07, "loss": 0.6435, "num_input_tokens_seen": 74172960, "step": 127750 }, { "epoch": 19.028150134048257, "grad_norm": 3.419857978820801, "learning_rate": 3.5888424349795615e-07, "loss": 0.4993, "num_input_tokens_seen": 74175712, "step": 127755 }, { "epoch": 19.028894846589218, "grad_norm": 1.7036806344985962, "learning_rate": 3.5833584129126574e-07, "loss": 0.5218, "num_input_tokens_seen": 74178592, "step": 127760 }, { "epoch": 19.029639559130175, "grad_norm": 1.7971878051757812, "learning_rate": 3.5778785538044255e-07, "loss": 0.4934, "num_input_tokens_seen": 74181888, "step": 127765 }, { "epoch": 19.030384271671135, "grad_norm": 1.4755645990371704, "learning_rate": 3.572402857747542e-07, "loss": 0.6413, "num_input_tokens_seen": 74184672, "step": 127770 }, { "epoch": 19.031128984212096, "grad_norm": 1.5566719770431519, "learning_rate": 3.566931324834405e-07, "loss": 0.5923, "num_input_tokens_seen": 74187456, "step": 127775 }, { "epoch": 19.031873696753053, "grad_norm": 3.8859376907348633, "learning_rate": 3.5614639551575235e-07, "loss": 0.7974, "num_input_tokens_seen": 74190496, "step": 127780 }, { "epoch": 19.032618409294013, "grad_norm": 2.0824456214904785, "learning_rate": 3.5560007488092404e-07, "loss": 0.6614, "num_input_tokens_seen": 74193408, "step": 127785 }, { "epoch": 19.03336312183497, "grad_norm": 1.3207625150680542, "learning_rate": 3.5505417058818437e-07, "loss": 0.4497, "num_input_tokens_seen": 74196096, "step": 127790 }, { "epoch": 19.03410783437593, "grad_norm": 0.9345427751541138, "learning_rate": 3.545086826467592e-07, "loss": 0.6048, "num_input_tokens_seen": 74199104, "step": 127795 }, { "epoch": 19.03485254691689, "grad_norm": 1.8094500303268433, "learning_rate": 3.5396361106585787e-07, "loss": 0.6876, "num_input_tokens_seen": 74201952, "step": 127800 }, { "epoch": 19.035597259457848, "grad_norm": 1.6349825859069824, "learning_rate": 3.534189558546924e-07, "loss": 0.6443, "num_input_tokens_seen": 74204640, "step": 127805 }, { "epoch": 19.03634197199881, "grad_norm": 0.9201050996780396, "learning_rate": 3.5287471702246386e-07, "loss": 0.5437, "num_input_tokens_seen": 74207584, "step": 127810 }, { "epoch": 19.03708668453977, "grad_norm": 1.3029557466506958, "learning_rate": 3.5233089457837045e-07, "loss": 0.483, "num_input_tokens_seen": 74210304, "step": 127815 }, { "epoch": 19.037831397080726, "grad_norm": 1.3106212615966797, "learning_rate": 3.517874885315936e-07, "loss": 0.7127, "num_input_tokens_seen": 74213184, "step": 127820 }, { "epoch": 19.038576109621687, "grad_norm": 1.517462134361267, "learning_rate": 3.5124449889131495e-07, "loss": 0.7382, "num_input_tokens_seen": 74216512, "step": 127825 }, { "epoch": 19.039320822162644, "grad_norm": 1.1425164937973022, "learning_rate": 3.5070192566671046e-07, "loss": 0.6872, "num_input_tokens_seen": 74219328, "step": 127830 }, { "epoch": 19.040065534703604, "grad_norm": 1.1879868507385254, "learning_rate": 3.5015976886694226e-07, "loss": 0.6761, "num_input_tokens_seen": 74222432, "step": 127835 }, { "epoch": 19.040810247244565, "grad_norm": 1.443699836730957, "learning_rate": 3.496180285011724e-07, "loss": 0.6301, "num_input_tokens_seen": 74225376, "step": 127840 }, { "epoch": 19.041554959785522, "grad_norm": 1.7783480882644653, "learning_rate": 3.49076704578552e-07, "loss": 0.498, "num_input_tokens_seen": 74228384, "step": 127845 }, { "epoch": 19.042299672326482, "grad_norm": 2.6980631351470947, "learning_rate": 3.4853579710822923e-07, "loss": 0.595, "num_input_tokens_seen": 74231424, "step": 127850 }, { "epoch": 19.043044384867443, "grad_norm": 1.1096019744873047, "learning_rate": 3.4799530609933575e-07, "loss": 0.6652, "num_input_tokens_seen": 74234720, "step": 127855 }, { "epoch": 19.0437890974084, "grad_norm": 1.6529200077056885, "learning_rate": 3.474552315610086e-07, "loss": 0.6831, "num_input_tokens_seen": 74237472, "step": 127860 }, { "epoch": 19.04453380994936, "grad_norm": 2.0152242183685303, "learning_rate": 3.4691557350236827e-07, "loss": 0.5342, "num_input_tokens_seen": 74240288, "step": 127865 }, { "epoch": 19.045278522490317, "grad_norm": 1.1785410642623901, "learning_rate": 3.4637633193253525e-07, "loss": 0.5617, "num_input_tokens_seen": 74242976, "step": 127870 }, { "epoch": 19.046023235031278, "grad_norm": 1.6071597337722778, "learning_rate": 3.458375068606162e-07, "loss": 0.4387, "num_input_tokens_seen": 74245664, "step": 127875 }, { "epoch": 19.04676794757224, "grad_norm": 1.7862428426742554, "learning_rate": 3.4529909829571494e-07, "loss": 0.5772, "num_input_tokens_seen": 74248576, "step": 127880 }, { "epoch": 19.047512660113195, "grad_norm": 1.1592025756835938, "learning_rate": 3.447611062469269e-07, "loss": 0.5671, "num_input_tokens_seen": 74251392, "step": 127885 }, { "epoch": 19.048257372654156, "grad_norm": 1.4021767377853394, "learning_rate": 3.442235307233449e-07, "loss": 0.5023, "num_input_tokens_seen": 74254432, "step": 127890 }, { "epoch": 19.049002085195113, "grad_norm": 0.9619117975234985, "learning_rate": 3.4368637173404494e-07, "loss": 0.7188, "num_input_tokens_seen": 74257248, "step": 127895 }, { "epoch": 19.049746797736073, "grad_norm": 1.5063579082489014, "learning_rate": 3.4314962928810315e-07, "loss": 0.5389, "num_input_tokens_seen": 74260032, "step": 127900 }, { "epoch": 19.050491510277034, "grad_norm": 2.148824691772461, "learning_rate": 3.426133033945872e-07, "loss": 0.417, "num_input_tokens_seen": 74263040, "step": 127905 }, { "epoch": 19.05123622281799, "grad_norm": 3.612779378890991, "learning_rate": 3.420773940625621e-07, "loss": 0.6392, "num_input_tokens_seen": 74266176, "step": 127910 }, { "epoch": 19.05198093535895, "grad_norm": 1.2521003484725952, "learning_rate": 3.415419013010762e-07, "loss": 0.6379, "num_input_tokens_seen": 74268992, "step": 127915 }, { "epoch": 19.052725647899912, "grad_norm": 2.5039613246917725, "learning_rate": 3.410068251191806e-07, "loss": 0.6018, "num_input_tokens_seen": 74271872, "step": 127920 }, { "epoch": 19.05347036044087, "grad_norm": 1.503286600112915, "learning_rate": 3.4047216552590687e-07, "loss": 0.755, "num_input_tokens_seen": 74274688, "step": 127925 }, { "epoch": 19.05421507298183, "grad_norm": 2.6653690338134766, "learning_rate": 3.399379225302979e-07, "loss": 0.6867, "num_input_tokens_seen": 74277824, "step": 127930 }, { "epoch": 19.054959785522787, "grad_norm": 1.8328361511230469, "learning_rate": 3.3940409614137135e-07, "loss": 0.6428, "num_input_tokens_seen": 74280768, "step": 127935 }, { "epoch": 19.055704498063747, "grad_norm": 1.4197638034820557, "learning_rate": 3.3887068636815346e-07, "loss": 0.5326, "num_input_tokens_seen": 74283936, "step": 127940 }, { "epoch": 19.056449210604708, "grad_norm": 3.716104745864868, "learning_rate": 3.3833769321964527e-07, "loss": 0.507, "num_input_tokens_seen": 74286976, "step": 127945 }, { "epoch": 19.057193923145665, "grad_norm": 2.4035754203796387, "learning_rate": 3.378051167048618e-07, "loss": 0.6064, "num_input_tokens_seen": 74289888, "step": 127950 }, { "epoch": 19.057938635686625, "grad_norm": 1.2196108102798462, "learning_rate": 3.3727295683279314e-07, "loss": 0.611, "num_input_tokens_seen": 74292704, "step": 127955 }, { "epoch": 19.058683348227586, "grad_norm": 1.5219485759735107, "learning_rate": 3.367412136124321e-07, "loss": 0.4971, "num_input_tokens_seen": 74295968, "step": 127960 }, { "epoch": 19.059428060768543, "grad_norm": 1.5148851871490479, "learning_rate": 3.3620988705276023e-07, "loss": 0.5034, "num_input_tokens_seen": 74299168, "step": 127965 }, { "epoch": 19.060172773309503, "grad_norm": 1.1691843271255493, "learning_rate": 3.3567897716275663e-07, "loss": 0.5497, "num_input_tokens_seen": 74302112, "step": 127970 }, { "epoch": 19.06091748585046, "grad_norm": 0.36103081703186035, "learning_rate": 3.351484839513891e-07, "loss": 0.5388, "num_input_tokens_seen": 74304800, "step": 127975 }, { "epoch": 19.06166219839142, "grad_norm": 0.8884315490722656, "learning_rate": 3.3461840742761707e-07, "loss": 0.4643, "num_input_tokens_seen": 74307648, "step": 127980 }, { "epoch": 19.06240691093238, "grad_norm": 1.873616337776184, "learning_rate": 3.340887476004001e-07, "loss": 0.4022, "num_input_tokens_seen": 74310496, "step": 127985 }, { "epoch": 19.06315162347334, "grad_norm": 4.13788366317749, "learning_rate": 3.3355950447868657e-07, "loss": 0.9068, "num_input_tokens_seen": 74313600, "step": 127990 }, { "epoch": 19.0638963360143, "grad_norm": 1.5717459917068481, "learning_rate": 3.3303067807141095e-07, "loss": 0.685, "num_input_tokens_seen": 74316544, "step": 127995 }, { "epoch": 19.06464104855526, "grad_norm": 0.7860556840896606, "learning_rate": 3.325022683875162e-07, "loss": 0.5945, "num_input_tokens_seen": 74319392, "step": 128000 }, { "epoch": 19.065385761096216, "grad_norm": 1.2853269577026367, "learning_rate": 3.319742754359201e-07, "loss": 0.5742, "num_input_tokens_seen": 74322464, "step": 128005 }, { "epoch": 19.066130473637177, "grad_norm": 1.5557048320770264, "learning_rate": 3.314466992255516e-07, "loss": 0.5601, "num_input_tokens_seen": 74325376, "step": 128010 }, { "epoch": 19.066875186178134, "grad_norm": 1.6397486925125122, "learning_rate": 3.309195397653148e-07, "loss": 0.7346, "num_input_tokens_seen": 74328224, "step": 128015 }, { "epoch": 19.067619898719094, "grad_norm": 1.5958061218261719, "learning_rate": 3.3039279706412465e-07, "loss": 0.6219, "num_input_tokens_seen": 74331424, "step": 128020 }, { "epoch": 19.068364611260055, "grad_norm": 2.18261456489563, "learning_rate": 3.2986647113087134e-07, "loss": 0.5069, "num_input_tokens_seen": 74334464, "step": 128025 }, { "epoch": 19.069109323801012, "grad_norm": 1.7804040908813477, "learning_rate": 3.293405619744533e-07, "loss": 0.558, "num_input_tokens_seen": 74337504, "step": 128030 }, { "epoch": 19.069854036341972, "grad_norm": 1.8228141069412231, "learning_rate": 3.288150696037523e-07, "loss": 0.761, "num_input_tokens_seen": 74340352, "step": 128035 }, { "epoch": 19.070598748882933, "grad_norm": 2.533724069595337, "learning_rate": 3.282899940276418e-07, "loss": 0.6384, "num_input_tokens_seen": 74343264, "step": 128040 }, { "epoch": 19.07134346142389, "grad_norm": 1.4076268672943115, "learning_rate": 3.2776533525500085e-07, "loss": 0.5945, "num_input_tokens_seen": 74345952, "step": 128045 }, { "epoch": 19.07208817396485, "grad_norm": 2.755885124206543, "learning_rate": 3.272410932946862e-07, "loss": 0.705, "num_input_tokens_seen": 74348768, "step": 128050 }, { "epoch": 19.072832886505807, "grad_norm": 2.1653456687927246, "learning_rate": 3.267172681555575e-07, "loss": 0.7136, "num_input_tokens_seen": 74351776, "step": 128055 }, { "epoch": 19.073577599046768, "grad_norm": 1.9839932918548584, "learning_rate": 3.261938598464631e-07, "loss": 0.4873, "num_input_tokens_seen": 74354592, "step": 128060 }, { "epoch": 19.07432231158773, "grad_norm": 1.1070219278335571, "learning_rate": 3.256708683762488e-07, "loss": 0.6497, "num_input_tokens_seen": 74357664, "step": 128065 }, { "epoch": 19.075067024128685, "grad_norm": 1.134597659111023, "learning_rate": 3.2514829375374643e-07, "loss": 0.6546, "num_input_tokens_seen": 74360480, "step": 128070 }, { "epoch": 19.075811736669646, "grad_norm": 1.7473355531692505, "learning_rate": 3.24626135987785e-07, "loss": 0.5709, "num_input_tokens_seen": 74363584, "step": 128075 }, { "epoch": 19.076556449210603, "grad_norm": 0.9712202548980713, "learning_rate": 3.2410439508718527e-07, "loss": 0.5602, "num_input_tokens_seen": 74366432, "step": 128080 }, { "epoch": 19.077301161751564, "grad_norm": 2.3401095867156982, "learning_rate": 3.2358307106076234e-07, "loss": 0.6722, "num_input_tokens_seen": 74369472, "step": 128085 }, { "epoch": 19.078045874292524, "grad_norm": 2.464050531387329, "learning_rate": 3.2306216391732593e-07, "loss": 0.8257, "num_input_tokens_seen": 74372288, "step": 128090 }, { "epoch": 19.07879058683348, "grad_norm": 2.1146349906921387, "learning_rate": 3.225416736656689e-07, "loss": 0.6528, "num_input_tokens_seen": 74375040, "step": 128095 }, { "epoch": 19.07953529937444, "grad_norm": 1.2251423597335815, "learning_rate": 3.220216003145926e-07, "loss": 0.4732, "num_input_tokens_seen": 74377792, "step": 128100 }, { "epoch": 19.080280011915402, "grad_norm": 1.5579391717910767, "learning_rate": 3.215019438728789e-07, "loss": 0.4242, "num_input_tokens_seen": 74380864, "step": 128105 }, { "epoch": 19.08102472445636, "grad_norm": 1.348517656326294, "learning_rate": 3.209827043493097e-07, "loss": 0.6532, "num_input_tokens_seen": 74384160, "step": 128110 }, { "epoch": 19.08176943699732, "grad_norm": 1.3748724460601807, "learning_rate": 3.204638817526528e-07, "loss": 0.5628, "num_input_tokens_seen": 74387072, "step": 128115 }, { "epoch": 19.082514149538277, "grad_norm": 2.3323049545288086, "learning_rate": 3.1994547609167644e-07, "loss": 0.736, "num_input_tokens_seen": 74389920, "step": 128120 }, { "epoch": 19.083258862079237, "grad_norm": 2.5194034576416016, "learning_rate": 3.194274873751374e-07, "loss": 0.7041, "num_input_tokens_seen": 74392896, "step": 128125 }, { "epoch": 19.084003574620198, "grad_norm": 1.2782225608825684, "learning_rate": 3.189099156117842e-07, "loss": 0.7263, "num_input_tokens_seen": 74395776, "step": 128130 }, { "epoch": 19.084748287161155, "grad_norm": 1.4476243257522583, "learning_rate": 3.1839276081036816e-07, "loss": 0.4867, "num_input_tokens_seen": 74398624, "step": 128135 }, { "epoch": 19.085492999702115, "grad_norm": 1.2414977550506592, "learning_rate": 3.1787602297961574e-07, "loss": 0.6608, "num_input_tokens_seen": 74401568, "step": 128140 }, { "epoch": 19.086237712243076, "grad_norm": 2.0871357917785645, "learning_rate": 3.1735970212826705e-07, "loss": 0.8991, "num_input_tokens_seen": 74404544, "step": 128145 }, { "epoch": 19.086982424784033, "grad_norm": 1.1488453149795532, "learning_rate": 3.168437982650374e-07, "loss": 0.3721, "num_input_tokens_seen": 74407392, "step": 128150 }, { "epoch": 19.087727137324993, "grad_norm": 1.0260099172592163, "learning_rate": 3.1632831139864763e-07, "loss": 0.5845, "num_input_tokens_seen": 74410112, "step": 128155 }, { "epoch": 19.08847184986595, "grad_norm": 1.5920382738113403, "learning_rate": 3.158132415378018e-07, "loss": 0.5517, "num_input_tokens_seen": 74412832, "step": 128160 }, { "epoch": 19.08921656240691, "grad_norm": 1.5928934812545776, "learning_rate": 3.152985886912013e-07, "loss": 0.537, "num_input_tokens_seen": 74415648, "step": 128165 }, { "epoch": 19.08996127494787, "grad_norm": 1.8394628763198853, "learning_rate": 3.1478435286754483e-07, "loss": 0.7809, "num_input_tokens_seen": 74418592, "step": 128170 }, { "epoch": 19.09070598748883, "grad_norm": 2.385601758956909, "learning_rate": 3.14270534075517e-07, "loss": 0.5772, "num_input_tokens_seen": 74421632, "step": 128175 }, { "epoch": 19.09145070002979, "grad_norm": 1.1680291891098022, "learning_rate": 3.137571323237998e-07, "loss": 0.4227, "num_input_tokens_seen": 74424576, "step": 128180 }, { "epoch": 19.09219541257075, "grad_norm": 2.507662296295166, "learning_rate": 3.13244147621064e-07, "loss": 0.6126, "num_input_tokens_seen": 74427296, "step": 128185 }, { "epoch": 19.092940125111706, "grad_norm": 1.6904891729354858, "learning_rate": 3.1273157997598056e-07, "loss": 0.5187, "num_input_tokens_seen": 74430048, "step": 128190 }, { "epoch": 19.093684837652667, "grad_norm": 2.2637321949005127, "learning_rate": 3.122194293972064e-07, "loss": 0.6592, "num_input_tokens_seen": 74432992, "step": 128195 }, { "epoch": 19.094429550193624, "grad_norm": 3.1840014457702637, "learning_rate": 3.117076958933901e-07, "loss": 0.6657, "num_input_tokens_seen": 74436032, "step": 128200 }, { "epoch": 19.095174262734584, "grad_norm": 1.7051780223846436, "learning_rate": 3.111963794731831e-07, "loss": 0.5633, "num_input_tokens_seen": 74439008, "step": 128205 }, { "epoch": 19.095918975275545, "grad_norm": 1.531636357307434, "learning_rate": 3.106854801452175e-07, "loss": 0.7106, "num_input_tokens_seen": 74441760, "step": 128210 }, { "epoch": 19.096663687816502, "grad_norm": 1.3792823553085327, "learning_rate": 3.1017499791813067e-07, "loss": 0.6676, "num_input_tokens_seen": 74444544, "step": 128215 }, { "epoch": 19.097408400357462, "grad_norm": 1.9369343519210815, "learning_rate": 3.096649328005435e-07, "loss": 0.6886, "num_input_tokens_seen": 74447456, "step": 128220 }, { "epoch": 19.098153112898423, "grad_norm": 2.844639301300049, "learning_rate": 3.091552848010715e-07, "loss": 0.4781, "num_input_tokens_seen": 74450304, "step": 128225 }, { "epoch": 19.09889782543938, "grad_norm": 2.2216312885284424, "learning_rate": 3.08646053928327e-07, "loss": 0.6839, "num_input_tokens_seen": 74453184, "step": 128230 }, { "epoch": 19.09964253798034, "grad_norm": 1.743882417678833, "learning_rate": 3.081372401909116e-07, "loss": 0.626, "num_input_tokens_seen": 74456096, "step": 128235 }, { "epoch": 19.100387250521297, "grad_norm": 2.450965404510498, "learning_rate": 3.076288435974239e-07, "loss": 0.685, "num_input_tokens_seen": 74458816, "step": 128240 }, { "epoch": 19.101131963062258, "grad_norm": 0.8697304129600525, "learning_rate": 3.071208641564488e-07, "loss": 0.4721, "num_input_tokens_seen": 74461888, "step": 128245 }, { "epoch": 19.10187667560322, "grad_norm": 1.7403839826583862, "learning_rate": 3.06613301876571e-07, "loss": 0.6432, "num_input_tokens_seen": 74464960, "step": 128250 }, { "epoch": 19.102621388144176, "grad_norm": 1.7154566049575806, "learning_rate": 3.0610615676636144e-07, "loss": 0.5149, "num_input_tokens_seen": 74468000, "step": 128255 }, { "epoch": 19.103366100685136, "grad_norm": 1.5865029096603394, "learning_rate": 3.0559942883439387e-07, "loss": 0.7029, "num_input_tokens_seen": 74470976, "step": 128260 }, { "epoch": 19.104110813226093, "grad_norm": 2.0231094360351562, "learning_rate": 3.0509311808922526e-07, "loss": 0.4758, "num_input_tokens_seen": 74473920, "step": 128265 }, { "epoch": 19.104855525767054, "grad_norm": 1.2477682828903198, "learning_rate": 3.045872245394099e-07, "loss": 0.6793, "num_input_tokens_seen": 74476544, "step": 128270 }, { "epoch": 19.105600238308014, "grad_norm": 2.081498384475708, "learning_rate": 3.0408174819349377e-07, "loss": 0.4752, "num_input_tokens_seen": 74479424, "step": 128275 }, { "epoch": 19.10634495084897, "grad_norm": 1.4321160316467285, "learning_rate": 3.035766890600145e-07, "loss": 0.4146, "num_input_tokens_seen": 74482656, "step": 128280 }, { "epoch": 19.10708966338993, "grad_norm": 1.313629150390625, "learning_rate": 3.030720471475096e-07, "loss": 0.5383, "num_input_tokens_seen": 74485408, "step": 128285 }, { "epoch": 19.107834375930892, "grad_norm": 1.9167176485061646, "learning_rate": 3.025678224645001e-07, "loss": 0.7772, "num_input_tokens_seen": 74488800, "step": 128290 }, { "epoch": 19.10857908847185, "grad_norm": 1.490007758140564, "learning_rate": 3.02064015019507e-07, "loss": 0.6474, "num_input_tokens_seen": 74491904, "step": 128295 }, { "epoch": 19.10932380101281, "grad_norm": 1.3482942581176758, "learning_rate": 3.015606248210401e-07, "loss": 0.5325, "num_input_tokens_seen": 74494496, "step": 128300 }, { "epoch": 19.110068513553767, "grad_norm": 0.7718483209609985, "learning_rate": 3.010576518776037e-07, "loss": 0.4847, "num_input_tokens_seen": 74497472, "step": 128305 }, { "epoch": 19.110813226094727, "grad_norm": 1.400225043296814, "learning_rate": 3.005550961976938e-07, "loss": 0.4799, "num_input_tokens_seen": 74500288, "step": 128310 }, { "epoch": 19.111557938635688, "grad_norm": 1.098018765449524, "learning_rate": 3.0005295778980647e-07, "loss": 0.6922, "num_input_tokens_seen": 74504576, "step": 128315 }, { "epoch": 19.112302651176645, "grad_norm": 3.25508189201355, "learning_rate": 2.9955123666241814e-07, "loss": 0.4966, "num_input_tokens_seen": 74507360, "step": 128320 }, { "epoch": 19.113047363717605, "grad_norm": 1.3461295366287231, "learning_rate": 2.990499328240054e-07, "loss": 0.471, "num_input_tokens_seen": 74510112, "step": 128325 }, { "epoch": 19.113792076258566, "grad_norm": 2.825359344482422, "learning_rate": 2.9854904628304206e-07, "loss": 0.6883, "num_input_tokens_seen": 74512960, "step": 128330 }, { "epoch": 19.114536788799523, "grad_norm": 1.8769856691360474, "learning_rate": 2.980485770479824e-07, "loss": 0.7414, "num_input_tokens_seen": 74515936, "step": 128335 }, { "epoch": 19.115281501340483, "grad_norm": 1.4709829092025757, "learning_rate": 2.975485251272919e-07, "loss": 0.6003, "num_input_tokens_seen": 74518816, "step": 128340 }, { "epoch": 19.11602621388144, "grad_norm": 2.293665647506714, "learning_rate": 2.970488905294083e-07, "loss": 0.5386, "num_input_tokens_seen": 74521472, "step": 128345 }, { "epoch": 19.1167709264224, "grad_norm": 1.5163453817367554, "learning_rate": 2.965496732627804e-07, "loss": 0.5677, "num_input_tokens_seen": 74524352, "step": 128350 }, { "epoch": 19.11751563896336, "grad_norm": 1.4212967157363892, "learning_rate": 2.960508733358375e-07, "loss": 0.5458, "num_input_tokens_seen": 74527264, "step": 128355 }, { "epoch": 19.11826035150432, "grad_norm": 0.7776999473571777, "learning_rate": 2.955524907570062e-07, "loss": 0.6021, "num_input_tokens_seen": 74529952, "step": 128360 }, { "epoch": 19.11900506404528, "grad_norm": 1.1035574674606323, "learning_rate": 2.950545255347076e-07, "loss": 0.3964, "num_input_tokens_seen": 74532896, "step": 128365 }, { "epoch": 19.11974977658624, "grad_norm": 0.8203940987586975, "learning_rate": 2.9455697767735155e-07, "loss": 0.4951, "num_input_tokens_seen": 74535872, "step": 128370 }, { "epoch": 19.120494489127196, "grad_norm": 1.1735435724258423, "learning_rate": 2.9405984719334814e-07, "loss": 0.5942, "num_input_tokens_seen": 74538912, "step": 128375 }, { "epoch": 19.121239201668157, "grad_norm": 1.8841540813446045, "learning_rate": 2.935631340910933e-07, "loss": 0.6548, "num_input_tokens_seen": 74541760, "step": 128380 }, { "epoch": 19.121983914209114, "grad_norm": 1.207317590713501, "learning_rate": 2.930668383789775e-07, "loss": 0.777, "num_input_tokens_seen": 74544896, "step": 128385 }, { "epoch": 19.122728626750074, "grad_norm": 1.7667231559753418, "learning_rate": 2.925709600653859e-07, "loss": 0.7374, "num_input_tokens_seen": 74547616, "step": 128390 }, { "epoch": 19.123473339291035, "grad_norm": 1.2748464345932007, "learning_rate": 2.9207549915870045e-07, "loss": 0.5578, "num_input_tokens_seen": 74550432, "step": 128395 }, { "epoch": 19.124218051831992, "grad_norm": 1.1514105796813965, "learning_rate": 2.915804556672841e-07, "loss": 0.4899, "num_input_tokens_seen": 74553408, "step": 128400 }, { "epoch": 19.124962764372953, "grad_norm": 1.630632996559143, "learning_rate": 2.9108582959950504e-07, "loss": 0.6662, "num_input_tokens_seen": 74556480, "step": 128405 }, { "epoch": 19.12570747691391, "grad_norm": 1.194809079170227, "learning_rate": 2.9059162096371773e-07, "loss": 0.5573, "num_input_tokens_seen": 74559296, "step": 128410 }, { "epoch": 19.12645218945487, "grad_norm": 1.7817718982696533, "learning_rate": 2.9009782976827106e-07, "loss": 0.682, "num_input_tokens_seen": 74562400, "step": 128415 }, { "epoch": 19.12719690199583, "grad_norm": 1.4428224563598633, "learning_rate": 2.896044560215083e-07, "loss": 0.6225, "num_input_tokens_seen": 74565344, "step": 128420 }, { "epoch": 19.127941614536788, "grad_norm": 2.069392681121826, "learning_rate": 2.891114997317618e-07, "loss": 0.6522, "num_input_tokens_seen": 74568320, "step": 128425 }, { "epoch": 19.128686327077748, "grad_norm": 1.4241936206817627, "learning_rate": 2.8861896090736365e-07, "loss": 0.5908, "num_input_tokens_seen": 74571296, "step": 128430 }, { "epoch": 19.12943103961871, "grad_norm": 1.5482608079910278, "learning_rate": 2.881268395566322e-07, "loss": 0.4574, "num_input_tokens_seen": 74574240, "step": 128435 }, { "epoch": 19.130175752159666, "grad_norm": 1.2622977495193481, "learning_rate": 2.8763513568788036e-07, "loss": 0.7278, "num_input_tokens_seen": 74576992, "step": 128440 }, { "epoch": 19.130920464700626, "grad_norm": 1.5262975692749023, "learning_rate": 2.871438493094153e-07, "loss": 0.6035, "num_input_tokens_seen": 74579808, "step": 128445 }, { "epoch": 19.131665177241583, "grad_norm": 1.0599238872528076, "learning_rate": 2.866529804295387e-07, "loss": 0.4621, "num_input_tokens_seen": 74582784, "step": 128450 }, { "epoch": 19.132409889782544, "grad_norm": 1.7136609554290771, "learning_rate": 2.8616252905654393e-07, "loss": 0.6494, "num_input_tokens_seen": 74585632, "step": 128455 }, { "epoch": 19.133154602323504, "grad_norm": 0.8461619019508362, "learning_rate": 2.856724951987161e-07, "loss": 0.4344, "num_input_tokens_seen": 74588288, "step": 128460 }, { "epoch": 19.13389931486446, "grad_norm": 1.896456003189087, "learning_rate": 2.851828788643318e-07, "loss": 0.5919, "num_input_tokens_seen": 74591424, "step": 128465 }, { "epoch": 19.13464402740542, "grad_norm": 1.3023861646652222, "learning_rate": 2.846936800616623e-07, "loss": 0.4634, "num_input_tokens_seen": 74594368, "step": 128470 }, { "epoch": 19.135388739946382, "grad_norm": 1.9612783193588257, "learning_rate": 2.8420489879897595e-07, "loss": 0.4791, "num_input_tokens_seen": 74597056, "step": 128475 }, { "epoch": 19.13613345248734, "grad_norm": 2.0815746784210205, "learning_rate": 2.8371653508452725e-07, "loss": 0.4954, "num_input_tokens_seen": 74600000, "step": 128480 }, { "epoch": 19.1368781650283, "grad_norm": 2.693835973739624, "learning_rate": 2.832285889265651e-07, "loss": 0.5922, "num_input_tokens_seen": 74602944, "step": 128485 }, { "epoch": 19.137622877569257, "grad_norm": 3.227926015853882, "learning_rate": 2.827410603333386e-07, "loss": 0.6333, "num_input_tokens_seen": 74605760, "step": 128490 }, { "epoch": 19.138367590110217, "grad_norm": 1.7178808450698853, "learning_rate": 2.8225394931307715e-07, "loss": 0.5947, "num_input_tokens_seen": 74608800, "step": 128495 }, { "epoch": 19.139112302651178, "grad_norm": 2.362614393234253, "learning_rate": 2.817672558740131e-07, "loss": 0.6921, "num_input_tokens_seen": 74611648, "step": 128500 }, { "epoch": 19.139857015192135, "grad_norm": 2.8456971645355225, "learning_rate": 2.812809800243704e-07, "loss": 0.4739, "num_input_tokens_seen": 74614720, "step": 128505 }, { "epoch": 19.140601727733095, "grad_norm": 1.6463981866836548, "learning_rate": 2.80795121772362e-07, "loss": 0.6256, "num_input_tokens_seen": 74617696, "step": 128510 }, { "epoch": 19.141346440274056, "grad_norm": 2.084141254425049, "learning_rate": 2.803096811261979e-07, "loss": 0.6784, "num_input_tokens_seen": 74620512, "step": 128515 }, { "epoch": 19.142091152815013, "grad_norm": 2.6952762603759766, "learning_rate": 2.7982465809407443e-07, "loss": 0.7142, "num_input_tokens_seen": 74623424, "step": 128520 }, { "epoch": 19.142835865355973, "grad_norm": 1.2856190204620361, "learning_rate": 2.793400526841933e-07, "loss": 0.6691, "num_input_tokens_seen": 74626272, "step": 128525 }, { "epoch": 19.14358057789693, "grad_norm": 1.2481508255004883, "learning_rate": 2.7885586490473127e-07, "loss": 0.7414, "num_input_tokens_seen": 74628960, "step": 128530 }, { "epoch": 19.14432529043789, "grad_norm": 1.3308888673782349, "learning_rate": 2.7837209476387903e-07, "loss": 0.5981, "num_input_tokens_seen": 74631936, "step": 128535 }, { "epoch": 19.14507000297885, "grad_norm": 1.3619170188903809, "learning_rate": 2.7788874226980233e-07, "loss": 0.7005, "num_input_tokens_seen": 74634976, "step": 128540 }, { "epoch": 19.14581471551981, "grad_norm": 3.2781050205230713, "learning_rate": 2.774058074306696e-07, "loss": 0.4779, "num_input_tokens_seen": 74637888, "step": 128545 }, { "epoch": 19.14655942806077, "grad_norm": 1.209963083267212, "learning_rate": 2.7692329025463816e-07, "loss": 0.7475, "num_input_tokens_seen": 74640768, "step": 128550 }, { "epoch": 19.14730414060173, "grad_norm": 1.9434481859207153, "learning_rate": 2.7644119074986263e-07, "loss": 0.6002, "num_input_tokens_seen": 74643648, "step": 128555 }, { "epoch": 19.148048853142686, "grad_norm": 2.5679619312286377, "learning_rate": 2.7595950892448374e-07, "loss": 0.6221, "num_input_tokens_seen": 74646720, "step": 128560 }, { "epoch": 19.148793565683647, "grad_norm": 1.4410268068313599, "learning_rate": 2.754782447866394e-07, "loss": 0.6273, "num_input_tokens_seen": 74649312, "step": 128565 }, { "epoch": 19.149538278224604, "grad_norm": 1.0721030235290527, "learning_rate": 2.7499739834446204e-07, "loss": 0.4664, "num_input_tokens_seen": 74652256, "step": 128570 }, { "epoch": 19.150282990765565, "grad_norm": 1.8856642246246338, "learning_rate": 2.745169696060729e-07, "loss": 0.5967, "num_input_tokens_seen": 74654976, "step": 128575 }, { "epoch": 19.151027703306525, "grad_norm": 2.441633939743042, "learning_rate": 2.7403695857959046e-07, "loss": 0.6047, "num_input_tokens_seen": 74657856, "step": 128580 }, { "epoch": 19.151772415847482, "grad_norm": 1.4834105968475342, "learning_rate": 2.735573652731249e-07, "loss": 0.4857, "num_input_tokens_seen": 74660832, "step": 128585 }, { "epoch": 19.152517128388443, "grad_norm": 1.808655023574829, "learning_rate": 2.730781896947754e-07, "loss": 0.5931, "num_input_tokens_seen": 74663488, "step": 128590 }, { "epoch": 19.1532618409294, "grad_norm": 0.5089389085769653, "learning_rate": 2.7259943185263813e-07, "loss": 0.5364, "num_input_tokens_seen": 74666432, "step": 128595 }, { "epoch": 19.15400655347036, "grad_norm": 1.1183513402938843, "learning_rate": 2.7212109175480114e-07, "loss": 0.4599, "num_input_tokens_seen": 74669184, "step": 128600 }, { "epoch": 19.15475126601132, "grad_norm": 3.8196303844451904, "learning_rate": 2.7164316940934966e-07, "loss": 0.5197, "num_input_tokens_seen": 74671968, "step": 128605 }, { "epoch": 19.155495978552278, "grad_norm": 1.386157512664795, "learning_rate": 2.7116566482434936e-07, "loss": 0.6917, "num_input_tokens_seen": 74674816, "step": 128610 }, { "epoch": 19.156240691093238, "grad_norm": 0.808174729347229, "learning_rate": 2.706885780078744e-07, "loss": 0.5204, "num_input_tokens_seen": 74677728, "step": 128615 }, { "epoch": 19.1569854036342, "grad_norm": 2.155815839767456, "learning_rate": 2.7021190896798223e-07, "loss": 0.4552, "num_input_tokens_seen": 74680672, "step": 128620 }, { "epoch": 19.157730116175156, "grad_norm": 1.0755560398101807, "learning_rate": 2.6973565771272746e-07, "loss": 0.6222, "num_input_tokens_seen": 74683584, "step": 128625 }, { "epoch": 19.158474828716116, "grad_norm": 1.7420421838760376, "learning_rate": 2.6925982425015097e-07, "loss": 0.643, "num_input_tokens_seen": 74686368, "step": 128630 }, { "epoch": 19.159219541257073, "grad_norm": 1.3048593997955322, "learning_rate": 2.6878440858829626e-07, "loss": 0.3725, "num_input_tokens_seen": 74688992, "step": 128635 }, { "epoch": 19.159964253798034, "grad_norm": 1.3793781995773315, "learning_rate": 2.68309410735193e-07, "loss": 0.6255, "num_input_tokens_seen": 74691904, "step": 128640 }, { "epoch": 19.160708966338994, "grad_norm": 1.7823305130004883, "learning_rate": 2.678348306988626e-07, "loss": 0.6532, "num_input_tokens_seen": 74694848, "step": 128645 }, { "epoch": 19.16145367887995, "grad_norm": 1.179361343383789, "learning_rate": 2.67360668487332e-07, "loss": 0.5067, "num_input_tokens_seen": 74697952, "step": 128650 }, { "epoch": 19.16219839142091, "grad_norm": 1.895214319229126, "learning_rate": 2.6688692410860025e-07, "loss": 0.562, "num_input_tokens_seen": 74700736, "step": 128655 }, { "epoch": 19.162943103961872, "grad_norm": 1.3924912214279175, "learning_rate": 2.664135975706805e-07, "loss": 0.6401, "num_input_tokens_seen": 74703520, "step": 128660 }, { "epoch": 19.16368781650283, "grad_norm": 1.2617729902267456, "learning_rate": 2.659406888815608e-07, "loss": 0.5991, "num_input_tokens_seen": 74706336, "step": 128665 }, { "epoch": 19.16443252904379, "grad_norm": 1.1225171089172363, "learning_rate": 2.6546819804923737e-07, "loss": 0.7144, "num_input_tokens_seen": 74709088, "step": 128670 }, { "epoch": 19.165177241584747, "grad_norm": 1.5212626457214355, "learning_rate": 2.6499612508169016e-07, "loss": 0.6462, "num_input_tokens_seen": 74712224, "step": 128675 }, { "epoch": 19.165921954125707, "grad_norm": 1.3252792358398438, "learning_rate": 2.645244699868932e-07, "loss": 0.494, "num_input_tokens_seen": 74715200, "step": 128680 }, { "epoch": 19.166666666666668, "grad_norm": 1.700024127960205, "learning_rate": 2.6405323277281514e-07, "loss": 0.5379, "num_input_tokens_seen": 74718176, "step": 128685 }, { "epoch": 19.167411379207625, "grad_norm": 2.1617085933685303, "learning_rate": 2.6358241344741906e-07, "loss": 0.628, "num_input_tokens_seen": 74720928, "step": 128690 }, { "epoch": 19.168156091748585, "grad_norm": 2.1484475135803223, "learning_rate": 2.6311201201865423e-07, "loss": 0.6078, "num_input_tokens_seen": 74723936, "step": 128695 }, { "epoch": 19.168900804289546, "grad_norm": 2.1002936363220215, "learning_rate": 2.626420284944725e-07, "loss": 0.5312, "num_input_tokens_seen": 74726784, "step": 128700 }, { "epoch": 19.169645516830503, "grad_norm": 3.094477653503418, "learning_rate": 2.6217246288281205e-07, "loss": 0.6289, "num_input_tokens_seen": 74729408, "step": 128705 }, { "epoch": 19.170390229371463, "grad_norm": 2.151071310043335, "learning_rate": 2.6170331519160264e-07, "loss": 0.7419, "num_input_tokens_seen": 74732640, "step": 128710 }, { "epoch": 19.17113494191242, "grad_norm": 2.3120744228363037, "learning_rate": 2.61234585428774e-07, "loss": 0.7946, "num_input_tokens_seen": 74735296, "step": 128715 }, { "epoch": 19.17187965445338, "grad_norm": 1.9372880458831787, "learning_rate": 2.607662736022448e-07, "loss": 0.5709, "num_input_tokens_seen": 74738112, "step": 128720 }, { "epoch": 19.17262436699434, "grad_norm": 1.6411248445510864, "learning_rate": 2.6029837971992545e-07, "loss": 0.6189, "num_input_tokens_seen": 74740864, "step": 128725 }, { "epoch": 19.1733690795353, "grad_norm": 1.159252643585205, "learning_rate": 2.5983090378972064e-07, "loss": 0.5877, "num_input_tokens_seen": 74743520, "step": 128730 }, { "epoch": 19.17411379207626, "grad_norm": 1.1365909576416016, "learning_rate": 2.5936384581952686e-07, "loss": 0.5282, "num_input_tokens_seen": 74746400, "step": 128735 }, { "epoch": 19.17485850461722, "grad_norm": 2.8940465450286865, "learning_rate": 2.5889720581723506e-07, "loss": 0.6461, "num_input_tokens_seen": 74749280, "step": 128740 }, { "epoch": 19.175603217158177, "grad_norm": 1.1458324193954468, "learning_rate": 2.584309837907306e-07, "loss": 0.4526, "num_input_tokens_seen": 74752192, "step": 128745 }, { "epoch": 19.176347929699137, "grad_norm": 1.3442429304122925, "learning_rate": 2.5796517974789045e-07, "loss": 0.6959, "num_input_tokens_seen": 74755360, "step": 128750 }, { "epoch": 19.177092642240094, "grad_norm": 2.0597848892211914, "learning_rate": 2.5749979369657783e-07, "loss": 0.5587, "num_input_tokens_seen": 74758304, "step": 128755 }, { "epoch": 19.177837354781055, "grad_norm": 2.2230355739593506, "learning_rate": 2.570348256446614e-07, "loss": 0.5606, "num_input_tokens_seen": 74761856, "step": 128760 }, { "epoch": 19.178582067322015, "grad_norm": 1.5992661714553833, "learning_rate": 2.5657027559999327e-07, "loss": 0.6861, "num_input_tokens_seen": 74764864, "step": 128765 }, { "epoch": 19.179326779862972, "grad_norm": 1.2681753635406494, "learning_rate": 2.561061435704226e-07, "loss": 0.5615, "num_input_tokens_seen": 74767488, "step": 128770 }, { "epoch": 19.180071492403933, "grad_norm": 0.9913285374641418, "learning_rate": 2.556424295637905e-07, "loss": 0.4242, "num_input_tokens_seen": 74770368, "step": 128775 }, { "epoch": 19.18081620494489, "grad_norm": 0.9849601984024048, "learning_rate": 2.5517913358792945e-07, "loss": 0.5131, "num_input_tokens_seen": 74773120, "step": 128780 }, { "epoch": 19.18156091748585, "grad_norm": 1.4267821311950684, "learning_rate": 2.547162556506694e-07, "loss": 0.3615, "num_input_tokens_seen": 74776000, "step": 128785 }, { "epoch": 19.18230563002681, "grad_norm": 1.1912895441055298, "learning_rate": 2.5425379575982343e-07, "loss": 0.506, "num_input_tokens_seen": 74778720, "step": 128790 }, { "epoch": 19.183050342567768, "grad_norm": 1.265641212463379, "learning_rate": 2.537917539232132e-07, "loss": 0.5409, "num_input_tokens_seen": 74781824, "step": 128795 }, { "epoch": 19.183795055108728, "grad_norm": 1.0059716701507568, "learning_rate": 2.5333013014864073e-07, "loss": 0.5486, "num_input_tokens_seen": 74784800, "step": 128800 }, { "epoch": 19.18453976764969, "grad_norm": 1.5566096305847168, "learning_rate": 2.528689244439025e-07, "loss": 0.5649, "num_input_tokens_seen": 74787872, "step": 128805 }, { "epoch": 19.185284480190646, "grad_norm": 2.086026430130005, "learning_rate": 2.524081368167924e-07, "loss": 0.5306, "num_input_tokens_seen": 74790496, "step": 128810 }, { "epoch": 19.186029192731606, "grad_norm": 1.2685714960098267, "learning_rate": 2.5194776727509584e-07, "loss": 0.4221, "num_input_tokens_seen": 74793120, "step": 128815 }, { "epoch": 19.186773905272563, "grad_norm": 1.3757859468460083, "learning_rate": 2.5148781582658986e-07, "loss": 0.5947, "num_input_tokens_seen": 74795808, "step": 128820 }, { "epoch": 19.187518617813524, "grad_norm": 1.854717493057251, "learning_rate": 2.5102828247904055e-07, "loss": 0.3355, "num_input_tokens_seen": 74798688, "step": 128825 }, { "epoch": 19.188263330354484, "grad_norm": 1.8787715435028076, "learning_rate": 2.5056916724021663e-07, "loss": 0.5612, "num_input_tokens_seen": 74801248, "step": 128830 }, { "epoch": 19.18900804289544, "grad_norm": 1.5239431858062744, "learning_rate": 2.5011047011787026e-07, "loss": 0.7325, "num_input_tokens_seen": 74803936, "step": 128835 }, { "epoch": 19.189752755436402, "grad_norm": 3.363400936126709, "learning_rate": 2.4965219111975635e-07, "loss": 0.8641, "num_input_tokens_seen": 74807008, "step": 128840 }, { "epoch": 19.190497467977362, "grad_norm": 1.869387149810791, "learning_rate": 2.491943302536104e-07, "loss": 0.6918, "num_input_tokens_seen": 74809952, "step": 128845 }, { "epoch": 19.19124218051832, "grad_norm": 0.9620616436004639, "learning_rate": 2.487368875271706e-07, "loss": 0.6015, "num_input_tokens_seen": 74813024, "step": 128850 }, { "epoch": 19.19198689305928, "grad_norm": 1.9951989650726318, "learning_rate": 2.4827986294816696e-07, "loss": 0.6638, "num_input_tokens_seen": 74816032, "step": 128855 }, { "epoch": 19.192731605600237, "grad_norm": 0.9890443682670593, "learning_rate": 2.478232565243183e-07, "loss": 0.6097, "num_input_tokens_seen": 74819200, "step": 128860 }, { "epoch": 19.193476318141197, "grad_norm": 1.4859992265701294, "learning_rate": 2.4736706826333775e-07, "loss": 0.5251, "num_input_tokens_seen": 74821888, "step": 128865 }, { "epoch": 19.194221030682158, "grad_norm": 3.518059253692627, "learning_rate": 2.4691129817293324e-07, "loss": 0.7789, "num_input_tokens_seen": 74825952, "step": 128870 }, { "epoch": 19.194965743223115, "grad_norm": 1.4733060598373413, "learning_rate": 2.4645594626080405e-07, "loss": 0.5435, "num_input_tokens_seen": 74828896, "step": 128875 }, { "epoch": 19.195710455764075, "grad_norm": 1.2697672843933105, "learning_rate": 2.460010125346468e-07, "loss": 0.6537, "num_input_tokens_seen": 74831648, "step": 128880 }, { "epoch": 19.196455168305036, "grad_norm": 2.074881076812744, "learning_rate": 2.455464970021415e-07, "loss": 0.6188, "num_input_tokens_seen": 74834592, "step": 128885 }, { "epoch": 19.197199880845993, "grad_norm": 1.1975294351577759, "learning_rate": 2.450923996709681e-07, "loss": 0.6479, "num_input_tokens_seen": 74837504, "step": 128890 }, { "epoch": 19.197944593386953, "grad_norm": 1.1921988725662231, "learning_rate": 2.446387205487982e-07, "loss": 0.5996, "num_input_tokens_seen": 74840352, "step": 128895 }, { "epoch": 19.19868930592791, "grad_norm": 1.1812800168991089, "learning_rate": 2.44185459643298e-07, "loss": 0.6204, "num_input_tokens_seen": 74843456, "step": 128900 }, { "epoch": 19.19943401846887, "grad_norm": 1.1428200006484985, "learning_rate": 2.4373261696212237e-07, "loss": 0.4983, "num_input_tokens_seen": 74846240, "step": 128905 }, { "epoch": 19.20017873100983, "grad_norm": 1.3693724870681763, "learning_rate": 2.4328019251292355e-07, "loss": 0.5085, "num_input_tokens_seen": 74849184, "step": 128910 }, { "epoch": 19.20092344355079, "grad_norm": 1.4648587703704834, "learning_rate": 2.4282818630334547e-07, "loss": 0.6647, "num_input_tokens_seen": 74852160, "step": 128915 }, { "epoch": 19.20166815609175, "grad_norm": 2.6887564659118652, "learning_rate": 2.4237659834102364e-07, "loss": 0.5845, "num_input_tokens_seen": 74855136, "step": 128920 }, { "epoch": 19.202412868632706, "grad_norm": 1.2126872539520264, "learning_rate": 2.4192542863358534e-07, "loss": 0.6305, "num_input_tokens_seen": 74858272, "step": 128925 }, { "epoch": 19.203157581173667, "grad_norm": 1.2118855714797974, "learning_rate": 2.4147467718865227e-07, "loss": 0.5319, "num_input_tokens_seen": 74861440, "step": 128930 }, { "epoch": 19.203902293714627, "grad_norm": 1.4849774837493896, "learning_rate": 2.410243440138432e-07, "loss": 0.6619, "num_input_tokens_seen": 74864160, "step": 128935 }, { "epoch": 19.204647006255584, "grad_norm": 1.7324634790420532, "learning_rate": 2.405744291167633e-07, "loss": 0.722, "num_input_tokens_seen": 74866976, "step": 128940 }, { "epoch": 19.205391718796545, "grad_norm": 1.490032434463501, "learning_rate": 2.4012493250501476e-07, "loss": 0.5076, "num_input_tokens_seen": 74869664, "step": 128945 }, { "epoch": 19.206136431337505, "grad_norm": 1.4842283725738525, "learning_rate": 2.3967585418619153e-07, "loss": 0.628, "num_input_tokens_seen": 74872480, "step": 128950 }, { "epoch": 19.206881143878462, "grad_norm": 1.513866901397705, "learning_rate": 2.392271941678792e-07, "loss": 0.5186, "num_input_tokens_seen": 74875232, "step": 128955 }, { "epoch": 19.207625856419423, "grad_norm": 1.82511305809021, "learning_rate": 2.387789524576578e-07, "loss": 0.6791, "num_input_tokens_seen": 74878080, "step": 128960 }, { "epoch": 19.20837056896038, "grad_norm": 1.7109525203704834, "learning_rate": 2.383311290630963e-07, "loss": 0.6151, "num_input_tokens_seen": 74880992, "step": 128965 }, { "epoch": 19.20911528150134, "grad_norm": 1.2256650924682617, "learning_rate": 2.3788372399176638e-07, "loss": 0.6096, "num_input_tokens_seen": 74884288, "step": 128970 }, { "epoch": 19.2098599940423, "grad_norm": 2.01631498336792, "learning_rate": 2.3743673725122318e-07, "loss": 0.7852, "num_input_tokens_seen": 74887008, "step": 128975 }, { "epoch": 19.210604706583258, "grad_norm": 1.6226625442504883, "learning_rate": 2.3699016884901893e-07, "loss": 0.7286, "num_input_tokens_seen": 74889856, "step": 128980 }, { "epoch": 19.21134941912422, "grad_norm": 1.5753799676895142, "learning_rate": 2.365440187926976e-07, "loss": 0.5256, "num_input_tokens_seen": 74892544, "step": 128985 }, { "epoch": 19.21209413166518, "grad_norm": 2.3377978801727295, "learning_rate": 2.3609828708979765e-07, "loss": 0.6063, "num_input_tokens_seen": 74895360, "step": 128990 }, { "epoch": 19.212838844206136, "grad_norm": 1.5925655364990234, "learning_rate": 2.3565297374784635e-07, "loss": 0.5806, "num_input_tokens_seen": 74898240, "step": 128995 }, { "epoch": 19.213583556747096, "grad_norm": 1.1163744926452637, "learning_rate": 2.35208078774371e-07, "loss": 0.5319, "num_input_tokens_seen": 74901152, "step": 129000 }, { "epoch": 19.214328269288053, "grad_norm": 1.401721477508545, "learning_rate": 2.3476360217688508e-07, "loss": 0.6082, "num_input_tokens_seen": 74903968, "step": 129005 }, { "epoch": 19.215072981829014, "grad_norm": 0.9297223687171936, "learning_rate": 2.3431954396289645e-07, "loss": 0.6165, "num_input_tokens_seen": 74906816, "step": 129010 }, { "epoch": 19.215817694369974, "grad_norm": 1.8174142837524414, "learning_rate": 2.3387590413991022e-07, "loss": 0.6514, "num_input_tokens_seen": 74909632, "step": 129015 }, { "epoch": 19.21656240691093, "grad_norm": 1.0242177248001099, "learning_rate": 2.3343268271541764e-07, "loss": 0.4607, "num_input_tokens_seen": 74912224, "step": 129020 }, { "epoch": 19.217307119451892, "grad_norm": 1.5830857753753662, "learning_rate": 2.329898796969099e-07, "loss": 0.6152, "num_input_tokens_seen": 74915200, "step": 129025 }, { "epoch": 19.218051831992852, "grad_norm": 0.9828091859817505, "learning_rate": 2.3254749509186434e-07, "loss": 0.4682, "num_input_tokens_seen": 74917984, "step": 129030 }, { "epoch": 19.21879654453381, "grad_norm": 2.30234956741333, "learning_rate": 2.321055289077584e-07, "loss": 0.6209, "num_input_tokens_seen": 74920928, "step": 129035 }, { "epoch": 19.21954125707477, "grad_norm": 1.9878782033920288, "learning_rate": 2.3166398115205545e-07, "loss": 0.6105, "num_input_tokens_seen": 74923520, "step": 129040 }, { "epoch": 19.220285969615727, "grad_norm": 1.382944107055664, "learning_rate": 2.3122285183221627e-07, "loss": 0.5337, "num_input_tokens_seen": 74926528, "step": 129045 }, { "epoch": 19.221030682156687, "grad_norm": 2.8362441062927246, "learning_rate": 2.3078214095569318e-07, "loss": 0.5628, "num_input_tokens_seen": 74929152, "step": 129050 }, { "epoch": 19.221775394697648, "grad_norm": 2.4156548976898193, "learning_rate": 2.3034184852993025e-07, "loss": 0.6302, "num_input_tokens_seen": 74932160, "step": 129055 }, { "epoch": 19.222520107238605, "grad_norm": 1.3557205200195312, "learning_rate": 2.2990197456236873e-07, "loss": 0.5945, "num_input_tokens_seen": 74934688, "step": 129060 }, { "epoch": 19.223264819779565, "grad_norm": 1.3935216665267944, "learning_rate": 2.2946251906043604e-07, "loss": 0.485, "num_input_tokens_seen": 74937568, "step": 129065 }, { "epoch": 19.224009532320526, "grad_norm": 1.6610243320465088, "learning_rate": 2.2902348203155955e-07, "loss": 0.8283, "num_input_tokens_seen": 74940864, "step": 129070 }, { "epoch": 19.224754244861483, "grad_norm": 1.3509159088134766, "learning_rate": 2.2858486348315555e-07, "loss": 0.4454, "num_input_tokens_seen": 74943776, "step": 129075 }, { "epoch": 19.225498957402444, "grad_norm": 2.3809704780578613, "learning_rate": 2.281466634226348e-07, "loss": 0.8876, "num_input_tokens_seen": 74946592, "step": 129080 }, { "epoch": 19.2262436699434, "grad_norm": 1.1778696775436401, "learning_rate": 2.277088818573969e-07, "loss": 0.5253, "num_input_tokens_seen": 74949696, "step": 129085 }, { "epoch": 19.22698838248436, "grad_norm": 1.3687916994094849, "learning_rate": 2.2727151879484155e-07, "loss": 0.5626, "num_input_tokens_seen": 74952512, "step": 129090 }, { "epoch": 19.22773309502532, "grad_norm": 1.1373382806777954, "learning_rate": 2.2683457424235722e-07, "loss": 0.5544, "num_input_tokens_seen": 74955296, "step": 129095 }, { "epoch": 19.22847780756628, "grad_norm": 2.7967116832733154, "learning_rate": 2.2639804820732135e-07, "loss": 0.7178, "num_input_tokens_seen": 74958208, "step": 129100 }, { "epoch": 19.22922252010724, "grad_norm": 1.1892496347427368, "learning_rate": 2.259619406971142e-07, "loss": 0.5079, "num_input_tokens_seen": 74961056, "step": 129105 }, { "epoch": 19.229967232648196, "grad_norm": 0.9269422888755798, "learning_rate": 2.2552625171909925e-07, "loss": 0.5592, "num_input_tokens_seen": 74964064, "step": 129110 }, { "epoch": 19.230711945189157, "grad_norm": 1.3850189447402954, "learning_rate": 2.250909812806401e-07, "loss": 0.6267, "num_input_tokens_seen": 74966848, "step": 129115 }, { "epoch": 19.231456657730117, "grad_norm": 1.6733245849609375, "learning_rate": 2.246561293890892e-07, "loss": 0.6951, "num_input_tokens_seen": 74969664, "step": 129120 }, { "epoch": 19.232201370271074, "grad_norm": 1.4669034481048584, "learning_rate": 2.2422169605178788e-07, "loss": 0.523, "num_input_tokens_seen": 74972384, "step": 129125 }, { "epoch": 19.232946082812035, "grad_norm": 0.8088966608047485, "learning_rate": 2.2378768127608584e-07, "loss": 0.7029, "num_input_tokens_seen": 74975360, "step": 129130 }, { "epoch": 19.233690795352995, "grad_norm": 1.709090232849121, "learning_rate": 2.23354085069305e-07, "loss": 0.7003, "num_input_tokens_seen": 74978272, "step": 129135 }, { "epoch": 19.234435507893952, "grad_norm": 2.7239151000976562, "learning_rate": 2.2292090743877836e-07, "loss": 0.5455, "num_input_tokens_seen": 74981088, "step": 129140 }, { "epoch": 19.235180220434913, "grad_norm": 2.0129001140594482, "learning_rate": 2.2248814839181953e-07, "loss": 0.5639, "num_input_tokens_seen": 74983840, "step": 129145 }, { "epoch": 19.23592493297587, "grad_norm": 1.424055576324463, "learning_rate": 2.2205580793573932e-07, "loss": 0.5336, "num_input_tokens_seen": 74986784, "step": 129150 }, { "epoch": 19.23666964551683, "grad_norm": 1.041540503501892, "learning_rate": 2.2162388607784578e-07, "loss": 0.655, "num_input_tokens_seen": 74989472, "step": 129155 }, { "epoch": 19.23741435805779, "grad_norm": 1.6755971908569336, "learning_rate": 2.2119238282543032e-07, "loss": 0.478, "num_input_tokens_seen": 74992416, "step": 129160 }, { "epoch": 19.238159070598748, "grad_norm": 1.0479710102081299, "learning_rate": 2.2076129818578706e-07, "loss": 0.5836, "num_input_tokens_seen": 74995392, "step": 129165 }, { "epoch": 19.23890378313971, "grad_norm": 1.6422255039215088, "learning_rate": 2.203306321661963e-07, "loss": 0.4588, "num_input_tokens_seen": 74998112, "step": 129170 }, { "epoch": 19.23964849568067, "grad_norm": 2.0122461318969727, "learning_rate": 2.1990038477393559e-07, "loss": 0.626, "num_input_tokens_seen": 75001024, "step": 129175 }, { "epoch": 19.240393208221626, "grad_norm": 1.816483736038208, "learning_rate": 2.1947055601627132e-07, "loss": 0.5787, "num_input_tokens_seen": 75004256, "step": 129180 }, { "epoch": 19.241137920762586, "grad_norm": 0.9258838891983032, "learning_rate": 2.190411459004671e-07, "loss": 0.4575, "num_input_tokens_seen": 75007040, "step": 129185 }, { "epoch": 19.241882633303543, "grad_norm": 1.5553628206253052, "learning_rate": 2.1861215443377547e-07, "loss": 0.4681, "num_input_tokens_seen": 75009888, "step": 129190 }, { "epoch": 19.242627345844504, "grad_norm": 2.0301804542541504, "learning_rate": 2.1818358162344622e-07, "loss": 0.6061, "num_input_tokens_seen": 75012448, "step": 129195 }, { "epoch": 19.243372058385464, "grad_norm": 1.5986279249191284, "learning_rate": 2.1775542747671795e-07, "loss": 0.8999, "num_input_tokens_seen": 75015360, "step": 129200 }, { "epoch": 19.24411677092642, "grad_norm": 2.1615302562713623, "learning_rate": 2.173276920008238e-07, "loss": 0.6075, "num_input_tokens_seen": 75018464, "step": 129205 }, { "epoch": 19.244861483467382, "grad_norm": 1.0863316059112549, "learning_rate": 2.1690037520299134e-07, "loss": 0.6791, "num_input_tokens_seen": 75021760, "step": 129210 }, { "epoch": 19.245606196008342, "grad_norm": 1.673682689666748, "learning_rate": 2.1647347709043696e-07, "loss": 0.4997, "num_input_tokens_seen": 75024864, "step": 129215 }, { "epoch": 19.2463509085493, "grad_norm": 1.7609210014343262, "learning_rate": 2.160469976703744e-07, "loss": 0.6713, "num_input_tokens_seen": 75027296, "step": 129220 }, { "epoch": 19.24709562109026, "grad_norm": 2.1668319702148438, "learning_rate": 2.1562093695000897e-07, "loss": 0.6434, "num_input_tokens_seen": 75030144, "step": 129225 }, { "epoch": 19.247840333631217, "grad_norm": 1.5349562168121338, "learning_rate": 2.1519529493654045e-07, "loss": 0.65, "num_input_tokens_seen": 75033088, "step": 129230 }, { "epoch": 19.248585046172177, "grad_norm": 3.8175323009490967, "learning_rate": 2.14770071637152e-07, "loss": 0.5676, "num_input_tokens_seen": 75035968, "step": 129235 }, { "epoch": 19.249329758713138, "grad_norm": 1.5302387475967407, "learning_rate": 2.143452670590379e-07, "loss": 0.4906, "num_input_tokens_seen": 75038944, "step": 129240 }, { "epoch": 19.250074471254095, "grad_norm": 1.3692563772201538, "learning_rate": 2.1392088120936737e-07, "loss": 0.5968, "num_input_tokens_seen": 75041984, "step": 129245 }, { "epoch": 19.250819183795056, "grad_norm": 1.6649744510650635, "learning_rate": 2.1349691409530968e-07, "loss": 0.6474, "num_input_tokens_seen": 75044736, "step": 129250 }, { "epoch": 19.251563896336016, "grad_norm": 1.1893012523651123, "learning_rate": 2.1307336572403415e-07, "loss": 0.557, "num_input_tokens_seen": 75047808, "step": 129255 }, { "epoch": 19.252308608876973, "grad_norm": 1.8768105506896973, "learning_rate": 2.1265023610268776e-07, "loss": 0.7704, "num_input_tokens_seen": 75050688, "step": 129260 }, { "epoch": 19.253053321417934, "grad_norm": 1.5243147611618042, "learning_rate": 2.1222752523842594e-07, "loss": 0.5472, "num_input_tokens_seen": 75053760, "step": 129265 }, { "epoch": 19.25379803395889, "grad_norm": 3.1147053241729736, "learning_rate": 2.1180523313838462e-07, "loss": 0.7292, "num_input_tokens_seen": 75056544, "step": 129270 }, { "epoch": 19.25454274649985, "grad_norm": 2.1371591091156006, "learning_rate": 2.1138335980970258e-07, "loss": 0.5417, "num_input_tokens_seen": 75059520, "step": 129275 }, { "epoch": 19.25528745904081, "grad_norm": 0.6444303393363953, "learning_rate": 2.1096190525950464e-07, "loss": 0.2912, "num_input_tokens_seen": 75062208, "step": 129280 }, { "epoch": 19.25603217158177, "grad_norm": 1.7157981395721436, "learning_rate": 2.1054086949491013e-07, "loss": 0.5143, "num_input_tokens_seen": 75065184, "step": 129285 }, { "epoch": 19.25677688412273, "grad_norm": 2.4643704891204834, "learning_rate": 2.101202525230328e-07, "loss": 0.5711, "num_input_tokens_seen": 75067904, "step": 129290 }, { "epoch": 19.257521596663686, "grad_norm": 1.79557204246521, "learning_rate": 2.0970005435097807e-07, "loss": 0.707, "num_input_tokens_seen": 75070848, "step": 129295 }, { "epoch": 19.258266309204647, "grad_norm": 1.498618245124817, "learning_rate": 2.0928027498584579e-07, "loss": 0.7774, "num_input_tokens_seen": 75073984, "step": 129300 }, { "epoch": 19.259011021745607, "grad_norm": 1.4334948062896729, "learning_rate": 2.0886091443472477e-07, "loss": 0.5427, "num_input_tokens_seen": 75076768, "step": 129305 }, { "epoch": 19.259755734286564, "grad_norm": 1.6045620441436768, "learning_rate": 2.084419727047038e-07, "loss": 0.4029, "num_input_tokens_seen": 75079968, "step": 129310 }, { "epoch": 19.260500446827525, "grad_norm": 1.30390465259552, "learning_rate": 2.0802344980285771e-07, "loss": 0.4708, "num_input_tokens_seen": 75082592, "step": 129315 }, { "epoch": 19.261245159368485, "grad_norm": 1.5510073900222778, "learning_rate": 2.0760534573626144e-07, "loss": 0.5736, "num_input_tokens_seen": 75085696, "step": 129320 }, { "epoch": 19.261989871909442, "grad_norm": 1.1206958293914795, "learning_rate": 2.0718766051197048e-07, "loss": 0.6797, "num_input_tokens_seen": 75089184, "step": 129325 }, { "epoch": 19.262734584450403, "grad_norm": 1.4046515226364136, "learning_rate": 2.0677039413704857e-07, "loss": 0.5637, "num_input_tokens_seen": 75091872, "step": 129330 }, { "epoch": 19.26347929699136, "grad_norm": 1.3594484329223633, "learning_rate": 2.063535466185429e-07, "loss": 0.6408, "num_input_tokens_seen": 75094944, "step": 129335 }, { "epoch": 19.26422400953232, "grad_norm": 1.4615494012832642, "learning_rate": 2.0593711796349225e-07, "loss": 0.6046, "num_input_tokens_seen": 75097568, "step": 129340 }, { "epoch": 19.26496872207328, "grad_norm": 3.029944896697998, "learning_rate": 2.0552110817893544e-07, "loss": 0.7683, "num_input_tokens_seen": 75100160, "step": 129345 }, { "epoch": 19.265713434614238, "grad_norm": 1.34158456325531, "learning_rate": 2.051055172719002e-07, "loss": 0.7272, "num_input_tokens_seen": 75103232, "step": 129350 }, { "epoch": 19.2664581471552, "grad_norm": 0.9187223315238953, "learning_rate": 2.0469034524940588e-07, "loss": 0.4914, "num_input_tokens_seen": 75106656, "step": 129355 }, { "epoch": 19.26720285969616, "grad_norm": 1.3946768045425415, "learning_rate": 2.0427559211846915e-07, "loss": 0.5166, "num_input_tokens_seen": 75109504, "step": 129360 }, { "epoch": 19.267947572237116, "grad_norm": 1.2408857345581055, "learning_rate": 2.0386125788609266e-07, "loss": 0.7208, "num_input_tokens_seen": 75112384, "step": 129365 }, { "epoch": 19.268692284778076, "grad_norm": 1.6605277061462402, "learning_rate": 2.034473425592792e-07, "loss": 0.3724, "num_input_tokens_seen": 75115648, "step": 129370 }, { "epoch": 19.269436997319033, "grad_norm": 2.4388294219970703, "learning_rate": 2.0303384614502042e-07, "loss": 0.557, "num_input_tokens_seen": 75118496, "step": 129375 }, { "epoch": 19.270181709859994, "grad_norm": 1.4886102676391602, "learning_rate": 2.0262076865030232e-07, "loss": 0.5024, "num_input_tokens_seen": 75121088, "step": 129380 }, { "epoch": 19.270926422400954, "grad_norm": 2.5962257385253906, "learning_rate": 2.022081100821055e-07, "loss": 0.5512, "num_input_tokens_seen": 75123776, "step": 129385 }, { "epoch": 19.27167113494191, "grad_norm": 1.308861494064331, "learning_rate": 2.0179587044739655e-07, "loss": 0.6316, "num_input_tokens_seen": 75126912, "step": 129390 }, { "epoch": 19.272415847482872, "grad_norm": 1.6716161966323853, "learning_rate": 2.0138404975314495e-07, "loss": 0.5254, "num_input_tokens_seen": 75129696, "step": 129395 }, { "epoch": 19.273160560023832, "grad_norm": 2.219970464706421, "learning_rate": 2.0097264800630344e-07, "loss": 0.7527, "num_input_tokens_seen": 75132512, "step": 129400 }, { "epoch": 19.27390527256479, "grad_norm": 1.8442461490631104, "learning_rate": 2.0056166521382759e-07, "loss": 0.6488, "num_input_tokens_seen": 75135360, "step": 129405 }, { "epoch": 19.27464998510575, "grad_norm": 1.8371187448501587, "learning_rate": 2.0015110138265624e-07, "loss": 0.7336, "num_input_tokens_seen": 75138240, "step": 129410 }, { "epoch": 19.275394697646707, "grad_norm": 1.0580857992172241, "learning_rate": 1.997409565197228e-07, "loss": 0.6781, "num_input_tokens_seen": 75141152, "step": 129415 }, { "epoch": 19.276139410187668, "grad_norm": 1.3085740804672241, "learning_rate": 1.9933123063196335e-07, "loss": 0.4223, "num_input_tokens_seen": 75143840, "step": 129420 }, { "epoch": 19.276884122728628, "grad_norm": 1.798923373222351, "learning_rate": 1.9892192372629737e-07, "loss": 0.6856, "num_input_tokens_seen": 75146592, "step": 129425 }, { "epoch": 19.277628835269585, "grad_norm": 1.8338851928710938, "learning_rate": 1.9851303580963599e-07, "loss": 0.6226, "num_input_tokens_seen": 75149216, "step": 129430 }, { "epoch": 19.278373547810546, "grad_norm": 2.6018245220184326, "learning_rate": 1.9810456688889313e-07, "loss": 0.5275, "num_input_tokens_seen": 75151744, "step": 129435 }, { "epoch": 19.279118260351503, "grad_norm": 1.783555030822754, "learning_rate": 1.9769651697096326e-07, "loss": 0.6009, "num_input_tokens_seen": 75154688, "step": 129440 }, { "epoch": 19.279862972892463, "grad_norm": 2.8541948795318604, "learning_rate": 1.9728888606274365e-07, "loss": 0.658, "num_input_tokens_seen": 75157408, "step": 129445 }, { "epoch": 19.280607685433424, "grad_norm": 1.8719127178192139, "learning_rate": 1.9688167417112047e-07, "loss": 0.7008, "num_input_tokens_seen": 75160352, "step": 129450 }, { "epoch": 19.28135239797438, "grad_norm": 0.9662153720855713, "learning_rate": 1.9647488130297154e-07, "loss": 0.6651, "num_input_tokens_seen": 75163328, "step": 129455 }, { "epoch": 19.28209711051534, "grad_norm": 1.3240026235580444, "learning_rate": 1.960685074651719e-07, "loss": 0.5816, "num_input_tokens_seen": 75166176, "step": 129460 }, { "epoch": 19.2828418230563, "grad_norm": 1.1174473762512207, "learning_rate": 1.9566255266458278e-07, "loss": 0.469, "num_input_tokens_seen": 75168896, "step": 129465 }, { "epoch": 19.28358653559726, "grad_norm": 2.133798360824585, "learning_rate": 1.9525701690806807e-07, "loss": 0.4937, "num_input_tokens_seen": 75172160, "step": 129470 }, { "epoch": 19.28433124813822, "grad_norm": 2.2869884967803955, "learning_rate": 1.948519002024751e-07, "loss": 0.7726, "num_input_tokens_seen": 75175072, "step": 129475 }, { "epoch": 19.285075960679176, "grad_norm": 2.1390151977539062, "learning_rate": 1.9444720255464844e-07, "loss": 0.6057, "num_input_tokens_seen": 75178144, "step": 129480 }, { "epoch": 19.285820673220137, "grad_norm": 1.1849154233932495, "learning_rate": 1.940429239714242e-07, "loss": 0.5293, "num_input_tokens_seen": 75180800, "step": 129485 }, { "epoch": 19.286565385761097, "grad_norm": 1.0830103158950806, "learning_rate": 1.936390644596303e-07, "loss": 0.4587, "num_input_tokens_seen": 75183360, "step": 129490 }, { "epoch": 19.287310098302054, "grad_norm": 2.670008659362793, "learning_rate": 1.932356240260974e-07, "loss": 0.5908, "num_input_tokens_seen": 75186240, "step": 129495 }, { "epoch": 19.288054810843015, "grad_norm": 1.2161535024642944, "learning_rate": 1.9283260267763115e-07, "loss": 0.5424, "num_input_tokens_seen": 75189088, "step": 129500 }, { "epoch": 19.288799523383975, "grad_norm": 1.5058696269989014, "learning_rate": 1.924300004210483e-07, "loss": 0.6303, "num_input_tokens_seen": 75192064, "step": 129505 }, { "epoch": 19.289544235924932, "grad_norm": 1.281674861907959, "learning_rate": 1.9202781726314622e-07, "loss": 0.4859, "num_input_tokens_seen": 75195136, "step": 129510 }, { "epoch": 19.290288948465893, "grad_norm": 2.0100667476654053, "learning_rate": 1.9162605321072224e-07, "loss": 0.5822, "num_input_tokens_seen": 75198080, "step": 129515 }, { "epoch": 19.29103366100685, "grad_norm": 1.4528898000717163, "learning_rate": 1.9122470827055984e-07, "loss": 0.4639, "num_input_tokens_seen": 75201184, "step": 129520 }, { "epoch": 19.29177837354781, "grad_norm": 1.6124861240386963, "learning_rate": 1.9082378244944242e-07, "loss": 0.5229, "num_input_tokens_seen": 75204064, "step": 129525 }, { "epoch": 19.29252308608877, "grad_norm": 2.1621739864349365, "learning_rate": 1.9042327575414242e-07, "loss": 0.6565, "num_input_tokens_seen": 75207328, "step": 129530 }, { "epoch": 19.293267798629728, "grad_norm": 2.253598928451538, "learning_rate": 1.9002318819142661e-07, "loss": 0.621, "num_input_tokens_seen": 75210400, "step": 129535 }, { "epoch": 19.29401251117069, "grad_norm": 1.218937635421753, "learning_rate": 1.8962351976805348e-07, "loss": 0.3817, "num_input_tokens_seen": 75213248, "step": 129540 }, { "epoch": 19.29475722371165, "grad_norm": 2.6170542240142822, "learning_rate": 1.892242704907732e-07, "loss": 0.7048, "num_input_tokens_seen": 75215936, "step": 129545 }, { "epoch": 19.295501936252606, "grad_norm": 1.486433982849121, "learning_rate": 1.888254403663331e-07, "loss": 0.5166, "num_input_tokens_seen": 75218464, "step": 129550 }, { "epoch": 19.296246648793566, "grad_norm": 1.5094871520996094, "learning_rate": 1.884270294014695e-07, "loss": 0.5253, "num_input_tokens_seen": 75221024, "step": 129555 }, { "epoch": 19.296991361334523, "grad_norm": 0.7629368305206299, "learning_rate": 1.880290376029159e-07, "loss": 0.4723, "num_input_tokens_seen": 75223808, "step": 129560 }, { "epoch": 19.297736073875484, "grad_norm": 2.5888593196868896, "learning_rate": 1.8763146497739194e-07, "loss": 0.6899, "num_input_tokens_seen": 75226784, "step": 129565 }, { "epoch": 19.298480786416444, "grad_norm": 1.2711200714111328, "learning_rate": 1.872343115316144e-07, "loss": 0.5374, "num_input_tokens_seen": 75229632, "step": 129570 }, { "epoch": 19.2992254989574, "grad_norm": 1.0585323572158813, "learning_rate": 1.8683757727229745e-07, "loss": 0.4738, "num_input_tokens_seen": 75232608, "step": 129575 }, { "epoch": 19.299970211498362, "grad_norm": 1.192095398902893, "learning_rate": 1.864412622061412e-07, "loss": 0.4, "num_input_tokens_seen": 75235264, "step": 129580 }, { "epoch": 19.300714924039323, "grad_norm": 1.4413659572601318, "learning_rate": 1.8604536633984037e-07, "loss": 0.3946, "num_input_tokens_seen": 75238080, "step": 129585 }, { "epoch": 19.30145963658028, "grad_norm": 1.1212061643600464, "learning_rate": 1.8564988968008124e-07, "loss": 0.615, "num_input_tokens_seen": 75240832, "step": 129590 }, { "epoch": 19.30220434912124, "grad_norm": 1.392037034034729, "learning_rate": 1.8525483223354734e-07, "loss": 0.5216, "num_input_tokens_seen": 75243744, "step": 129595 }, { "epoch": 19.302949061662197, "grad_norm": 1.2340558767318726, "learning_rate": 1.848601940069139e-07, "loss": 0.5161, "num_input_tokens_seen": 75246688, "step": 129600 }, { "epoch": 19.303693774203158, "grad_norm": 1.3126447200775146, "learning_rate": 1.8446597500684503e-07, "loss": 0.5052, "num_input_tokens_seen": 75249856, "step": 129605 }, { "epoch": 19.304438486744118, "grad_norm": 1.307114601135254, "learning_rate": 1.8407217524000486e-07, "loss": 0.6008, "num_input_tokens_seen": 75252640, "step": 129610 }, { "epoch": 19.305183199285075, "grad_norm": 1.2037594318389893, "learning_rate": 1.8367879471304084e-07, "loss": 0.7155, "num_input_tokens_seen": 75255648, "step": 129615 }, { "epoch": 19.305927911826036, "grad_norm": 1.8126670122146606, "learning_rate": 1.832858334326032e-07, "loss": 0.6116, "num_input_tokens_seen": 75258592, "step": 129620 }, { "epoch": 19.306672624366993, "grad_norm": 1.2729500532150269, "learning_rate": 1.828932914053255e-07, "loss": 0.5364, "num_input_tokens_seen": 75261472, "step": 129625 }, { "epoch": 19.307417336907953, "grad_norm": 1.5483062267303467, "learning_rate": 1.8250116863784694e-07, "loss": 0.5806, "num_input_tokens_seen": 75264288, "step": 129630 }, { "epoch": 19.308162049448914, "grad_norm": 1.1672985553741455, "learning_rate": 1.8210946513678439e-07, "loss": 0.671, "num_input_tokens_seen": 75267328, "step": 129635 }, { "epoch": 19.30890676198987, "grad_norm": 3.585944652557373, "learning_rate": 1.8171818090876037e-07, "loss": 0.6984, "num_input_tokens_seen": 75270272, "step": 129640 }, { "epoch": 19.30965147453083, "grad_norm": 2.047689437866211, "learning_rate": 1.8132731596038345e-07, "loss": 0.6748, "num_input_tokens_seen": 75273120, "step": 129645 }, { "epoch": 19.31039618707179, "grad_norm": 1.5844241380691528, "learning_rate": 1.8093687029825666e-07, "loss": 0.5436, "num_input_tokens_seen": 75276352, "step": 129650 }, { "epoch": 19.31114089961275, "grad_norm": 1.3432871103286743, "learning_rate": 1.8054684392897758e-07, "loss": 0.6457, "num_input_tokens_seen": 75279232, "step": 129655 }, { "epoch": 19.31188561215371, "grad_norm": 2.397984266281128, "learning_rate": 1.8015723685913255e-07, "loss": 0.5353, "num_input_tokens_seen": 75281984, "step": 129660 }, { "epoch": 19.312630324694666, "grad_norm": 1.381500005722046, "learning_rate": 1.797680490953052e-07, "loss": 0.4086, "num_input_tokens_seen": 75284832, "step": 129665 }, { "epoch": 19.313375037235627, "grad_norm": 2.2352895736694336, "learning_rate": 1.7937928064407085e-07, "loss": 0.6527, "num_input_tokens_seen": 75287680, "step": 129670 }, { "epoch": 19.314119749776587, "grad_norm": 0.9803788065910339, "learning_rate": 1.7899093151199643e-07, "loss": 0.5692, "num_input_tokens_seen": 75290400, "step": 129675 }, { "epoch": 19.314864462317544, "grad_norm": 1.8248896598815918, "learning_rate": 1.7860300170564613e-07, "loss": 0.512, "num_input_tokens_seen": 75293472, "step": 129680 }, { "epoch": 19.315609174858505, "grad_norm": 1.8149425983428955, "learning_rate": 1.7821549123156755e-07, "loss": 0.5523, "num_input_tokens_seen": 75296352, "step": 129685 }, { "epoch": 19.316353887399465, "grad_norm": 2.7215423583984375, "learning_rate": 1.7782840009631375e-07, "loss": 0.6568, "num_input_tokens_seen": 75299456, "step": 129690 }, { "epoch": 19.317098599940422, "grad_norm": 1.455173373222351, "learning_rate": 1.7744172830641835e-07, "loss": 0.634, "num_input_tokens_seen": 75302272, "step": 129695 }, { "epoch": 19.317843312481383, "grad_norm": 1.346251130104065, "learning_rate": 1.7705547586841785e-07, "loss": 0.5608, "num_input_tokens_seen": 75305184, "step": 129700 }, { "epoch": 19.31858802502234, "grad_norm": 1.4509772062301636, "learning_rate": 1.7666964278883202e-07, "loss": 0.607, "num_input_tokens_seen": 75307936, "step": 129705 }, { "epoch": 19.3193327375633, "grad_norm": 1.9135231971740723, "learning_rate": 1.7628422907418894e-07, "loss": 0.5741, "num_input_tokens_seen": 75310848, "step": 129710 }, { "epoch": 19.32007745010426, "grad_norm": 1.463983178138733, "learning_rate": 1.7589923473098902e-07, "loss": 0.635, "num_input_tokens_seen": 75313664, "step": 129715 }, { "epoch": 19.320822162645218, "grad_norm": 2.1246838569641113, "learning_rate": 1.7551465976574643e-07, "loss": 0.4743, "num_input_tokens_seen": 75316640, "step": 129720 }, { "epoch": 19.32156687518618, "grad_norm": 1.6468137502670288, "learning_rate": 1.7513050418495047e-07, "loss": 0.615, "num_input_tokens_seen": 75319680, "step": 129725 }, { "epoch": 19.32231158772714, "grad_norm": 1.5093448162078857, "learning_rate": 1.7474676799509314e-07, "loss": 0.4916, "num_input_tokens_seen": 75322912, "step": 129730 }, { "epoch": 19.323056300268096, "grad_norm": 3.1627657413482666, "learning_rate": 1.7436345120266095e-07, "loss": 0.7075, "num_input_tokens_seen": 75326272, "step": 129735 }, { "epoch": 19.323801012809056, "grad_norm": 1.3568392992019653, "learning_rate": 1.739805538141237e-07, "loss": 0.6765, "num_input_tokens_seen": 75329248, "step": 129740 }, { "epoch": 19.324545725350013, "grad_norm": 1.3756338357925415, "learning_rate": 1.73598075835954e-07, "loss": 0.579, "num_input_tokens_seen": 75332128, "step": 129745 }, { "epoch": 19.325290437890974, "grad_norm": 1.392424464225769, "learning_rate": 1.7321601727461334e-07, "loss": 0.4453, "num_input_tokens_seen": 75334784, "step": 129750 }, { "epoch": 19.326035150431935, "grad_norm": 3.905470848083496, "learning_rate": 1.7283437813655489e-07, "loss": 0.6674, "num_input_tokens_seen": 75337888, "step": 129755 }, { "epoch": 19.32677986297289, "grad_norm": 1.7037503719329834, "learning_rate": 1.7245315842822352e-07, "loss": 0.4527, "num_input_tokens_seen": 75340672, "step": 129760 }, { "epoch": 19.327524575513852, "grad_norm": 2.424461603164673, "learning_rate": 1.720723581560668e-07, "loss": 0.5983, "num_input_tokens_seen": 75343520, "step": 129765 }, { "epoch": 19.328269288054813, "grad_norm": 1.3820395469665527, "learning_rate": 1.716919773265102e-07, "loss": 0.588, "num_input_tokens_seen": 75346304, "step": 129770 }, { "epoch": 19.32901400059577, "grad_norm": 1.8211956024169922, "learning_rate": 1.7131201594598468e-07, "loss": 0.6655, "num_input_tokens_seen": 75349024, "step": 129775 }, { "epoch": 19.32975871313673, "grad_norm": 1.3628950119018555, "learning_rate": 1.709324740209073e-07, "loss": 0.6067, "num_input_tokens_seen": 75352064, "step": 129780 }, { "epoch": 19.330503425677687, "grad_norm": 1.9138762950897217, "learning_rate": 1.7055335155769238e-07, "loss": 0.6236, "num_input_tokens_seen": 75354880, "step": 129785 }, { "epoch": 19.331248138218648, "grad_norm": 1.7101478576660156, "learning_rate": 1.7017464856274033e-07, "loss": 0.6149, "num_input_tokens_seen": 75357600, "step": 129790 }, { "epoch": 19.331992850759608, "grad_norm": 1.6702886819839478, "learning_rate": 1.6979636504245445e-07, "loss": 0.7041, "num_input_tokens_seen": 75360384, "step": 129795 }, { "epoch": 19.332737563300565, "grad_norm": 0.7271849513053894, "learning_rate": 1.6941850100322122e-07, "loss": 0.4186, "num_input_tokens_seen": 75363296, "step": 129800 }, { "epoch": 19.333482275841526, "grad_norm": 2.1687707901000977, "learning_rate": 1.6904105645142444e-07, "loss": 0.594, "num_input_tokens_seen": 75366080, "step": 129805 }, { "epoch": 19.334226988382483, "grad_norm": 2.248762845993042, "learning_rate": 1.686640313934451e-07, "loss": 0.6215, "num_input_tokens_seen": 75369216, "step": 129810 }, { "epoch": 19.334971700923443, "grad_norm": 1.6441155672073364, "learning_rate": 1.6828742583564762e-07, "loss": 0.4549, "num_input_tokens_seen": 75371968, "step": 129815 }, { "epoch": 19.335716413464404, "grad_norm": 1.2756035327911377, "learning_rate": 1.6791123978439626e-07, "loss": 0.6687, "num_input_tokens_seen": 75374784, "step": 129820 }, { "epoch": 19.33646112600536, "grad_norm": 1.7580455541610718, "learning_rate": 1.6753547324604713e-07, "loss": 0.5889, "num_input_tokens_seen": 75377888, "step": 129825 }, { "epoch": 19.33720583854632, "grad_norm": 1.1423051357269287, "learning_rate": 1.671601262269451e-07, "loss": 0.5303, "num_input_tokens_seen": 75380672, "step": 129830 }, { "epoch": 19.337950551087282, "grad_norm": 1.6028226613998413, "learning_rate": 1.6678519873343789e-07, "loss": 0.6619, "num_input_tokens_seen": 75383552, "step": 129835 }, { "epoch": 19.33869526362824, "grad_norm": 1.5222327709197998, "learning_rate": 1.66410690771851e-07, "loss": 0.5084, "num_input_tokens_seen": 75386688, "step": 129840 }, { "epoch": 19.3394399761692, "grad_norm": 1.6847872734069824, "learning_rate": 1.6603660234851825e-07, "loss": 0.57, "num_input_tokens_seen": 75389248, "step": 129845 }, { "epoch": 19.340184688710156, "grad_norm": 1.1073464155197144, "learning_rate": 1.656629334697568e-07, "loss": 0.6263, "num_input_tokens_seen": 75392192, "step": 129850 }, { "epoch": 19.340929401251117, "grad_norm": 1.932331919670105, "learning_rate": 1.6528968414188107e-07, "loss": 0.5803, "num_input_tokens_seen": 75395072, "step": 129855 }, { "epoch": 19.341674113792077, "grad_norm": 1.1064561605453491, "learning_rate": 1.6491685437119154e-07, "loss": 0.5532, "num_input_tokens_seen": 75398112, "step": 129860 }, { "epoch": 19.342418826333034, "grad_norm": 2.1455607414245605, "learning_rate": 1.6454444416399428e-07, "loss": 0.4257, "num_input_tokens_seen": 75400960, "step": 129865 }, { "epoch": 19.343163538873995, "grad_norm": 1.6030277013778687, "learning_rate": 1.6417245352657317e-07, "loss": 0.6757, "num_input_tokens_seen": 75404032, "step": 129870 }, { "epoch": 19.343908251414955, "grad_norm": 1.9202871322631836, "learning_rate": 1.638008824652204e-07, "loss": 0.7029, "num_input_tokens_seen": 75406816, "step": 129875 }, { "epoch": 19.344652963955912, "grad_norm": 1.2652034759521484, "learning_rate": 1.6342973098620872e-07, "loss": 0.7694, "num_input_tokens_seen": 75409952, "step": 129880 }, { "epoch": 19.345397676496873, "grad_norm": 2.525480031967163, "learning_rate": 1.6305899909580814e-07, "loss": 0.7457, "num_input_tokens_seen": 75412640, "step": 129885 }, { "epoch": 19.34614238903783, "grad_norm": 1.5309362411499023, "learning_rate": 1.6268868680028026e-07, "loss": 0.4963, "num_input_tokens_seen": 75415488, "step": 129890 }, { "epoch": 19.34688710157879, "grad_norm": 1.3417140245437622, "learning_rate": 1.623187941058868e-07, "loss": 0.6125, "num_input_tokens_seen": 75418240, "step": 129895 }, { "epoch": 19.34763181411975, "grad_norm": 1.4643148183822632, "learning_rate": 1.6194932101886995e-07, "loss": 0.5126, "num_input_tokens_seen": 75421440, "step": 129900 }, { "epoch": 19.348376526660708, "grad_norm": 1.1280417442321777, "learning_rate": 1.615802675454775e-07, "loss": 0.5408, "num_input_tokens_seen": 75424256, "step": 129905 }, { "epoch": 19.34912123920167, "grad_norm": 2.0790724754333496, "learning_rate": 1.6121163369194335e-07, "loss": 0.736, "num_input_tokens_seen": 75427072, "step": 129910 }, { "epoch": 19.34986595174263, "grad_norm": 1.8216242790222168, "learning_rate": 1.6084341946449033e-07, "loss": 0.7247, "num_input_tokens_seen": 75429568, "step": 129915 }, { "epoch": 19.350610664283586, "grad_norm": 1.2853608131408691, "learning_rate": 1.6047562486934398e-07, "loss": 0.5337, "num_input_tokens_seen": 75432416, "step": 129920 }, { "epoch": 19.351355376824547, "grad_norm": 2.062551736831665, "learning_rate": 1.60108249912716e-07, "loss": 0.6226, "num_input_tokens_seen": 75435424, "step": 129925 }, { "epoch": 19.352100089365504, "grad_norm": 1.4035143852233887, "learning_rate": 1.5974129460081255e-07, "loss": 0.4138, "num_input_tokens_seen": 75438336, "step": 129930 }, { "epoch": 19.352844801906464, "grad_norm": 1.8646208047866821, "learning_rate": 1.5937475893983423e-07, "loss": 0.4755, "num_input_tokens_seen": 75441024, "step": 129935 }, { "epoch": 19.353589514447425, "grad_norm": 2.4370975494384766, "learning_rate": 1.5900864293597328e-07, "loss": 0.5873, "num_input_tokens_seen": 75444000, "step": 129940 }, { "epoch": 19.35433422698838, "grad_norm": 1.4790467023849487, "learning_rate": 1.5864294659541367e-07, "loss": 0.5029, "num_input_tokens_seen": 75446944, "step": 129945 }, { "epoch": 19.355078939529342, "grad_norm": 1.3542916774749756, "learning_rate": 1.5827766992433378e-07, "loss": 0.7504, "num_input_tokens_seen": 75449888, "step": 129950 }, { "epoch": 19.3558236520703, "grad_norm": 0.8857088685035706, "learning_rate": 1.5791281292890093e-07, "loss": 0.3924, "num_input_tokens_seen": 75452768, "step": 129955 }, { "epoch": 19.35656836461126, "grad_norm": 3.286296844482422, "learning_rate": 1.575483756152879e-07, "loss": 0.4864, "num_input_tokens_seen": 75455584, "step": 129960 }, { "epoch": 19.35731307715222, "grad_norm": 1.920882225036621, "learning_rate": 1.5718435798964538e-07, "loss": 0.4827, "num_input_tokens_seen": 75458176, "step": 129965 }, { "epoch": 19.358057789693177, "grad_norm": 1.3978877067565918, "learning_rate": 1.5682076005812118e-07, "loss": 0.5218, "num_input_tokens_seen": 75461056, "step": 129970 }, { "epoch": 19.358802502234138, "grad_norm": 1.9158836603164673, "learning_rate": 1.564575818268632e-07, "loss": 0.7385, "num_input_tokens_seen": 75463808, "step": 129975 }, { "epoch": 19.359547214775098, "grad_norm": 1.8263813257217407, "learning_rate": 1.5609482330200265e-07, "loss": 0.522, "num_input_tokens_seen": 75467040, "step": 129980 }, { "epoch": 19.360291927316055, "grad_norm": 2.6168739795684814, "learning_rate": 1.5573248448967072e-07, "loss": 0.6685, "num_input_tokens_seen": 75469824, "step": 129985 }, { "epoch": 19.361036639857016, "grad_norm": 2.218266725540161, "learning_rate": 1.5537056539598748e-07, "loss": 0.6258, "num_input_tokens_seen": 75472800, "step": 129990 }, { "epoch": 19.361781352397973, "grad_norm": 1.075732707977295, "learning_rate": 1.5500906602706756e-07, "loss": 0.884, "num_input_tokens_seen": 75475680, "step": 129995 }, { "epoch": 19.362526064938933, "grad_norm": 0.9010573029518127, "learning_rate": 1.546479863890199e-07, "loss": 0.5663, "num_input_tokens_seen": 75478368, "step": 130000 }, { "epoch": 19.363270777479894, "grad_norm": 1.6774667501449585, "learning_rate": 1.542873264879424e-07, "loss": 0.771, "num_input_tokens_seen": 75481024, "step": 130005 }, { "epoch": 19.36401549002085, "grad_norm": 1.5189108848571777, "learning_rate": 1.5392708632992748e-07, "loss": 0.7741, "num_input_tokens_seen": 75483936, "step": 130010 }, { "epoch": 19.36476020256181, "grad_norm": 0.9570638537406921, "learning_rate": 1.5356726592106185e-07, "loss": 0.6228, "num_input_tokens_seen": 75487008, "step": 130015 }, { "epoch": 19.365504915102772, "grad_norm": 2.4074654579162598, "learning_rate": 1.5320786526742682e-07, "loss": 0.626, "num_input_tokens_seen": 75489824, "step": 130020 }, { "epoch": 19.36624962764373, "grad_norm": 3.0688881874084473, "learning_rate": 1.5284888437508972e-07, "loss": 0.5676, "num_input_tokens_seen": 75492640, "step": 130025 }, { "epoch": 19.36699434018469, "grad_norm": 1.3066215515136719, "learning_rate": 1.5249032325011514e-07, "loss": 0.7188, "num_input_tokens_seen": 75495520, "step": 130030 }, { "epoch": 19.367739052725646, "grad_norm": 3.7823753356933594, "learning_rate": 1.5213218189856492e-07, "loss": 0.6649, "num_input_tokens_seen": 75498784, "step": 130035 }, { "epoch": 19.368483765266607, "grad_norm": 1.42906653881073, "learning_rate": 1.5177446032648702e-07, "loss": 0.5245, "num_input_tokens_seen": 75501600, "step": 130040 }, { "epoch": 19.369228477807567, "grad_norm": 0.9166719913482666, "learning_rate": 1.5141715853992654e-07, "loss": 0.5311, "num_input_tokens_seen": 75504736, "step": 130045 }, { "epoch": 19.369973190348524, "grad_norm": 1.2588350772857666, "learning_rate": 1.510602765449176e-07, "loss": 0.58, "num_input_tokens_seen": 75507520, "step": 130050 }, { "epoch": 19.370717902889485, "grad_norm": 1.3743481636047363, "learning_rate": 1.507038143474887e-07, "loss": 0.749, "num_input_tokens_seen": 75510400, "step": 130055 }, { "epoch": 19.371462615430445, "grad_norm": 0.9930475354194641, "learning_rate": 1.5034777195366278e-07, "loss": 0.4983, "num_input_tokens_seen": 75513408, "step": 130060 }, { "epoch": 19.372207327971402, "grad_norm": 2.140537738800049, "learning_rate": 1.4999214936945726e-07, "loss": 0.6239, "num_input_tokens_seen": 75516256, "step": 130065 }, { "epoch": 19.372952040512363, "grad_norm": 1.9386062622070312, "learning_rate": 1.496369466008757e-07, "loss": 0.5182, "num_input_tokens_seen": 75519328, "step": 130070 }, { "epoch": 19.37369675305332, "grad_norm": 0.9515326619148254, "learning_rate": 1.4928216365392157e-07, "loss": 0.545, "num_input_tokens_seen": 75522528, "step": 130075 }, { "epoch": 19.37444146559428, "grad_norm": 1.2458852529525757, "learning_rate": 1.489278005345901e-07, "loss": 0.4867, "num_input_tokens_seen": 75525440, "step": 130080 }, { "epoch": 19.37518617813524, "grad_norm": 2.3223695755004883, "learning_rate": 1.485738572488654e-07, "loss": 0.654, "num_input_tokens_seen": 75528288, "step": 130085 }, { "epoch": 19.375930890676198, "grad_norm": 1.0440126657485962, "learning_rate": 1.4822033380272603e-07, "loss": 0.4767, "num_input_tokens_seen": 75531168, "step": 130090 }, { "epoch": 19.37667560321716, "grad_norm": 2.7106175422668457, "learning_rate": 1.47867230202145e-07, "loss": 0.6694, "num_input_tokens_seen": 75533920, "step": 130095 }, { "epoch": 19.37742031575812, "grad_norm": 1.7196635007858276, "learning_rate": 1.4751454645309248e-07, "loss": 0.5104, "num_input_tokens_seen": 75536736, "step": 130100 }, { "epoch": 19.378165028299076, "grad_norm": 2.1174986362457275, "learning_rate": 1.471622825615193e-07, "loss": 0.5708, "num_input_tokens_seen": 75539488, "step": 130105 }, { "epoch": 19.378909740840037, "grad_norm": 1.4027502536773682, "learning_rate": 1.4681043853338184e-07, "loss": 0.5704, "num_input_tokens_seen": 75542656, "step": 130110 }, { "epoch": 19.379654453380994, "grad_norm": 1.7666196823120117, "learning_rate": 1.4645901437461972e-07, "loss": 0.8107, "num_input_tokens_seen": 75545248, "step": 130115 }, { "epoch": 19.380399165921954, "grad_norm": 1.9912235736846924, "learning_rate": 1.4610801009117548e-07, "loss": 0.5057, "num_input_tokens_seen": 75547872, "step": 130120 }, { "epoch": 19.381143878462915, "grad_norm": 1.4101262092590332, "learning_rate": 1.4575742568897488e-07, "loss": 0.6549, "num_input_tokens_seen": 75550976, "step": 130125 }, { "epoch": 19.38188859100387, "grad_norm": 1.1032133102416992, "learning_rate": 1.45407261173941e-07, "loss": 0.5559, "num_input_tokens_seen": 75553920, "step": 130130 }, { "epoch": 19.382633303544832, "grad_norm": 1.634782075881958, "learning_rate": 1.4505751655199405e-07, "loss": 0.5754, "num_input_tokens_seen": 75557216, "step": 130135 }, { "epoch": 19.38337801608579, "grad_norm": 1.4886527061462402, "learning_rate": 1.4470819182903493e-07, "loss": 0.5048, "num_input_tokens_seen": 75559904, "step": 130140 }, { "epoch": 19.38412272862675, "grad_norm": 1.4024180173873901, "learning_rate": 1.443592870109728e-07, "loss": 0.5374, "num_input_tokens_seen": 75562816, "step": 130145 }, { "epoch": 19.38486744116771, "grad_norm": 1.8807448148727417, "learning_rate": 1.4401080210369454e-07, "loss": 0.5414, "num_input_tokens_seen": 75565600, "step": 130150 }, { "epoch": 19.385612153708667, "grad_norm": 2.2077243328094482, "learning_rate": 1.4366273711309275e-07, "loss": 0.6435, "num_input_tokens_seen": 75568640, "step": 130155 }, { "epoch": 19.386356866249628, "grad_norm": 2.4817986488342285, "learning_rate": 1.43315092045046e-07, "loss": 0.5528, "num_input_tokens_seen": 75571712, "step": 130160 }, { "epoch": 19.38710157879059, "grad_norm": 0.6835544109344482, "learning_rate": 1.429678669054274e-07, "loss": 0.6463, "num_input_tokens_seen": 75574592, "step": 130165 }, { "epoch": 19.387846291331545, "grad_norm": 1.6842166185379028, "learning_rate": 1.4262106170010447e-07, "loss": 0.4704, "num_input_tokens_seen": 75577376, "step": 130170 }, { "epoch": 19.388591003872506, "grad_norm": 0.9956331253051758, "learning_rate": 1.4227467643493364e-07, "loss": 0.5189, "num_input_tokens_seen": 75580064, "step": 130175 }, { "epoch": 19.389335716413463, "grad_norm": 1.5151618719100952, "learning_rate": 1.4192871111576856e-07, "loss": 0.5654, "num_input_tokens_seen": 75582720, "step": 130180 }, { "epoch": 19.390080428954423, "grad_norm": 1.2424523830413818, "learning_rate": 1.4158316574845175e-07, "loss": 0.5636, "num_input_tokens_seen": 75585760, "step": 130185 }, { "epoch": 19.390825141495384, "grad_norm": 2.1039321422576904, "learning_rate": 1.4123804033882305e-07, "loss": 0.6072, "num_input_tokens_seen": 75588544, "step": 130190 }, { "epoch": 19.39156985403634, "grad_norm": 1.3781830072402954, "learning_rate": 1.4089333489271384e-07, "loss": 0.4403, "num_input_tokens_seen": 75591360, "step": 130195 }, { "epoch": 19.3923145665773, "grad_norm": 2.0334808826446533, "learning_rate": 1.405490494159445e-07, "loss": 0.5298, "num_input_tokens_seen": 75594112, "step": 130200 }, { "epoch": 19.393059279118262, "grad_norm": 1.5022525787353516, "learning_rate": 1.4020518391433258e-07, "loss": 0.442, "num_input_tokens_seen": 75596896, "step": 130205 }, { "epoch": 19.39380399165922, "grad_norm": 2.9478883743286133, "learning_rate": 1.398617383936901e-07, "loss": 0.6032, "num_input_tokens_seen": 75599648, "step": 130210 }, { "epoch": 19.39454870420018, "grad_norm": 1.5546603202819824, "learning_rate": 1.395187128598152e-07, "loss": 0.8348, "num_input_tokens_seen": 75602400, "step": 130215 }, { "epoch": 19.395293416741136, "grad_norm": 1.8230855464935303, "learning_rate": 1.3917610731850328e-07, "loss": 0.6082, "num_input_tokens_seen": 75605184, "step": 130220 }, { "epoch": 19.396038129282097, "grad_norm": 1.3056459426879883, "learning_rate": 1.3883392177554688e-07, "loss": 0.4993, "num_input_tokens_seen": 75608224, "step": 130225 }, { "epoch": 19.396782841823057, "grad_norm": 3.0200395584106445, "learning_rate": 1.3849215623672197e-07, "loss": 0.4784, "num_input_tokens_seen": 75611136, "step": 130230 }, { "epoch": 19.397527554364014, "grad_norm": 1.4569361209869385, "learning_rate": 1.3815081070780167e-07, "loss": 0.592, "num_input_tokens_seen": 75614112, "step": 130235 }, { "epoch": 19.398272266904975, "grad_norm": 1.8905906677246094, "learning_rate": 1.378098851945564e-07, "loss": 0.6954, "num_input_tokens_seen": 75616736, "step": 130240 }, { "epoch": 19.399016979445936, "grad_norm": 1.2650409936904907, "learning_rate": 1.3746937970274543e-07, "loss": 0.8236, "num_input_tokens_seen": 75619712, "step": 130245 }, { "epoch": 19.399761691986892, "grad_norm": 5.06742000579834, "learning_rate": 1.3712929423812247e-07, "loss": 0.5067, "num_input_tokens_seen": 75622560, "step": 130250 }, { "epoch": 19.400506404527853, "grad_norm": 0.5510016083717346, "learning_rate": 1.3678962880642465e-07, "loss": 0.5627, "num_input_tokens_seen": 75625504, "step": 130255 }, { "epoch": 19.40125111706881, "grad_norm": 3.9103317260742188, "learning_rate": 1.3645038341340011e-07, "loss": 0.5606, "num_input_tokens_seen": 75628096, "step": 130260 }, { "epoch": 19.40199582960977, "grad_norm": 1.206057071685791, "learning_rate": 1.361115580647748e-07, "loss": 0.4079, "num_input_tokens_seen": 75630848, "step": 130265 }, { "epoch": 19.40274054215073, "grad_norm": 1.4221532344818115, "learning_rate": 1.357731527662748e-07, "loss": 0.5083, "num_input_tokens_seen": 75633664, "step": 130270 }, { "epoch": 19.403485254691688, "grad_norm": 1.858814001083374, "learning_rate": 1.3543516752361763e-07, "loss": 0.7719, "num_input_tokens_seen": 75636288, "step": 130275 }, { "epoch": 19.40422996723265, "grad_norm": 2.1298439502716064, "learning_rate": 1.3509760234251267e-07, "loss": 0.7454, "num_input_tokens_seen": 75639328, "step": 130280 }, { "epoch": 19.40497467977361, "grad_norm": 1.2522332668304443, "learning_rate": 1.3476045722865815e-07, "loss": 0.5485, "num_input_tokens_seen": 75642048, "step": 130285 }, { "epoch": 19.405719392314566, "grad_norm": 4.161271572113037, "learning_rate": 1.3442373218775784e-07, "loss": 0.8088, "num_input_tokens_seen": 75644608, "step": 130290 }, { "epoch": 19.406464104855527, "grad_norm": 2.9860217571258545, "learning_rate": 1.340874272254933e-07, "loss": 0.5931, "num_input_tokens_seen": 75647488, "step": 130295 }, { "epoch": 19.407208817396484, "grad_norm": 1.8912993669509888, "learning_rate": 1.3375154234755162e-07, "loss": 0.6553, "num_input_tokens_seen": 75650560, "step": 130300 }, { "epoch": 19.407953529937444, "grad_norm": 1.3023186922073364, "learning_rate": 1.3341607755960327e-07, "loss": 0.6596, "num_input_tokens_seen": 75653824, "step": 130305 }, { "epoch": 19.408698242478405, "grad_norm": 1.1575336456298828, "learning_rate": 1.3308103286731598e-07, "loss": 0.6696, "num_input_tokens_seen": 75656864, "step": 130310 }, { "epoch": 19.40944295501936, "grad_norm": 1.5052158832550049, "learning_rate": 1.3274640827635187e-07, "loss": 0.6866, "num_input_tokens_seen": 75659616, "step": 130315 }, { "epoch": 19.410187667560322, "grad_norm": 2.2056682109832764, "learning_rate": 1.3241220379236473e-07, "loss": 0.5221, "num_input_tokens_seen": 75662240, "step": 130320 }, { "epoch": 19.41093238010128, "grad_norm": 2.208980083465576, "learning_rate": 1.320784194209973e-07, "loss": 0.6587, "num_input_tokens_seen": 75665280, "step": 130325 }, { "epoch": 19.41167709264224, "grad_norm": 1.5767700672149658, "learning_rate": 1.3174505516789226e-07, "loss": 0.6216, "num_input_tokens_seen": 75668000, "step": 130330 }, { "epoch": 19.4124218051832, "grad_norm": 1.45555579662323, "learning_rate": 1.3141211103867845e-07, "loss": 0.6469, "num_input_tokens_seen": 75670784, "step": 130335 }, { "epoch": 19.413166517724157, "grad_norm": 2.4803647994995117, "learning_rate": 1.3107958703898193e-07, "loss": 0.7318, "num_input_tokens_seen": 75673792, "step": 130340 }, { "epoch": 19.413911230265118, "grad_norm": 1.0556100606918335, "learning_rate": 1.3074748317442042e-07, "loss": 0.6073, "num_input_tokens_seen": 75676448, "step": 130345 }, { "epoch": 19.41465594280608, "grad_norm": 3.2207188606262207, "learning_rate": 1.3041579945060335e-07, "loss": 0.689, "num_input_tokens_seen": 75679424, "step": 130350 }, { "epoch": 19.415400655347035, "grad_norm": 1.8131119012832642, "learning_rate": 1.3008453587313453e-07, "loss": 0.4943, "num_input_tokens_seen": 75682208, "step": 130355 }, { "epoch": 19.416145367887996, "grad_norm": 1.4819049835205078, "learning_rate": 1.2975369244761226e-07, "loss": 0.6497, "num_input_tokens_seen": 75685216, "step": 130360 }, { "epoch": 19.416890080428953, "grad_norm": 1.5427396297454834, "learning_rate": 1.2942326917962377e-07, "loss": 0.8399, "num_input_tokens_seen": 75688000, "step": 130365 }, { "epoch": 19.417634792969913, "grad_norm": 1.0093978643417358, "learning_rate": 1.290932660747507e-07, "loss": 0.5448, "num_input_tokens_seen": 75690752, "step": 130370 }, { "epoch": 19.418379505510874, "grad_norm": 1.6830641031265259, "learning_rate": 1.287636831385719e-07, "loss": 0.4666, "num_input_tokens_seen": 75693376, "step": 130375 }, { "epoch": 19.41912421805183, "grad_norm": 1.9695848226547241, "learning_rate": 1.2843452037664962e-07, "loss": 0.6474, "num_input_tokens_seen": 75696384, "step": 130380 }, { "epoch": 19.41986893059279, "grad_norm": 1.669843077659607, "learning_rate": 1.281057777945488e-07, "loss": 0.5615, "num_input_tokens_seen": 75699072, "step": 130385 }, { "epoch": 19.420613643133752, "grad_norm": 1.4311425685882568, "learning_rate": 1.2777745539782337e-07, "loss": 0.4329, "num_input_tokens_seen": 75701792, "step": 130390 }, { "epoch": 19.42135835567471, "grad_norm": 1.0706967115402222, "learning_rate": 1.274495531920189e-07, "loss": 0.4329, "num_input_tokens_seen": 75704416, "step": 130395 }, { "epoch": 19.42210306821567, "grad_norm": 1.8083019256591797, "learning_rate": 1.2712207118267262e-07, "loss": 0.706, "num_input_tokens_seen": 75707200, "step": 130400 }, { "epoch": 19.422847780756626, "grad_norm": 1.611345887184143, "learning_rate": 1.2679500937532173e-07, "loss": 0.6077, "num_input_tokens_seen": 75710208, "step": 130405 }, { "epoch": 19.423592493297587, "grad_norm": 3.4817373752593994, "learning_rate": 1.2646836777548688e-07, "loss": 0.5621, "num_input_tokens_seen": 75713024, "step": 130410 }, { "epoch": 19.424337205838548, "grad_norm": 1.2090353965759277, "learning_rate": 1.2614214638869137e-07, "loss": 0.5269, "num_input_tokens_seen": 75715776, "step": 130415 }, { "epoch": 19.425081918379504, "grad_norm": 1.7185938358306885, "learning_rate": 1.2581634522044194e-07, "loss": 0.5222, "num_input_tokens_seen": 75718720, "step": 130420 }, { "epoch": 19.425826630920465, "grad_norm": 0.9948790073394775, "learning_rate": 1.254909642762453e-07, "loss": 0.5711, "num_input_tokens_seen": 75721568, "step": 130425 }, { "epoch": 19.426571343461426, "grad_norm": 1.7933971881866455, "learning_rate": 1.2516600356159701e-07, "loss": 0.7279, "num_input_tokens_seen": 75724448, "step": 130430 }, { "epoch": 19.427316056002383, "grad_norm": 1.8211454153060913, "learning_rate": 1.248414630819872e-07, "loss": 0.6911, "num_input_tokens_seen": 75727424, "step": 130435 }, { "epoch": 19.428060768543343, "grad_norm": 1.4390509128570557, "learning_rate": 1.2451734284289752e-07, "loss": 0.6065, "num_input_tokens_seen": 75730528, "step": 130440 }, { "epoch": 19.4288054810843, "grad_norm": 2.105337381362915, "learning_rate": 1.2419364284980696e-07, "loss": 0.6728, "num_input_tokens_seen": 75733472, "step": 130445 }, { "epoch": 19.42955019362526, "grad_norm": 1.780160903930664, "learning_rate": 1.2387036310818334e-07, "loss": 0.4996, "num_input_tokens_seen": 75736800, "step": 130450 }, { "epoch": 19.43029490616622, "grad_norm": 1.2550371885299683, "learning_rate": 1.2354750362348344e-07, "loss": 0.6126, "num_input_tokens_seen": 75740640, "step": 130455 }, { "epoch": 19.431039618707178, "grad_norm": 2.098721504211426, "learning_rate": 1.2322506440116676e-07, "loss": 0.6325, "num_input_tokens_seen": 75743648, "step": 130460 }, { "epoch": 19.43178433124814, "grad_norm": 2.3297693729400635, "learning_rate": 1.2290304544668174e-07, "loss": 0.5944, "num_input_tokens_seen": 75746368, "step": 130465 }, { "epoch": 19.432529043789096, "grad_norm": 1.335626482963562, "learning_rate": 1.2258144676546291e-07, "loss": 0.4757, "num_input_tokens_seen": 75749152, "step": 130470 }, { "epoch": 19.433273756330056, "grad_norm": 0.9516724348068237, "learning_rate": 1.2226026836294756e-07, "loss": 0.595, "num_input_tokens_seen": 75752352, "step": 130475 }, { "epoch": 19.434018468871017, "grad_norm": 1.6817058324813843, "learning_rate": 1.2193951024455918e-07, "loss": 0.5208, "num_input_tokens_seen": 75755328, "step": 130480 }, { "epoch": 19.434763181411974, "grad_norm": 1.1772416830062866, "learning_rate": 1.216191724157184e-07, "loss": 0.5911, "num_input_tokens_seen": 75758368, "step": 130485 }, { "epoch": 19.435507893952934, "grad_norm": 1.2116047143936157, "learning_rate": 1.212992548818348e-07, "loss": 0.5719, "num_input_tokens_seen": 75761280, "step": 130490 }, { "epoch": 19.436252606493895, "grad_norm": 2.996199131011963, "learning_rate": 1.2097975764831516e-07, "loss": 0.7953, "num_input_tokens_seen": 75764192, "step": 130495 }, { "epoch": 19.43699731903485, "grad_norm": 0.9927780032157898, "learning_rate": 1.206606807205579e-07, "loss": 0.5917, "num_input_tokens_seen": 75766944, "step": 130500 }, { "epoch": 19.437742031575812, "grad_norm": 1.204944133758545, "learning_rate": 1.2034202410395324e-07, "loss": 0.5872, "num_input_tokens_seen": 75769888, "step": 130505 }, { "epoch": 19.43848674411677, "grad_norm": 1.4987810850143433, "learning_rate": 1.200237878038829e-07, "loss": 0.6966, "num_input_tokens_seen": 75772640, "step": 130510 }, { "epoch": 19.43923145665773, "grad_norm": 0.927870512008667, "learning_rate": 1.197059718257204e-07, "loss": 0.5292, "num_input_tokens_seen": 75775456, "step": 130515 }, { "epoch": 19.43997616919869, "grad_norm": 2.8798441886901855, "learning_rate": 1.19388576174842e-07, "loss": 0.6587, "num_input_tokens_seen": 75778112, "step": 130520 }, { "epoch": 19.440720881739647, "grad_norm": 1.2268975973129272, "learning_rate": 1.1907160085660451e-07, "loss": 0.5732, "num_input_tokens_seen": 75781056, "step": 130525 }, { "epoch": 19.441465594280608, "grad_norm": 1.7337924242019653, "learning_rate": 1.1875504587636477e-07, "loss": 0.4916, "num_input_tokens_seen": 75784128, "step": 130530 }, { "epoch": 19.44221030682157, "grad_norm": 2.119880437850952, "learning_rate": 1.1843891123947126e-07, "loss": 0.6001, "num_input_tokens_seen": 75786848, "step": 130535 }, { "epoch": 19.442955019362525, "grad_norm": 1.8641047477722168, "learning_rate": 1.1812319695126416e-07, "loss": 0.4629, "num_input_tokens_seen": 75789984, "step": 130540 }, { "epoch": 19.443699731903486, "grad_norm": 2.399301290512085, "learning_rate": 1.1780790301707533e-07, "loss": 0.5197, "num_input_tokens_seen": 75792736, "step": 130545 }, { "epoch": 19.444444444444443, "grad_norm": 1.3235656023025513, "learning_rate": 1.1749302944223384e-07, "loss": 0.5865, "num_input_tokens_seen": 75795936, "step": 130550 }, { "epoch": 19.445189156985403, "grad_norm": 1.8978662490844727, "learning_rate": 1.1717857623205764e-07, "loss": 0.6362, "num_input_tokens_seen": 75798848, "step": 130555 }, { "epoch": 19.445933869526364, "grad_norm": 1.6084307432174683, "learning_rate": 1.1686454339185915e-07, "loss": 0.6976, "num_input_tokens_seen": 75801664, "step": 130560 }, { "epoch": 19.44667858206732, "grad_norm": 1.815421223640442, "learning_rate": 1.1655093092694525e-07, "loss": 0.5234, "num_input_tokens_seen": 75804352, "step": 130565 }, { "epoch": 19.44742329460828, "grad_norm": 1.3896777629852295, "learning_rate": 1.1623773884261169e-07, "loss": 0.4668, "num_input_tokens_seen": 75807328, "step": 130570 }, { "epoch": 19.448168007149242, "grad_norm": 1.1991212368011475, "learning_rate": 1.1592496714415147e-07, "loss": 0.6421, "num_input_tokens_seen": 75810368, "step": 130575 }, { "epoch": 19.4489127196902, "grad_norm": 1.9830960035324097, "learning_rate": 1.1561261583684924e-07, "loss": 0.5072, "num_input_tokens_seen": 75813120, "step": 130580 }, { "epoch": 19.44965743223116, "grad_norm": 0.844476044178009, "learning_rate": 1.1530068492597856e-07, "loss": 0.6213, "num_input_tokens_seen": 75816096, "step": 130585 }, { "epoch": 19.450402144772116, "grad_norm": 2.0202414989471436, "learning_rate": 1.1498917441681023e-07, "loss": 0.5971, "num_input_tokens_seen": 75819136, "step": 130590 }, { "epoch": 19.451146857313077, "grad_norm": 2.3456082344055176, "learning_rate": 1.1467808431460947e-07, "loss": 0.4576, "num_input_tokens_seen": 75821824, "step": 130595 }, { "epoch": 19.451891569854038, "grad_norm": 2.583982229232788, "learning_rate": 1.143674146246304e-07, "loss": 0.6404, "num_input_tokens_seen": 75824576, "step": 130600 }, { "epoch": 19.452636282394995, "grad_norm": 1.6947656869888306, "learning_rate": 1.1405716535212163e-07, "loss": 0.5761, "num_input_tokens_seen": 75827680, "step": 130605 }, { "epoch": 19.453380994935955, "grad_norm": 2.3672640323638916, "learning_rate": 1.1374733650232338e-07, "loss": 0.6116, "num_input_tokens_seen": 75831040, "step": 130610 }, { "epoch": 19.454125707476916, "grad_norm": 0.9889684915542603, "learning_rate": 1.1343792808047038e-07, "loss": 0.6526, "num_input_tokens_seen": 75833888, "step": 130615 }, { "epoch": 19.454870420017873, "grad_norm": 1.5363082885742188, "learning_rate": 1.1312894009179176e-07, "loss": 0.6956, "num_input_tokens_seen": 75836864, "step": 130620 }, { "epoch": 19.455615132558833, "grad_norm": 1.5091207027435303, "learning_rate": 1.1282037254150279e-07, "loss": 0.6153, "num_input_tokens_seen": 75839712, "step": 130625 }, { "epoch": 19.45635984509979, "grad_norm": 1.3610817193984985, "learning_rate": 1.1251222543482154e-07, "loss": 0.5665, "num_input_tokens_seen": 75842752, "step": 130630 }, { "epoch": 19.45710455764075, "grad_norm": 0.7839595079421997, "learning_rate": 1.1220449877694938e-07, "loss": 0.6428, "num_input_tokens_seen": 75845952, "step": 130635 }, { "epoch": 19.45784927018171, "grad_norm": 2.270953416824341, "learning_rate": 1.1189719257309051e-07, "loss": 0.5983, "num_input_tokens_seen": 75849088, "step": 130640 }, { "epoch": 19.458593982722668, "grad_norm": 1.332573413848877, "learning_rate": 1.1159030682843242e-07, "loss": 0.5658, "num_input_tokens_seen": 75852096, "step": 130645 }, { "epoch": 19.45933869526363, "grad_norm": 2.1569621562957764, "learning_rate": 1.1128384154815984e-07, "loss": 0.7151, "num_input_tokens_seen": 75854752, "step": 130650 }, { "epoch": 19.46008340780459, "grad_norm": 1.6177725791931152, "learning_rate": 1.1097779673745201e-07, "loss": 0.6113, "num_input_tokens_seen": 75857888, "step": 130655 }, { "epoch": 19.460828120345546, "grad_norm": 2.0736987590789795, "learning_rate": 1.1067217240147698e-07, "loss": 0.7311, "num_input_tokens_seen": 75860608, "step": 130660 }, { "epoch": 19.461572832886507, "grad_norm": 1.0783888101577759, "learning_rate": 1.1036696854540007e-07, "loss": 0.7682, "num_input_tokens_seen": 75863872, "step": 130665 }, { "epoch": 19.462317545427464, "grad_norm": 1.5196415185928345, "learning_rate": 1.100621851743755e-07, "loss": 0.5065, "num_input_tokens_seen": 75866848, "step": 130670 }, { "epoch": 19.463062257968424, "grad_norm": 1.802184820175171, "learning_rate": 1.0975782229355469e-07, "loss": 0.4861, "num_input_tokens_seen": 75869856, "step": 130675 }, { "epoch": 19.463806970509385, "grad_norm": 4.514451026916504, "learning_rate": 1.0945387990807798e-07, "loss": 0.7208, "num_input_tokens_seen": 75872480, "step": 130680 }, { "epoch": 19.464551683050342, "grad_norm": 1.7569870948791504, "learning_rate": 1.0915035802308016e-07, "loss": 0.5064, "num_input_tokens_seen": 75875680, "step": 130685 }, { "epoch": 19.465296395591302, "grad_norm": 0.9037337899208069, "learning_rate": 1.0884725664368766e-07, "loss": 0.566, "num_input_tokens_seen": 75878560, "step": 130690 }, { "epoch": 19.46604110813226, "grad_norm": 1.6431362628936768, "learning_rate": 1.0854457577502419e-07, "loss": 0.4782, "num_input_tokens_seen": 75881728, "step": 130695 }, { "epoch": 19.46678582067322, "grad_norm": 1.5776221752166748, "learning_rate": 1.0824231542220232e-07, "loss": 0.4029, "num_input_tokens_seen": 75884896, "step": 130700 }, { "epoch": 19.46753053321418, "grad_norm": 3.8198628425598145, "learning_rate": 1.0794047559032627e-07, "loss": 0.6437, "num_input_tokens_seen": 75887904, "step": 130705 }, { "epoch": 19.468275245755137, "grad_norm": 1.3082698583602905, "learning_rate": 1.0763905628449478e-07, "loss": 0.6341, "num_input_tokens_seen": 75890560, "step": 130710 }, { "epoch": 19.469019958296098, "grad_norm": 0.6032809615135193, "learning_rate": 1.0733805750980653e-07, "loss": 0.3596, "num_input_tokens_seen": 75893280, "step": 130715 }, { "epoch": 19.46976467083706, "grad_norm": 1.3562120199203491, "learning_rate": 1.07037479271338e-07, "loss": 0.6298, "num_input_tokens_seen": 75896128, "step": 130720 }, { "epoch": 19.470509383378015, "grad_norm": 1.4926669597625732, "learning_rate": 1.0673732157417404e-07, "loss": 0.618, "num_input_tokens_seen": 75899168, "step": 130725 }, { "epoch": 19.471254095918976, "grad_norm": 2.1732654571533203, "learning_rate": 1.0643758442338004e-07, "loss": 0.7527, "num_input_tokens_seen": 75902272, "step": 130730 }, { "epoch": 19.471998808459933, "grad_norm": 1.2137529850006104, "learning_rate": 1.0613826782402414e-07, "loss": 0.594, "num_input_tokens_seen": 75905216, "step": 130735 }, { "epoch": 19.472743521000893, "grad_norm": 2.371703863143921, "learning_rate": 1.0583937178116066e-07, "loss": 0.6763, "num_input_tokens_seen": 75908224, "step": 130740 }, { "epoch": 19.473488233541854, "grad_norm": 1.2975304126739502, "learning_rate": 1.055408962998411e-07, "loss": 0.586, "num_input_tokens_seen": 75910880, "step": 130745 }, { "epoch": 19.47423294608281, "grad_norm": 1.497536063194275, "learning_rate": 1.0524284138510588e-07, "loss": 0.6001, "num_input_tokens_seen": 75913888, "step": 130750 }, { "epoch": 19.47497765862377, "grad_norm": 1.401755452156067, "learning_rate": 1.0494520704198985e-07, "loss": 0.5203, "num_input_tokens_seen": 75917216, "step": 130755 }, { "epoch": 19.475722371164732, "grad_norm": 1.7741544246673584, "learning_rate": 1.0464799327552232e-07, "loss": 0.6577, "num_input_tokens_seen": 75920128, "step": 130760 }, { "epoch": 19.47646708370569, "grad_norm": 1.7802952527999878, "learning_rate": 1.043512000907243e-07, "loss": 0.6318, "num_input_tokens_seen": 75923296, "step": 130765 }, { "epoch": 19.47721179624665, "grad_norm": 1.6404062509536743, "learning_rate": 1.040548274926112e-07, "loss": 0.5676, "num_input_tokens_seen": 75926208, "step": 130770 }, { "epoch": 19.477956508787607, "grad_norm": 1.4333250522613525, "learning_rate": 1.0375887548618735e-07, "loss": 0.6047, "num_input_tokens_seen": 75929152, "step": 130775 }, { "epoch": 19.478701221328567, "grad_norm": 1.2700796127319336, "learning_rate": 1.034633440764543e-07, "loss": 0.4789, "num_input_tokens_seen": 75931968, "step": 130780 }, { "epoch": 19.479445933869528, "grad_norm": 2.2533769607543945, "learning_rate": 1.031682332684053e-07, "loss": 0.5906, "num_input_tokens_seen": 75935072, "step": 130785 }, { "epoch": 19.480190646410485, "grad_norm": 2.5293688774108887, "learning_rate": 1.0287354306702524e-07, "loss": 0.8475, "num_input_tokens_seen": 75937856, "step": 130790 }, { "epoch": 19.480935358951445, "grad_norm": 2.2895145416259766, "learning_rate": 1.0257927347729068e-07, "loss": 0.5607, "num_input_tokens_seen": 75940864, "step": 130795 }, { "epoch": 19.481680071492406, "grad_norm": 1.6700221300125122, "learning_rate": 1.0228542450417545e-07, "loss": 0.4329, "num_input_tokens_seen": 75943488, "step": 130800 }, { "epoch": 19.482424784033363, "grad_norm": 2.6059505939483643, "learning_rate": 1.0199199615264499e-07, "loss": 0.6898, "num_input_tokens_seen": 75946272, "step": 130805 }, { "epoch": 19.483169496574323, "grad_norm": 1.8557173013687134, "learning_rate": 1.0169898842765091e-07, "loss": 0.6492, "num_input_tokens_seen": 75949152, "step": 130810 }, { "epoch": 19.48391420911528, "grad_norm": 2.036956310272217, "learning_rate": 1.0140640133415036e-07, "loss": 0.5714, "num_input_tokens_seen": 75952064, "step": 130815 }, { "epoch": 19.48465892165624, "grad_norm": 2.1442506313323975, "learning_rate": 1.0111423487708105e-07, "loss": 0.587, "num_input_tokens_seen": 75954944, "step": 130820 }, { "epoch": 19.4854036341972, "grad_norm": 1.8538227081298828, "learning_rate": 1.008224890613807e-07, "loss": 0.6795, "num_input_tokens_seen": 75957536, "step": 130825 }, { "epoch": 19.486148346738158, "grad_norm": 1.4777531623840332, "learning_rate": 1.0053116389197592e-07, "loss": 0.4953, "num_input_tokens_seen": 75960640, "step": 130830 }, { "epoch": 19.48689305927912, "grad_norm": 1.5059528350830078, "learning_rate": 1.0024025937379333e-07, "loss": 0.6599, "num_input_tokens_seen": 75963680, "step": 130835 }, { "epoch": 19.487637771820076, "grad_norm": 1.573940396308899, "learning_rate": 9.994977551174289e-08, "loss": 0.6923, "num_input_tokens_seen": 75966496, "step": 130840 }, { "epoch": 19.488382484361036, "grad_norm": 1.4878816604614258, "learning_rate": 9.965971231073456e-08, "loss": 0.5305, "num_input_tokens_seen": 75969184, "step": 130845 }, { "epoch": 19.489127196901997, "grad_norm": 3.4767701625823975, "learning_rate": 9.937006977566998e-08, "loss": 0.6176, "num_input_tokens_seen": 75972288, "step": 130850 }, { "epoch": 19.489871909442954, "grad_norm": 1.3690214157104492, "learning_rate": 9.90808479114369e-08, "loss": 0.5995, "num_input_tokens_seen": 75974976, "step": 130855 }, { "epoch": 19.490616621983914, "grad_norm": 0.9271087646484375, "learning_rate": 9.879204672292586e-08, "loss": 0.4823, "num_input_tokens_seen": 75977920, "step": 130860 }, { "epoch": 19.491361334524875, "grad_norm": 5.758367538452148, "learning_rate": 9.850366621501628e-08, "loss": 0.6035, "num_input_tokens_seen": 75980704, "step": 130865 }, { "epoch": 19.492106047065832, "grad_norm": 1.4083161354064941, "learning_rate": 9.82157063925765e-08, "loss": 0.4076, "num_input_tokens_seen": 75983552, "step": 130870 }, { "epoch": 19.492850759606792, "grad_norm": 1.8148013353347778, "learning_rate": 9.792816726047482e-08, "loss": 0.5579, "num_input_tokens_seen": 75986880, "step": 130875 }, { "epoch": 19.49359547214775, "grad_norm": 1.5174309015274048, "learning_rate": 9.764104882356572e-08, "loss": 0.7032, "num_input_tokens_seen": 75989760, "step": 130880 }, { "epoch": 19.49434018468871, "grad_norm": 0.8903810381889343, "learning_rate": 9.735435108670088e-08, "loss": 0.4652, "num_input_tokens_seen": 75992864, "step": 130885 }, { "epoch": 19.49508489722967, "grad_norm": 1.8895668983459473, "learning_rate": 9.70680740547264e-08, "loss": 0.7097, "num_input_tokens_seen": 75995744, "step": 130890 }, { "epoch": 19.495829609770627, "grad_norm": 1.0350149869918823, "learning_rate": 9.67822177324773e-08, "loss": 0.4844, "num_input_tokens_seen": 75998656, "step": 130895 }, { "epoch": 19.496574322311588, "grad_norm": 1.7690318822860718, "learning_rate": 9.64967821247803e-08, "loss": 0.7508, "num_input_tokens_seen": 76001696, "step": 130900 }, { "epoch": 19.49731903485255, "grad_norm": 2.565481424331665, "learning_rate": 9.621176723645931e-08, "loss": 0.8597, "num_input_tokens_seen": 76004288, "step": 130905 }, { "epoch": 19.498063747393505, "grad_norm": 1.0609813928604126, "learning_rate": 9.59271730723299e-08, "loss": 0.6846, "num_input_tokens_seen": 76007104, "step": 130910 }, { "epoch": 19.498808459934466, "grad_norm": 2.215188503265381, "learning_rate": 9.564299963719936e-08, "loss": 0.5327, "num_input_tokens_seen": 76009760, "step": 130915 }, { "epoch": 19.499553172475423, "grad_norm": 1.6212068796157837, "learning_rate": 9.53592469358694e-08, "loss": 0.514, "num_input_tokens_seen": 76012384, "step": 130920 }, { "epoch": 19.500297885016384, "grad_norm": 2.782942533493042, "learning_rate": 9.507591497313063e-08, "loss": 0.6232, "num_input_tokens_seen": 76015392, "step": 130925 }, { "epoch": 19.501042597557344, "grad_norm": 3.6507296562194824, "learning_rate": 9.479300375377365e-08, "loss": 0.5791, "num_input_tokens_seen": 76018240, "step": 130930 }, { "epoch": 19.5017873100983, "grad_norm": 2.263488292694092, "learning_rate": 9.451051328257799e-08, "loss": 0.627, "num_input_tokens_seen": 76021088, "step": 130935 }, { "epoch": 19.50253202263926, "grad_norm": 1.3231068849563599, "learning_rate": 9.422844356431481e-08, "loss": 0.5121, "num_input_tokens_seen": 76024000, "step": 130940 }, { "epoch": 19.503276735180222, "grad_norm": 1.2652050256729126, "learning_rate": 9.3946794603747e-08, "loss": 0.6051, "num_input_tokens_seen": 76027136, "step": 130945 }, { "epoch": 19.50402144772118, "grad_norm": 2.229705572128296, "learning_rate": 9.366556640563462e-08, "loss": 0.4459, "num_input_tokens_seen": 76030016, "step": 130950 }, { "epoch": 19.50476616026214, "grad_norm": 1.626859188079834, "learning_rate": 9.338475897472942e-08, "loss": 0.6348, "num_input_tokens_seen": 76032800, "step": 130955 }, { "epoch": 19.505510872803097, "grad_norm": 1.5746077299118042, "learning_rate": 9.310437231577207e-08, "loss": 0.4878, "num_input_tokens_seen": 76035616, "step": 130960 }, { "epoch": 19.506255585344057, "grad_norm": 2.551272392272949, "learning_rate": 9.282440643350598e-08, "loss": 0.6948, "num_input_tokens_seen": 76038432, "step": 130965 }, { "epoch": 19.507000297885018, "grad_norm": 1.171657681465149, "learning_rate": 9.254486133265517e-08, "loss": 0.5595, "num_input_tokens_seen": 76041120, "step": 130970 }, { "epoch": 19.507745010425975, "grad_norm": 2.440073013305664, "learning_rate": 9.226573701794361e-08, "loss": 0.7997, "num_input_tokens_seen": 76043904, "step": 130975 }, { "epoch": 19.508489722966935, "grad_norm": 1.1141456365585327, "learning_rate": 9.198703349408977e-08, "loss": 0.6082, "num_input_tokens_seen": 76046880, "step": 130980 }, { "epoch": 19.509234435507892, "grad_norm": 1.9625002145767212, "learning_rate": 9.170875076579821e-08, "loss": 0.5121, "num_input_tokens_seen": 76049600, "step": 130985 }, { "epoch": 19.509979148048853, "grad_norm": 1.0820562839508057, "learning_rate": 9.143088883777073e-08, "loss": 0.5996, "num_input_tokens_seen": 76052480, "step": 130990 }, { "epoch": 19.510723860589813, "grad_norm": 0.8805582523345947, "learning_rate": 9.115344771470357e-08, "loss": 0.774, "num_input_tokens_seen": 76055552, "step": 130995 }, { "epoch": 19.51146857313077, "grad_norm": 1.6964737176895142, "learning_rate": 9.087642740128188e-08, "loss": 0.5935, "num_input_tokens_seen": 76058464, "step": 131000 }, { "epoch": 19.51221328567173, "grad_norm": 1.6298227310180664, "learning_rate": 9.059982790218801e-08, "loss": 0.8089, "num_input_tokens_seen": 76061408, "step": 131005 }, { "epoch": 19.51295799821269, "grad_norm": 1.6854643821716309, "learning_rate": 9.032364922209047e-08, "loss": 0.5609, "num_input_tokens_seen": 76064512, "step": 131010 }, { "epoch": 19.51370271075365, "grad_norm": 1.7902424335479736, "learning_rate": 9.00478913656605e-08, "loss": 0.6483, "num_input_tokens_seen": 76067456, "step": 131015 }, { "epoch": 19.51444742329461, "grad_norm": 0.8722209334373474, "learning_rate": 8.977255433755272e-08, "loss": 0.5522, "num_input_tokens_seen": 76070144, "step": 131020 }, { "epoch": 19.515192135835566, "grad_norm": 1.8580681085586548, "learning_rate": 8.949763814242173e-08, "loss": 0.5642, "num_input_tokens_seen": 76072576, "step": 131025 }, { "epoch": 19.515936848376526, "grad_norm": 1.8631047010421753, "learning_rate": 8.922314278490829e-08, "loss": 0.4382, "num_input_tokens_seen": 76075552, "step": 131030 }, { "epoch": 19.516681560917487, "grad_norm": 1.9027554988861084, "learning_rate": 8.89490682696531e-08, "loss": 0.5197, "num_input_tokens_seen": 76078080, "step": 131035 }, { "epoch": 19.517426273458444, "grad_norm": 0.905432403087616, "learning_rate": 8.867541460128304e-08, "loss": 0.4711, "num_input_tokens_seen": 76081152, "step": 131040 }, { "epoch": 19.518170985999404, "grad_norm": 1.671393871307373, "learning_rate": 8.840218178442494e-08, "loss": 0.4089, "num_input_tokens_seen": 76084192, "step": 131045 }, { "epoch": 19.518915698540365, "grad_norm": 1.2533107995986938, "learning_rate": 8.81293698236918e-08, "loss": 0.5553, "num_input_tokens_seen": 76087776, "step": 131050 }, { "epoch": 19.519660411081322, "grad_norm": 1.8334410190582275, "learning_rate": 8.785697872369381e-08, "loss": 0.5918, "num_input_tokens_seen": 76090912, "step": 131055 }, { "epoch": 19.520405123622282, "grad_norm": 0.7755984663963318, "learning_rate": 8.758500848903283e-08, "loss": 0.4615, "num_input_tokens_seen": 76093824, "step": 131060 }, { "epoch": 19.52114983616324, "grad_norm": 1.946863055229187, "learning_rate": 8.731345912430245e-08, "loss": 0.6094, "num_input_tokens_seen": 76096736, "step": 131065 }, { "epoch": 19.5218945487042, "grad_norm": 1.3294376134872437, "learning_rate": 8.704233063409339e-08, "loss": 0.5943, "num_input_tokens_seen": 76099552, "step": 131070 }, { "epoch": 19.52263926124516, "grad_norm": 1.7024004459381104, "learning_rate": 8.677162302298258e-08, "loss": 0.7192, "num_input_tokens_seen": 76102336, "step": 131075 }, { "epoch": 19.523383973786117, "grad_norm": 0.5826594233512878, "learning_rate": 8.650133629554413e-08, "loss": 0.6689, "num_input_tokens_seen": 76105024, "step": 131080 }, { "epoch": 19.524128686327078, "grad_norm": 1.6343932151794434, "learning_rate": 8.623147045634383e-08, "loss": 0.5787, "num_input_tokens_seen": 76107840, "step": 131085 }, { "epoch": 19.52487339886804, "grad_norm": 1.718579888343811, "learning_rate": 8.596202550994193e-08, "loss": 0.5075, "num_input_tokens_seen": 76110944, "step": 131090 }, { "epoch": 19.525618111408996, "grad_norm": 0.8854349851608276, "learning_rate": 8.569300146089032e-08, "loss": 0.4324, "num_input_tokens_seen": 76113824, "step": 131095 }, { "epoch": 19.526362823949956, "grad_norm": 1.2299367189407349, "learning_rate": 8.542439831373539e-08, "loss": 0.6767, "num_input_tokens_seen": 76117120, "step": 131100 }, { "epoch": 19.527107536490913, "grad_norm": 0.8615179657936096, "learning_rate": 8.515621607301239e-08, "loss": 0.6698, "num_input_tokens_seen": 76119840, "step": 131105 }, { "epoch": 19.527852249031874, "grad_norm": 1.1299309730529785, "learning_rate": 8.488845474325102e-08, "loss": 0.5614, "num_input_tokens_seen": 76122784, "step": 131110 }, { "epoch": 19.528596961572834, "grad_norm": 1.6134315729141235, "learning_rate": 8.462111432897823e-08, "loss": 0.4837, "num_input_tokens_seen": 76125408, "step": 131115 }, { "epoch": 19.52934167411379, "grad_norm": 1.8667657375335693, "learning_rate": 8.435419483470707e-08, "loss": 0.6725, "num_input_tokens_seen": 76128384, "step": 131120 }, { "epoch": 19.53008638665475, "grad_norm": 3.25881290435791, "learning_rate": 8.408769626495061e-08, "loss": 0.6738, "num_input_tokens_seen": 76131200, "step": 131125 }, { "epoch": 19.530831099195712, "grad_norm": 1.268853783607483, "learning_rate": 8.382161862420801e-08, "loss": 0.5288, "num_input_tokens_seen": 76134112, "step": 131130 }, { "epoch": 19.53157581173667, "grad_norm": 1.7649565935134888, "learning_rate": 8.355596191697845e-08, "loss": 0.4313, "num_input_tokens_seen": 76137248, "step": 131135 }, { "epoch": 19.53232052427763, "grad_norm": 2.100248336791992, "learning_rate": 8.329072614774446e-08, "loss": 0.7424, "num_input_tokens_seen": 76140224, "step": 131140 }, { "epoch": 19.533065236818587, "grad_norm": 0.8709089159965515, "learning_rate": 8.302591132098857e-08, "loss": 0.4958, "num_input_tokens_seen": 76143264, "step": 131145 }, { "epoch": 19.533809949359547, "grad_norm": 1.4091277122497559, "learning_rate": 8.276151744118777e-08, "loss": 0.5403, "num_input_tokens_seen": 76146368, "step": 131150 }, { "epoch": 19.534554661900508, "grad_norm": 1.5908962488174438, "learning_rate": 8.249754451280512e-08, "loss": 0.6047, "num_input_tokens_seen": 76149024, "step": 131155 }, { "epoch": 19.535299374441465, "grad_norm": 2.7484724521636963, "learning_rate": 8.223399254030095e-08, "loss": 0.7345, "num_input_tokens_seen": 76151904, "step": 131160 }, { "epoch": 19.536044086982425, "grad_norm": 1.1810529232025146, "learning_rate": 8.197086152812728e-08, "loss": 0.5075, "num_input_tokens_seen": 76154592, "step": 131165 }, { "epoch": 19.536788799523386, "grad_norm": 1.5775282382965088, "learning_rate": 8.17081514807333e-08, "loss": 0.5955, "num_input_tokens_seen": 76157440, "step": 131170 }, { "epoch": 19.537533512064343, "grad_norm": 1.206803321838379, "learning_rate": 8.144586240255159e-08, "loss": 0.6701, "num_input_tokens_seen": 76160192, "step": 131175 }, { "epoch": 19.538278224605303, "grad_norm": 1.3250524997711182, "learning_rate": 8.118399429801749e-08, "loss": 0.5343, "num_input_tokens_seen": 76163264, "step": 131180 }, { "epoch": 19.53902293714626, "grad_norm": 1.5342960357666016, "learning_rate": 8.092254717155246e-08, "loss": 0.6303, "num_input_tokens_seen": 76166240, "step": 131185 }, { "epoch": 19.53976764968722, "grad_norm": 1.4466341733932495, "learning_rate": 8.066152102757518e-08, "loss": 0.5283, "num_input_tokens_seen": 76168992, "step": 131190 }, { "epoch": 19.54051236222818, "grad_norm": 1.0939933061599731, "learning_rate": 8.040091587049325e-08, "loss": 0.5999, "num_input_tokens_seen": 76172000, "step": 131195 }, { "epoch": 19.54125707476914, "grad_norm": 1.7842880487442017, "learning_rate": 8.014073170471149e-08, "loss": 0.5448, "num_input_tokens_seen": 76174880, "step": 131200 }, { "epoch": 19.5420017873101, "grad_norm": 1.4118738174438477, "learning_rate": 7.988096853462634e-08, "loss": 0.5173, "num_input_tokens_seen": 76177856, "step": 131205 }, { "epoch": 19.542746499851056, "grad_norm": 1.3834184408187866, "learning_rate": 7.962162636462323e-08, "loss": 0.4787, "num_input_tokens_seen": 76180896, "step": 131210 }, { "epoch": 19.543491212392016, "grad_norm": 1.2498633861541748, "learning_rate": 7.936270519908473e-08, "loss": 0.6347, "num_input_tokens_seen": 76183872, "step": 131215 }, { "epoch": 19.544235924932977, "grad_norm": 0.9232773184776306, "learning_rate": 7.910420504238514e-08, "loss": 0.6673, "num_input_tokens_seen": 76186656, "step": 131220 }, { "epoch": 19.544980637473934, "grad_norm": 1.8076367378234863, "learning_rate": 7.88461258988904e-08, "loss": 0.7343, "num_input_tokens_seen": 76189568, "step": 131225 }, { "epoch": 19.545725350014894, "grad_norm": 1.115098237991333, "learning_rate": 7.858846777296369e-08, "loss": 0.4355, "num_input_tokens_seen": 76192320, "step": 131230 }, { "epoch": 19.546470062555855, "grad_norm": 1.375917911529541, "learning_rate": 7.833123066895432e-08, "loss": 0.5122, "num_input_tokens_seen": 76194880, "step": 131235 }, { "epoch": 19.547214775096812, "grad_norm": 2.1197257041931152, "learning_rate": 7.807441459121156e-08, "loss": 0.7637, "num_input_tokens_seen": 76197696, "step": 131240 }, { "epoch": 19.547959487637772, "grad_norm": 2.897329092025757, "learning_rate": 7.781801954406809e-08, "loss": 0.6099, "num_input_tokens_seen": 76200416, "step": 131245 }, { "epoch": 19.54870420017873, "grad_norm": 1.737072229385376, "learning_rate": 7.756204553186208e-08, "loss": 0.6966, "num_input_tokens_seen": 76203200, "step": 131250 }, { "epoch": 19.54944891271969, "grad_norm": 2.0083084106445312, "learning_rate": 7.730649255891509e-08, "loss": 0.4825, "num_input_tokens_seen": 76205984, "step": 131255 }, { "epoch": 19.55019362526065, "grad_norm": 1.5652509927749634, "learning_rate": 7.705136062954587e-08, "loss": 0.6858, "num_input_tokens_seen": 76208992, "step": 131260 }, { "epoch": 19.550938337801608, "grad_norm": 2.0102901458740234, "learning_rate": 7.679664974806212e-08, "loss": 0.5383, "num_input_tokens_seen": 76211776, "step": 131265 }, { "epoch": 19.551683050342568, "grad_norm": 1.4331343173980713, "learning_rate": 7.654235991876867e-08, "loss": 0.5483, "num_input_tokens_seen": 76214688, "step": 131270 }, { "epoch": 19.55242776288353, "grad_norm": 1.559149980545044, "learning_rate": 7.628849114596214e-08, "loss": 0.7629, "num_input_tokens_seen": 76217632, "step": 131275 }, { "epoch": 19.553172475424486, "grad_norm": 3.042836904525757, "learning_rate": 7.603504343392798e-08, "loss": 0.6028, "num_input_tokens_seen": 76220320, "step": 131280 }, { "epoch": 19.553917187965446, "grad_norm": 1.1345747709274292, "learning_rate": 7.578201678694885e-08, "loss": 0.4745, "num_input_tokens_seen": 76223456, "step": 131285 }, { "epoch": 19.554661900506403, "grad_norm": 1.3846884965896606, "learning_rate": 7.55294112093019e-08, "loss": 0.4393, "num_input_tokens_seen": 76225984, "step": 131290 }, { "epoch": 19.555406613047364, "grad_norm": 2.7124719619750977, "learning_rate": 7.527722670525594e-08, "loss": 0.6068, "num_input_tokens_seen": 76228896, "step": 131295 }, { "epoch": 19.556151325588324, "grad_norm": 1.626696228981018, "learning_rate": 7.50254632790659e-08, "loss": 0.4489, "num_input_tokens_seen": 76231488, "step": 131300 }, { "epoch": 19.55689603812928, "grad_norm": 1.3446201086044312, "learning_rate": 7.477412093498947e-08, "loss": 0.4768, "num_input_tokens_seen": 76234560, "step": 131305 }, { "epoch": 19.55764075067024, "grad_norm": 2.4170849323272705, "learning_rate": 7.452319967727328e-08, "loss": 0.5957, "num_input_tokens_seen": 76237760, "step": 131310 }, { "epoch": 19.558385463211202, "grad_norm": 1.429600477218628, "learning_rate": 7.427269951015004e-08, "loss": 0.6342, "num_input_tokens_seen": 76240672, "step": 131315 }, { "epoch": 19.55913017575216, "grad_norm": 1.9068611860275269, "learning_rate": 7.402262043785801e-08, "loss": 0.6574, "num_input_tokens_seen": 76243616, "step": 131320 }, { "epoch": 19.55987488829312, "grad_norm": 1.4736220836639404, "learning_rate": 7.377296246462162e-08, "loss": 0.5625, "num_input_tokens_seen": 76246720, "step": 131325 }, { "epoch": 19.560619600834077, "grad_norm": 2.419869899749756, "learning_rate": 7.352372559465693e-08, "loss": 0.7438, "num_input_tokens_seen": 76249664, "step": 131330 }, { "epoch": 19.561364313375037, "grad_norm": 1.671942114830017, "learning_rate": 7.327490983217444e-08, "loss": 0.5382, "num_input_tokens_seen": 76252768, "step": 131335 }, { "epoch": 19.562109025915998, "grad_norm": 3.189880609512329, "learning_rate": 7.302651518137638e-08, "loss": 0.6107, "num_input_tokens_seen": 76255552, "step": 131340 }, { "epoch": 19.562853738456955, "grad_norm": 1.4628995656967163, "learning_rate": 7.277854164646214e-08, "loss": 0.5908, "num_input_tokens_seen": 76258560, "step": 131345 }, { "epoch": 19.563598450997915, "grad_norm": 1.4004125595092773, "learning_rate": 7.253098923162005e-08, "loss": 0.5702, "num_input_tokens_seen": 76261408, "step": 131350 }, { "epoch": 19.564343163538872, "grad_norm": 4.894278049468994, "learning_rate": 7.22838579410301e-08, "loss": 0.6259, "num_input_tokens_seen": 76264576, "step": 131355 }, { "epoch": 19.565087876079833, "grad_norm": 1.5805696249008179, "learning_rate": 7.20371477788695e-08, "loss": 0.5153, "num_input_tokens_seen": 76267616, "step": 131360 }, { "epoch": 19.565832588620793, "grad_norm": 1.5856536626815796, "learning_rate": 7.179085874930713e-08, "loss": 0.6234, "num_input_tokens_seen": 76270368, "step": 131365 }, { "epoch": 19.56657730116175, "grad_norm": 1.4277864694595337, "learning_rate": 7.154499085650079e-08, "loss": 0.6295, "num_input_tokens_seen": 76273216, "step": 131370 }, { "epoch": 19.56732201370271, "grad_norm": 1.7615255117416382, "learning_rate": 7.129954410460548e-08, "loss": 0.4977, "num_input_tokens_seen": 76276064, "step": 131375 }, { "epoch": 19.56806672624367, "grad_norm": 1.426619529724121, "learning_rate": 7.105451849777067e-08, "loss": 0.6051, "num_input_tokens_seen": 76278688, "step": 131380 }, { "epoch": 19.56881143878463, "grad_norm": 1.2745281457901, "learning_rate": 7.080991404012915e-08, "loss": 0.663, "num_input_tokens_seen": 76281344, "step": 131385 }, { "epoch": 19.56955615132559, "grad_norm": 1.2984039783477783, "learning_rate": 7.056573073581929e-08, "loss": 0.6037, "num_input_tokens_seen": 76284544, "step": 131390 }, { "epoch": 19.570300863866546, "grad_norm": 1.1517189741134644, "learning_rate": 7.032196858896279e-08, "loss": 0.7302, "num_input_tokens_seen": 76287360, "step": 131395 }, { "epoch": 19.571045576407506, "grad_norm": 2.169135332107544, "learning_rate": 7.007862760368133e-08, "loss": 0.5298, "num_input_tokens_seen": 76290144, "step": 131400 }, { "epoch": 19.571790288948467, "grad_norm": 1.2655125856399536, "learning_rate": 6.983570778408277e-08, "loss": 0.5182, "num_input_tokens_seen": 76293056, "step": 131405 }, { "epoch": 19.572535001489424, "grad_norm": 1.0532466173171997, "learning_rate": 6.959320913427492e-08, "loss": 0.5995, "num_input_tokens_seen": 76295936, "step": 131410 }, { "epoch": 19.573279714030384, "grad_norm": 1.5687071084976196, "learning_rate": 6.935113165834616e-08, "loss": 0.662, "num_input_tokens_seen": 76298592, "step": 131415 }, { "epoch": 19.574024426571345, "grad_norm": 1.827123761177063, "learning_rate": 6.910947536039603e-08, "loss": 0.5377, "num_input_tokens_seen": 76301408, "step": 131420 }, { "epoch": 19.574769139112302, "grad_norm": 1.6126679182052612, "learning_rate": 6.886824024450178e-08, "loss": 0.5194, "num_input_tokens_seen": 76304224, "step": 131425 }, { "epoch": 19.575513851653263, "grad_norm": 2.9096503257751465, "learning_rate": 6.862742631473795e-08, "loss": 0.7078, "num_input_tokens_seen": 76307072, "step": 131430 }, { "epoch": 19.57625856419422, "grad_norm": 2.007106065750122, "learning_rate": 6.838703357517628e-08, "loss": 0.5936, "num_input_tokens_seen": 76310240, "step": 131435 }, { "epoch": 19.57700327673518, "grad_norm": 1.234700083732605, "learning_rate": 6.814706202987465e-08, "loss": 0.6499, "num_input_tokens_seen": 76312992, "step": 131440 }, { "epoch": 19.57774798927614, "grad_norm": 2.5761170387268066, "learning_rate": 6.79075116828909e-08, "loss": 0.5711, "num_input_tokens_seen": 76316032, "step": 131445 }, { "epoch": 19.578492701817098, "grad_norm": 2.413905620574951, "learning_rate": 6.766838253826902e-08, "loss": 0.6304, "num_input_tokens_seen": 76318976, "step": 131450 }, { "epoch": 19.579237414358058, "grad_norm": 1.9410847425460815, "learning_rate": 6.742967460005023e-08, "loss": 0.5768, "num_input_tokens_seen": 76321728, "step": 131455 }, { "epoch": 19.57998212689902, "grad_norm": 1.3886579275131226, "learning_rate": 6.719138787226464e-08, "loss": 0.5696, "num_input_tokens_seen": 76324512, "step": 131460 }, { "epoch": 19.580726839439976, "grad_norm": 1.4990166425704956, "learning_rate": 6.695352235894237e-08, "loss": 0.5122, "num_input_tokens_seen": 76327648, "step": 131465 }, { "epoch": 19.581471551980936, "grad_norm": 1.4088752269744873, "learning_rate": 6.671607806409963e-08, "loss": 0.572, "num_input_tokens_seen": 76330336, "step": 131470 }, { "epoch": 19.582216264521893, "grad_norm": 1.4077794551849365, "learning_rate": 6.647905499174712e-08, "loss": 0.6603, "num_input_tokens_seen": 76333088, "step": 131475 }, { "epoch": 19.582960977062854, "grad_norm": 1.1213654279708862, "learning_rate": 6.624245314588994e-08, "loss": 0.5277, "num_input_tokens_seen": 76336288, "step": 131480 }, { "epoch": 19.583705689603814, "grad_norm": 1.5328277349472046, "learning_rate": 6.600627253052216e-08, "loss": 0.5792, "num_input_tokens_seen": 76339008, "step": 131485 }, { "epoch": 19.58445040214477, "grad_norm": 2.102959632873535, "learning_rate": 6.577051314964055e-08, "loss": 0.5396, "num_input_tokens_seen": 76342080, "step": 131490 }, { "epoch": 19.58519511468573, "grad_norm": 2.160780906677246, "learning_rate": 6.55351750072225e-08, "loss": 0.6469, "num_input_tokens_seen": 76344800, "step": 131495 }, { "epoch": 19.58593982722669, "grad_norm": 1.425611138343811, "learning_rate": 6.530025810724539e-08, "loss": 0.5123, "num_input_tokens_seen": 76347680, "step": 131500 }, { "epoch": 19.58668453976765, "grad_norm": 2.3015167713165283, "learning_rate": 6.506576245367824e-08, "loss": 0.5818, "num_input_tokens_seen": 76350624, "step": 131505 }, { "epoch": 19.58742925230861, "grad_norm": 2.885978937149048, "learning_rate": 6.483168805047901e-08, "loss": 0.8052, "num_input_tokens_seen": 76353664, "step": 131510 }, { "epoch": 19.588173964849567, "grad_norm": 3.100693941116333, "learning_rate": 6.459803490160843e-08, "loss": 0.7292, "num_input_tokens_seen": 76356608, "step": 131515 }, { "epoch": 19.588918677390527, "grad_norm": 1.1471415758132935, "learning_rate": 6.436480301101055e-08, "loss": 0.4395, "num_input_tokens_seen": 76359328, "step": 131520 }, { "epoch": 19.589663389931488, "grad_norm": 1.7729111909866333, "learning_rate": 6.413199238262668e-08, "loss": 0.552, "num_input_tokens_seen": 76362144, "step": 131525 }, { "epoch": 19.590408102472445, "grad_norm": 1.0989187955856323, "learning_rate": 6.389960302038978e-08, "loss": 0.4779, "num_input_tokens_seen": 76365408, "step": 131530 }, { "epoch": 19.591152815013405, "grad_norm": 1.870458960533142, "learning_rate": 6.366763492822448e-08, "loss": 0.5301, "num_input_tokens_seen": 76368128, "step": 131535 }, { "epoch": 19.591897527554362, "grad_norm": 1.703736424446106, "learning_rate": 6.343608811004986e-08, "loss": 0.6585, "num_input_tokens_seen": 76371200, "step": 131540 }, { "epoch": 19.592642240095323, "grad_norm": 1.213889479637146, "learning_rate": 6.320496256977671e-08, "loss": 0.5214, "num_input_tokens_seen": 76373856, "step": 131545 }, { "epoch": 19.593386952636283, "grad_norm": 2.2179951667785645, "learning_rate": 6.297425831131299e-08, "loss": 0.5701, "num_input_tokens_seen": 76376736, "step": 131550 }, { "epoch": 19.59413166517724, "grad_norm": 2.504786968231201, "learning_rate": 6.274397533855281e-08, "loss": 0.4616, "num_input_tokens_seen": 76379744, "step": 131555 }, { "epoch": 19.5948763777182, "grad_norm": 0.9291492700576782, "learning_rate": 6.251411365539029e-08, "loss": 0.4428, "num_input_tokens_seen": 76382336, "step": 131560 }, { "epoch": 19.59562109025916, "grad_norm": 1.4862456321716309, "learning_rate": 6.228467326570286e-08, "loss": 0.6033, "num_input_tokens_seen": 76385216, "step": 131565 }, { "epoch": 19.59636580280012, "grad_norm": 2.3178720474243164, "learning_rate": 6.205565417337356e-08, "loss": 0.5066, "num_input_tokens_seen": 76387968, "step": 131570 }, { "epoch": 19.59711051534108, "grad_norm": 1.4503782987594604, "learning_rate": 6.182705638226872e-08, "loss": 0.4592, "num_input_tokens_seen": 76390816, "step": 131575 }, { "epoch": 19.597855227882036, "grad_norm": 2.212279796600342, "learning_rate": 6.159887989624635e-08, "loss": 0.7502, "num_input_tokens_seen": 76393696, "step": 131580 }, { "epoch": 19.598599940422996, "grad_norm": 1.6482642889022827, "learning_rate": 6.137112471916729e-08, "loss": 0.5386, "num_input_tokens_seen": 76396544, "step": 131585 }, { "epoch": 19.599344652963957, "grad_norm": 1.4674415588378906, "learning_rate": 6.114379085487565e-08, "loss": 0.5482, "num_input_tokens_seen": 76399456, "step": 131590 }, { "epoch": 19.600089365504914, "grad_norm": 1.5989962816238403, "learning_rate": 6.091687830721282e-08, "loss": 0.6769, "num_input_tokens_seen": 76402176, "step": 131595 }, { "epoch": 19.600834078045875, "grad_norm": 1.2207107543945312, "learning_rate": 6.069038708001462e-08, "loss": 0.5935, "num_input_tokens_seen": 76404992, "step": 131600 }, { "epoch": 19.601578790586835, "grad_norm": 1.6624282598495483, "learning_rate": 6.046431717710299e-08, "loss": 0.5884, "num_input_tokens_seen": 76407936, "step": 131605 }, { "epoch": 19.602323503127792, "grad_norm": 2.0807955265045166, "learning_rate": 6.023866860229988e-08, "loss": 0.4803, "num_input_tokens_seen": 76410528, "step": 131610 }, { "epoch": 19.603068215668753, "grad_norm": 3.6986019611358643, "learning_rate": 6.001344135941611e-08, "loss": 0.7135, "num_input_tokens_seen": 76413056, "step": 131615 }, { "epoch": 19.60381292820971, "grad_norm": 2.297698497772217, "learning_rate": 5.9788635452257e-08, "loss": 0.6692, "num_input_tokens_seen": 76415776, "step": 131620 }, { "epoch": 19.60455764075067, "grad_norm": 1.1246520280838013, "learning_rate": 5.9564250884622255e-08, "loss": 0.5084, "num_input_tokens_seen": 76418784, "step": 131625 }, { "epoch": 19.60530235329163, "grad_norm": 1.1913565397262573, "learning_rate": 5.934028766030053e-08, "loss": 0.55, "num_input_tokens_seen": 76421888, "step": 131630 }, { "epoch": 19.606047065832588, "grad_norm": 1.1634947061538696, "learning_rate": 5.911674578307491e-08, "loss": 0.6386, "num_input_tokens_seen": 76424768, "step": 131635 }, { "epoch": 19.606791778373548, "grad_norm": 1.6128507852554321, "learning_rate": 5.88936252567257e-08, "loss": 0.7029, "num_input_tokens_seen": 76427744, "step": 131640 }, { "epoch": 19.60753649091451, "grad_norm": 2.394318103790283, "learning_rate": 5.8670926085016564e-08, "loss": 0.5388, "num_input_tokens_seen": 76430496, "step": 131645 }, { "epoch": 19.608281203455466, "grad_norm": 3.1561107635498047, "learning_rate": 5.8448648271713925e-08, "loss": 0.6688, "num_input_tokens_seen": 76433440, "step": 131650 }, { "epoch": 19.609025915996426, "grad_norm": 1.8535773754119873, "learning_rate": 5.822679182057311e-08, "loss": 0.694, "num_input_tokens_seen": 76436576, "step": 131655 }, { "epoch": 19.609770628537383, "grad_norm": 1.609034776687622, "learning_rate": 5.8005356735341135e-08, "loss": 0.4421, "num_input_tokens_seen": 76439488, "step": 131660 }, { "epoch": 19.610515341078344, "grad_norm": 1.8985241651535034, "learning_rate": 5.7784343019759436e-08, "loss": 0.5681, "num_input_tokens_seen": 76442464, "step": 131665 }, { "epoch": 19.611260053619304, "grad_norm": 1.1292506456375122, "learning_rate": 5.756375067755837e-08, "loss": 0.4307, "num_input_tokens_seen": 76445664, "step": 131670 }, { "epoch": 19.61200476616026, "grad_norm": 1.582255244255066, "learning_rate": 5.7343579712468286e-08, "loss": 0.4639, "num_input_tokens_seen": 76448352, "step": 131675 }, { "epoch": 19.61274947870122, "grad_norm": 1.1069343090057373, "learning_rate": 5.712383012820843e-08, "loss": 0.498, "num_input_tokens_seen": 76451200, "step": 131680 }, { "epoch": 19.613494191242182, "grad_norm": 1.6522928476333618, "learning_rate": 5.6904501928489726e-08, "loss": 0.584, "num_input_tokens_seen": 76454240, "step": 131685 }, { "epoch": 19.61423890378314, "grad_norm": 1.2871161699295044, "learning_rate": 5.668559511702032e-08, "loss": 0.561, "num_input_tokens_seen": 76456800, "step": 131690 }, { "epoch": 19.6149836163241, "grad_norm": 1.420419454574585, "learning_rate": 5.646710969749447e-08, "loss": 0.5945, "num_input_tokens_seen": 76459808, "step": 131695 }, { "epoch": 19.615728328865057, "grad_norm": 2.439382791519165, "learning_rate": 5.6249045673606446e-08, "loss": 0.5802, "num_input_tokens_seen": 76463168, "step": 131700 }, { "epoch": 19.616473041406017, "grad_norm": 1.485117793083191, "learning_rate": 5.603140304903942e-08, "loss": 0.691, "num_input_tokens_seen": 76466144, "step": 131705 }, { "epoch": 19.617217753946978, "grad_norm": 2.5124716758728027, "learning_rate": 5.581418182746823e-08, "loss": 0.8506, "num_input_tokens_seen": 76469280, "step": 131710 }, { "epoch": 19.617962466487935, "grad_norm": 1.398783564567566, "learning_rate": 5.55973820125677e-08, "loss": 0.7958, "num_input_tokens_seen": 76471968, "step": 131715 }, { "epoch": 19.618707179028895, "grad_norm": 1.3329178094863892, "learning_rate": 5.538100360799325e-08, "loss": 0.6699, "num_input_tokens_seen": 76474944, "step": 131720 }, { "epoch": 19.619451891569852, "grad_norm": 1.2862564325332642, "learning_rate": 5.516504661740585e-08, "loss": 0.647, "num_input_tokens_seen": 76477888, "step": 131725 }, { "epoch": 19.620196604110813, "grad_norm": 1.082721471786499, "learning_rate": 5.494951104445256e-08, "loss": 0.6074, "num_input_tokens_seen": 76480672, "step": 131730 }, { "epoch": 19.620941316651773, "grad_norm": 1.2067395448684692, "learning_rate": 5.473439689277493e-08, "loss": 0.8056, "num_input_tokens_seen": 76483552, "step": 131735 }, { "epoch": 19.62168602919273, "grad_norm": 1.2226272821426392, "learning_rate": 5.451970416600338e-08, "loss": 0.5572, "num_input_tokens_seen": 76486432, "step": 131740 }, { "epoch": 19.62243074173369, "grad_norm": 1.688048005104065, "learning_rate": 5.430543286777112e-08, "loss": 0.5087, "num_input_tokens_seen": 76489504, "step": 131745 }, { "epoch": 19.62317545427465, "grad_norm": 2.0642998218536377, "learning_rate": 5.4091583001691923e-08, "loss": 0.8245, "num_input_tokens_seen": 76492128, "step": 131750 }, { "epoch": 19.62392016681561, "grad_norm": 2.7010092735290527, "learning_rate": 5.387815457138512e-08, "loss": 0.8015, "num_input_tokens_seen": 76494816, "step": 131755 }, { "epoch": 19.62466487935657, "grad_norm": 1.6583631038665771, "learning_rate": 5.3665147580450604e-08, "loss": 0.7336, "num_input_tokens_seen": 76497888, "step": 131760 }, { "epoch": 19.625409591897526, "grad_norm": 1.3284530639648438, "learning_rate": 5.3452562032488275e-08, "loss": 0.5425, "num_input_tokens_seen": 76500960, "step": 131765 }, { "epoch": 19.626154304438487, "grad_norm": 2.659715414047241, "learning_rate": 5.324039793109248e-08, "loss": 0.5719, "num_input_tokens_seen": 76503808, "step": 131770 }, { "epoch": 19.626899016979447, "grad_norm": 2.3895010948181152, "learning_rate": 5.302865527984369e-08, "loss": 0.5945, "num_input_tokens_seen": 76507008, "step": 131775 }, { "epoch": 19.627643729520404, "grad_norm": 1.6992418766021729, "learning_rate": 5.281733408232237e-08, "loss": 0.5641, "num_input_tokens_seen": 76510112, "step": 131780 }, { "epoch": 19.628388442061365, "grad_norm": 1.416261911392212, "learning_rate": 5.2606434342095115e-08, "loss": 0.548, "num_input_tokens_seen": 76513312, "step": 131785 }, { "epoch": 19.629133154602325, "grad_norm": 1.3597558736801147, "learning_rate": 5.2395956062728515e-08, "loss": 0.5263, "num_input_tokens_seen": 76516096, "step": 131790 }, { "epoch": 19.629877867143282, "grad_norm": 1.404283881187439, "learning_rate": 5.218589924777528e-08, "loss": 0.657, "num_input_tokens_seen": 76519008, "step": 131795 }, { "epoch": 19.630622579684243, "grad_norm": 2.491438627243042, "learning_rate": 5.1976263900788136e-08, "loss": 0.6013, "num_input_tokens_seen": 76521536, "step": 131800 }, { "epoch": 19.6313672922252, "grad_norm": 1.3546233177185059, "learning_rate": 5.176705002530313e-08, "loss": 0.5435, "num_input_tokens_seen": 76524352, "step": 131805 }, { "epoch": 19.63211200476616, "grad_norm": 1.5594727993011475, "learning_rate": 5.155825762485911e-08, "loss": 0.4076, "num_input_tokens_seen": 76527232, "step": 131810 }, { "epoch": 19.63285671730712, "grad_norm": 1.9863989353179932, "learning_rate": 5.134988670298102e-08, "loss": 0.67, "num_input_tokens_seen": 76530208, "step": 131815 }, { "epoch": 19.633601429848078, "grad_norm": 1.5253729820251465, "learning_rate": 5.1141937263188276e-08, "loss": 0.5205, "num_input_tokens_seen": 76533056, "step": 131820 }, { "epoch": 19.634346142389038, "grad_norm": 2.1212098598480225, "learning_rate": 5.093440930899751e-08, "loss": 0.5551, "num_input_tokens_seen": 76536000, "step": 131825 }, { "epoch": 19.63509085493, "grad_norm": 1.2674691677093506, "learning_rate": 5.072730284391425e-08, "loss": 0.5242, "num_input_tokens_seen": 76538720, "step": 131830 }, { "epoch": 19.635835567470956, "grad_norm": 1.8690675497055054, "learning_rate": 5.0520617871432916e-08, "loss": 0.6051, "num_input_tokens_seen": 76541920, "step": 131835 }, { "epoch": 19.636580280011916, "grad_norm": 1.8661201000213623, "learning_rate": 5.0314354395050724e-08, "loss": 0.4984, "num_input_tokens_seen": 76544992, "step": 131840 }, { "epoch": 19.637324992552873, "grad_norm": 1.388755440711975, "learning_rate": 5.010851241824821e-08, "loss": 0.6092, "num_input_tokens_seen": 76547808, "step": 131845 }, { "epoch": 19.638069705093834, "grad_norm": 1.2355408668518066, "learning_rate": 4.990309194450593e-08, "loss": 0.5997, "num_input_tokens_seen": 76550880, "step": 131850 }, { "epoch": 19.638814417634794, "grad_norm": 1.2828385829925537, "learning_rate": 4.9698092977290556e-08, "loss": 0.371, "num_input_tokens_seen": 76553664, "step": 131855 }, { "epoch": 19.63955913017575, "grad_norm": 1.7032294273376465, "learning_rate": 4.9493515520068754e-08, "loss": 0.5648, "num_input_tokens_seen": 76556512, "step": 131860 }, { "epoch": 19.640303842716712, "grad_norm": 2.2765159606933594, "learning_rate": 4.92893595762961e-08, "loss": 0.5979, "num_input_tokens_seen": 76559264, "step": 131865 }, { "epoch": 19.64104855525767, "grad_norm": 1.475662112236023, "learning_rate": 4.908562514941983e-08, "loss": 0.6008, "num_input_tokens_seen": 76562272, "step": 131870 }, { "epoch": 19.64179326779863, "grad_norm": 1.8664686679840088, "learning_rate": 4.888231224288442e-08, "loss": 0.6087, "num_input_tokens_seen": 76565056, "step": 131875 }, { "epoch": 19.64253798033959, "grad_norm": 1.5033072233200073, "learning_rate": 4.867942086012045e-08, "loss": 0.6296, "num_input_tokens_seen": 76567776, "step": 131880 }, { "epoch": 19.643282692880547, "grad_norm": 2.7230865955352783, "learning_rate": 4.847695100456129e-08, "loss": 0.693, "num_input_tokens_seen": 76570464, "step": 131885 }, { "epoch": 19.644027405421507, "grad_norm": 4.1468353271484375, "learning_rate": 4.8274902679623644e-08, "loss": 0.5238, "num_input_tokens_seen": 76573088, "step": 131890 }, { "epoch": 19.644772117962468, "grad_norm": 1.7398746013641357, "learning_rate": 4.807327588871868e-08, "loss": 0.5722, "num_input_tokens_seen": 76576128, "step": 131895 }, { "epoch": 19.645516830503425, "grad_norm": 2.2075345516204834, "learning_rate": 4.7872070635260333e-08, "loss": 0.6495, "num_input_tokens_seen": 76578976, "step": 131900 }, { "epoch": 19.646261543044385, "grad_norm": 1.391730785369873, "learning_rate": 4.7671286922640335e-08, "loss": 0.5527, "num_input_tokens_seen": 76581696, "step": 131905 }, { "epoch": 19.647006255585342, "grad_norm": 1.816384196281433, "learning_rate": 4.7470924754253184e-08, "loss": 0.5249, "num_input_tokens_seen": 76584704, "step": 131910 }, { "epoch": 19.647750968126303, "grad_norm": 1.7829725742340088, "learning_rate": 4.727098413348785e-08, "loss": 0.6521, "num_input_tokens_seen": 76587872, "step": 131915 }, { "epoch": 19.648495680667263, "grad_norm": 1.397947907447815, "learning_rate": 4.707146506371385e-08, "loss": 0.5611, "num_input_tokens_seen": 76590784, "step": 131920 }, { "epoch": 19.64924039320822, "grad_norm": 2.482285261154175, "learning_rate": 4.6872367548309036e-08, "loss": 0.4857, "num_input_tokens_seen": 76593632, "step": 131925 }, { "epoch": 19.64998510574918, "grad_norm": 1.2957342863082886, "learning_rate": 4.6673691590634614e-08, "loss": 0.529, "num_input_tokens_seen": 76596416, "step": 131930 }, { "epoch": 19.65072981829014, "grad_norm": 1.0971252918243408, "learning_rate": 4.6475437194046237e-08, "loss": 0.4271, "num_input_tokens_seen": 76599648, "step": 131935 }, { "epoch": 19.6514745308311, "grad_norm": 1.2980613708496094, "learning_rate": 4.627760436189121e-08, "loss": 0.5587, "num_input_tokens_seen": 76602784, "step": 131940 }, { "epoch": 19.65221924337206, "grad_norm": 2.4159295558929443, "learning_rate": 4.6080193097516877e-08, "loss": 0.5829, "num_input_tokens_seen": 76605536, "step": 131945 }, { "epoch": 19.652963955913016, "grad_norm": 2.3005926609039307, "learning_rate": 4.588320340425667e-08, "loss": 0.4763, "num_input_tokens_seen": 76608256, "step": 131950 }, { "epoch": 19.653708668453977, "grad_norm": 0.8162123560905457, "learning_rate": 4.5686635285432934e-08, "loss": 0.4661, "num_input_tokens_seen": 76611008, "step": 131955 }, { "epoch": 19.654453380994937, "grad_norm": 1.3032904863357544, "learning_rate": 4.5490488744376335e-08, "loss": 0.6529, "num_input_tokens_seen": 76613952, "step": 131960 }, { "epoch": 19.655198093535894, "grad_norm": 1.1715911626815796, "learning_rate": 4.529476378439257e-08, "loss": 0.3754, "num_input_tokens_seen": 76616960, "step": 131965 }, { "epoch": 19.655942806076855, "grad_norm": 1.767094373703003, "learning_rate": 4.50994604087901e-08, "loss": 0.4335, "num_input_tokens_seen": 76619936, "step": 131970 }, { "epoch": 19.656687518617815, "grad_norm": 2.085767984390259, "learning_rate": 4.490457862087183e-08, "loss": 0.5407, "num_input_tokens_seen": 76622656, "step": 131975 }, { "epoch": 19.657432231158772, "grad_norm": 1.8438079357147217, "learning_rate": 4.471011842392403e-08, "loss": 0.6579, "num_input_tokens_seen": 76625536, "step": 131980 }, { "epoch": 19.658176943699733, "grad_norm": 2.566830635070801, "learning_rate": 4.451607982123851e-08, "loss": 0.6132, "num_input_tokens_seen": 76628512, "step": 131985 }, { "epoch": 19.65892165624069, "grad_norm": 1.3154141902923584, "learning_rate": 4.432246281609042e-08, "loss": 0.5509, "num_input_tokens_seen": 76631456, "step": 131990 }, { "epoch": 19.65966636878165, "grad_norm": 1.4825146198272705, "learning_rate": 4.4129267411749386e-08, "loss": 0.4954, "num_input_tokens_seen": 76634304, "step": 131995 }, { "epoch": 19.66041108132261, "grad_norm": 1.5123945474624634, "learning_rate": 4.393649361147944e-08, "loss": 0.7, "num_input_tokens_seen": 76637280, "step": 132000 }, { "epoch": 19.661155793863568, "grad_norm": 1.4070457220077515, "learning_rate": 4.374414141853911e-08, "loss": 0.5117, "num_input_tokens_seen": 76640160, "step": 132005 }, { "epoch": 19.66190050640453, "grad_norm": 1.809403657913208, "learning_rate": 4.355221083617578e-08, "loss": 0.5816, "num_input_tokens_seen": 76643168, "step": 132010 }, { "epoch": 19.662645218945485, "grad_norm": 1.6173702478408813, "learning_rate": 4.336070186763685e-08, "loss": 0.4045, "num_input_tokens_seen": 76645888, "step": 132015 }, { "epoch": 19.663389931486446, "grad_norm": 1.8165241479873657, "learning_rate": 4.316961451615031e-08, "loss": 0.6086, "num_input_tokens_seen": 76648896, "step": 132020 }, { "epoch": 19.664134644027406, "grad_norm": 1.5487051010131836, "learning_rate": 4.297894878494968e-08, "loss": 0.6366, "num_input_tokens_seen": 76652224, "step": 132025 }, { "epoch": 19.664879356568363, "grad_norm": 1.6762380599975586, "learning_rate": 4.27887046772546e-08, "loss": 0.5841, "num_input_tokens_seen": 76655008, "step": 132030 }, { "epoch": 19.665624069109324, "grad_norm": 2.7822399139404297, "learning_rate": 4.2598882196279165e-08, "loss": 0.57, "num_input_tokens_seen": 76657792, "step": 132035 }, { "epoch": 19.666368781650284, "grad_norm": 3.2806613445281982, "learning_rate": 4.240948134522915e-08, "loss": 0.7621, "num_input_tokens_seen": 76660512, "step": 132040 }, { "epoch": 19.66711349419124, "grad_norm": 1.6509491205215454, "learning_rate": 4.2220502127304775e-08, "loss": 0.6787, "num_input_tokens_seen": 76663488, "step": 132045 }, { "epoch": 19.667858206732202, "grad_norm": 2.3931727409362793, "learning_rate": 4.2031944545700696e-08, "loss": 0.8493, "num_input_tokens_seen": 76666144, "step": 132050 }, { "epoch": 19.66860291927316, "grad_norm": 0.9959797263145447, "learning_rate": 4.184380860360049e-08, "loss": 0.4567, "num_input_tokens_seen": 76668864, "step": 132055 }, { "epoch": 19.66934763181412, "grad_norm": 1.0979945659637451, "learning_rate": 4.165609430418216e-08, "loss": 0.4989, "num_input_tokens_seen": 76671936, "step": 132060 }, { "epoch": 19.67009234435508, "grad_norm": 1.8810817003250122, "learning_rate": 4.1468801650618175e-08, "loss": 0.6041, "num_input_tokens_seen": 76674880, "step": 132065 }, { "epoch": 19.670837056896037, "grad_norm": 0.9991785287857056, "learning_rate": 4.128193064606989e-08, "loss": 0.4057, "num_input_tokens_seen": 76677600, "step": 132070 }, { "epoch": 19.671581769436997, "grad_norm": 4.991951942443848, "learning_rate": 4.1095481293698665e-08, "loss": 0.7819, "num_input_tokens_seen": 76680768, "step": 132075 }, { "epoch": 19.672326481977958, "grad_norm": 1.6841312646865845, "learning_rate": 4.0909453596651995e-08, "loss": 0.6774, "num_input_tokens_seen": 76683424, "step": 132080 }, { "epoch": 19.673071194518915, "grad_norm": 2.789991855621338, "learning_rate": 4.0723847558071795e-08, "loss": 0.6093, "num_input_tokens_seen": 76686432, "step": 132085 }, { "epoch": 19.673815907059875, "grad_norm": 3.201822519302368, "learning_rate": 4.0538663181097224e-08, "loss": 0.5925, "num_input_tokens_seen": 76689120, "step": 132090 }, { "epoch": 19.674560619600832, "grad_norm": 0.9598259925842285, "learning_rate": 4.035390046885079e-08, "loss": 0.7833, "num_input_tokens_seen": 76692096, "step": 132095 }, { "epoch": 19.675305332141793, "grad_norm": 1.482399821281433, "learning_rate": 4.016955942446055e-08, "loss": 0.6777, "num_input_tokens_seen": 76695040, "step": 132100 }, { "epoch": 19.676050044682754, "grad_norm": 2.370500326156616, "learning_rate": 3.9985640051035114e-08, "loss": 0.4753, "num_input_tokens_seen": 76697952, "step": 132105 }, { "epoch": 19.67679475722371, "grad_norm": 1.8629257678985596, "learning_rate": 3.980214235168589e-08, "loss": 0.554, "num_input_tokens_seen": 76700512, "step": 132110 }, { "epoch": 19.67753946976467, "grad_norm": 2.0969221591949463, "learning_rate": 3.96190663295104e-08, "loss": 0.5052, "num_input_tokens_seen": 76703680, "step": 132115 }, { "epoch": 19.67828418230563, "grad_norm": 1.0308432579040527, "learning_rate": 3.943641198760062e-08, "loss": 0.4778, "num_input_tokens_seen": 76706400, "step": 132120 }, { "epoch": 19.67902889484659, "grad_norm": 1.0492178201675415, "learning_rate": 3.925417932904574e-08, "loss": 0.5979, "num_input_tokens_seen": 76709344, "step": 132125 }, { "epoch": 19.67977360738755, "grad_norm": 2.81820011138916, "learning_rate": 3.907236835692385e-08, "loss": 0.4694, "num_input_tokens_seen": 76712352, "step": 132130 }, { "epoch": 19.680518319928506, "grad_norm": 1.7519080638885498, "learning_rate": 3.8890979074301946e-08, "loss": 0.582, "num_input_tokens_seen": 76715264, "step": 132135 }, { "epoch": 19.681263032469467, "grad_norm": 1.9977195262908936, "learning_rate": 3.8710011484249795e-08, "loss": 0.5941, "num_input_tokens_seen": 76718272, "step": 132140 }, { "epoch": 19.682007745010427, "grad_norm": 0.7956366539001465, "learning_rate": 3.8529465589820514e-08, "loss": 0.4276, "num_input_tokens_seen": 76721408, "step": 132145 }, { "epoch": 19.682752457551384, "grad_norm": 1.0836262702941895, "learning_rate": 3.8349341394067204e-08, "loss": 0.5295, "num_input_tokens_seen": 76724064, "step": 132150 }, { "epoch": 19.683497170092345, "grad_norm": 1.430395483970642, "learning_rate": 3.816963890003189e-08, "loss": 0.5886, "num_input_tokens_seen": 76726880, "step": 132155 }, { "epoch": 19.684241882633305, "grad_norm": 2.9370036125183105, "learning_rate": 3.799035811075102e-08, "loss": 0.7248, "num_input_tokens_seen": 76729728, "step": 132160 }, { "epoch": 19.684986595174262, "grad_norm": 1.6111761331558228, "learning_rate": 3.7811499029252737e-08, "loss": 0.7161, "num_input_tokens_seen": 76732320, "step": 132165 }, { "epoch": 19.685731307715223, "grad_norm": 0.9767888188362122, "learning_rate": 3.763306165855962e-08, "loss": 0.6028, "num_input_tokens_seen": 76735424, "step": 132170 }, { "epoch": 19.68647602025618, "grad_norm": 1.4037833213806152, "learning_rate": 3.745504600168315e-08, "loss": 0.7471, "num_input_tokens_seen": 76738240, "step": 132175 }, { "epoch": 19.68722073279714, "grad_norm": 1.7369784116744995, "learning_rate": 3.72774520616348e-08, "loss": 0.6235, "num_input_tokens_seen": 76741216, "step": 132180 }, { "epoch": 19.6879654453381, "grad_norm": 2.209397315979004, "learning_rate": 3.7100279841412177e-08, "loss": 0.665, "num_input_tokens_seen": 76743872, "step": 132185 }, { "epoch": 19.688710157879058, "grad_norm": 3.434399127960205, "learning_rate": 3.6923529344007336e-08, "loss": 0.5825, "num_input_tokens_seen": 76746752, "step": 132190 }, { "epoch": 19.68945487042002, "grad_norm": 0.7248722910881042, "learning_rate": 3.674720057240955e-08, "loss": 0.5266, "num_input_tokens_seen": 76749696, "step": 132195 }, { "epoch": 19.69019958296098, "grad_norm": 2.0512847900390625, "learning_rate": 3.657129352959698e-08, "loss": 0.6677, "num_input_tokens_seen": 76753024, "step": 132200 }, { "epoch": 19.690944295501936, "grad_norm": 1.7033512592315674, "learning_rate": 3.63958082185395e-08, "loss": 0.6214, "num_input_tokens_seen": 76756160, "step": 132205 }, { "epoch": 19.691689008042896, "grad_norm": 1.5352622270584106, "learning_rate": 3.622074464220415e-08, "loss": 0.714, "num_input_tokens_seen": 76759168, "step": 132210 }, { "epoch": 19.692433720583853, "grad_norm": 1.7872519493103027, "learning_rate": 3.604610280354692e-08, "loss": 0.6594, "num_input_tokens_seen": 76761760, "step": 132215 }, { "epoch": 19.693178433124814, "grad_norm": 1.2586169242858887, "learning_rate": 3.587188270551822e-08, "loss": 0.5622, "num_input_tokens_seen": 76764672, "step": 132220 }, { "epoch": 19.693923145665774, "grad_norm": 2.011111259460449, "learning_rate": 3.569808435106292e-08, "loss": 0.6407, "num_input_tokens_seen": 76767328, "step": 132225 }, { "epoch": 19.69466785820673, "grad_norm": 2.526233434677124, "learning_rate": 3.552470774311478e-08, "loss": 0.7335, "num_input_tokens_seen": 76770016, "step": 132230 }, { "epoch": 19.695412570747692, "grad_norm": 0.6455752849578857, "learning_rate": 3.535175288460479e-08, "loss": 0.5165, "num_input_tokens_seen": 76773312, "step": 132235 }, { "epoch": 19.69615728328865, "grad_norm": 4.606656551361084, "learning_rate": 3.5179219778452846e-08, "loss": 0.6691, "num_input_tokens_seen": 76776512, "step": 132240 }, { "epoch": 19.69690199582961, "grad_norm": 1.7499576807022095, "learning_rate": 3.500710842757604e-08, "loss": 0.6075, "num_input_tokens_seen": 76779264, "step": 132245 }, { "epoch": 19.69764670837057, "grad_norm": 1.2142994403839111, "learning_rate": 3.483541883487762e-08, "loss": 0.6802, "num_input_tokens_seen": 76782240, "step": 132250 }, { "epoch": 19.698391420911527, "grad_norm": 2.1879794597625732, "learning_rate": 3.466415100326359e-08, "loss": 0.6181, "num_input_tokens_seen": 76785056, "step": 132255 }, { "epoch": 19.699136133452487, "grad_norm": 2.092244863510132, "learning_rate": 3.449330493562608e-08, "loss": 0.7621, "num_input_tokens_seen": 76787744, "step": 132260 }, { "epoch": 19.699880845993448, "grad_norm": 1.8245917558670044, "learning_rate": 3.4322880634851674e-08, "loss": 0.5823, "num_input_tokens_seen": 76790976, "step": 132265 }, { "epoch": 19.700625558534405, "grad_norm": 1.0868620872497559, "learning_rate": 3.415287810381584e-08, "loss": 0.4472, "num_input_tokens_seen": 76793760, "step": 132270 }, { "epoch": 19.701370271075366, "grad_norm": 1.9213130474090576, "learning_rate": 3.3983297345391296e-08, "loss": 0.5911, "num_input_tokens_seen": 76796640, "step": 132275 }, { "epoch": 19.702114983616323, "grad_norm": 1.2093621492385864, "learning_rate": 3.381413836244796e-08, "loss": 0.7019, "num_input_tokens_seen": 76799872, "step": 132280 }, { "epoch": 19.702859696157283, "grad_norm": 1.12258780002594, "learning_rate": 3.36454011578391e-08, "loss": 0.4917, "num_input_tokens_seen": 76802656, "step": 132285 }, { "epoch": 19.703604408698244, "grad_norm": 2.717702865600586, "learning_rate": 3.347708573441521e-08, "loss": 0.5507, "num_input_tokens_seen": 76805568, "step": 132290 }, { "epoch": 19.7043491212392, "grad_norm": 1.3322398662567139, "learning_rate": 3.330919209502126e-08, "loss": 0.5368, "num_input_tokens_seen": 76808480, "step": 132295 }, { "epoch": 19.70509383378016, "grad_norm": 1.3156962394714355, "learning_rate": 3.314172024249662e-08, "loss": 0.689, "num_input_tokens_seen": 76811328, "step": 132300 }, { "epoch": 19.70583854632112, "grad_norm": 1.8268498182296753, "learning_rate": 3.297467017966405e-08, "loss": 0.7277, "num_input_tokens_seen": 76814400, "step": 132305 }, { "epoch": 19.70658325886208, "grad_norm": 1.6735299825668335, "learning_rate": 3.280804190935183e-08, "loss": 0.6481, "num_input_tokens_seen": 76817088, "step": 132310 }, { "epoch": 19.70732797140304, "grad_norm": 1.895098090171814, "learning_rate": 3.264183543436883e-08, "loss": 0.5515, "num_input_tokens_seen": 76819840, "step": 132315 }, { "epoch": 19.708072683943996, "grad_norm": 2.7029740810394287, "learning_rate": 3.2476050757529466e-08, "loss": 0.6825, "num_input_tokens_seen": 76822880, "step": 132320 }, { "epoch": 19.708817396484957, "grad_norm": 0.7475992441177368, "learning_rate": 3.231068788162872e-08, "loss": 0.4861, "num_input_tokens_seen": 76826176, "step": 132325 }, { "epoch": 19.709562109025917, "grad_norm": 1.878872275352478, "learning_rate": 3.214574680946436e-08, "loss": 0.8081, "num_input_tokens_seen": 76829184, "step": 132330 }, { "epoch": 19.710306821566874, "grad_norm": 1.7714881896972656, "learning_rate": 3.198122754382305e-08, "loss": 0.3791, "num_input_tokens_seen": 76832064, "step": 132335 }, { "epoch": 19.711051534107835, "grad_norm": 1.5823508501052856, "learning_rate": 3.181713008748033e-08, "loss": 0.6043, "num_input_tokens_seen": 76835040, "step": 132340 }, { "epoch": 19.711796246648795, "grad_norm": 0.9684354662895203, "learning_rate": 3.1653454443211774e-08, "loss": 0.4235, "num_input_tokens_seen": 76837856, "step": 132345 }, { "epoch": 19.712540959189752, "grad_norm": 1.749889612197876, "learning_rate": 3.1490200613779056e-08, "loss": 0.645, "num_input_tokens_seen": 76840672, "step": 132350 }, { "epoch": 19.713285671730713, "grad_norm": 2.89974308013916, "learning_rate": 3.132736860194385e-08, "loss": 0.7011, "num_input_tokens_seen": 76843520, "step": 132355 }, { "epoch": 19.71403038427167, "grad_norm": 2.0287296772003174, "learning_rate": 3.116495841045675e-08, "loss": 0.7485, "num_input_tokens_seen": 76846528, "step": 132360 }, { "epoch": 19.71477509681263, "grad_norm": 0.9965155124664307, "learning_rate": 3.1002970042059984e-08, "loss": 0.6572, "num_input_tokens_seen": 76849344, "step": 132365 }, { "epoch": 19.71551980935359, "grad_norm": 1.0519963502883911, "learning_rate": 3.084140349949027e-08, "loss": 0.6251, "num_input_tokens_seen": 76852704, "step": 132370 }, { "epoch": 19.716264521894548, "grad_norm": 1.3902112245559692, "learning_rate": 3.0680258785478756e-08, "loss": 0.4261, "num_input_tokens_seen": 76855424, "step": 132375 }, { "epoch": 19.71700923443551, "grad_norm": 1.2411270141601562, "learning_rate": 3.051953590274548e-08, "loss": 0.593, "num_input_tokens_seen": 76858496, "step": 132380 }, { "epoch": 19.717753946976465, "grad_norm": 1.7229074239730835, "learning_rate": 3.035923485400771e-08, "loss": 0.701, "num_input_tokens_seen": 76860960, "step": 132385 }, { "epoch": 19.718498659517426, "grad_norm": 1.7573848962783813, "learning_rate": 3.0199355641971626e-08, "loss": 0.5746, "num_input_tokens_seen": 76863904, "step": 132390 }, { "epoch": 19.719243372058386, "grad_norm": 1.8266652822494507, "learning_rate": 3.003989826934062e-08, "loss": 0.6901, "num_input_tokens_seen": 76866688, "step": 132395 }, { "epoch": 19.719988084599343, "grad_norm": 2.0669562816619873, "learning_rate": 2.9880862738804196e-08, "loss": 0.545, "num_input_tokens_seen": 76869728, "step": 132400 }, { "epoch": 19.720732797140304, "grad_norm": 2.1870741844177246, "learning_rate": 2.9722249053054653e-08, "loss": 0.6277, "num_input_tokens_seen": 76872864, "step": 132405 }, { "epoch": 19.721477509681264, "grad_norm": 1.866544485092163, "learning_rate": 2.9564057214767627e-08, "loss": 0.5852, "num_input_tokens_seen": 76875840, "step": 132410 }, { "epoch": 19.72222222222222, "grad_norm": 1.3455021381378174, "learning_rate": 2.9406287226618756e-08, "loss": 0.5511, "num_input_tokens_seen": 76878848, "step": 132415 }, { "epoch": 19.722966934763182, "grad_norm": 1.4153923988342285, "learning_rate": 2.92489390912698e-08, "loss": 0.4879, "num_input_tokens_seen": 76881792, "step": 132420 }, { "epoch": 19.72371164730414, "grad_norm": 3.3667635917663574, "learning_rate": 2.909201281138252e-08, "loss": 0.7773, "num_input_tokens_seen": 76884832, "step": 132425 }, { "epoch": 19.7244563598451, "grad_norm": 1.5681719779968262, "learning_rate": 2.8935508389607568e-08, "loss": 0.6682, "num_input_tokens_seen": 76887616, "step": 132430 }, { "epoch": 19.72520107238606, "grad_norm": 1.1925491094589233, "learning_rate": 2.8779425828584506e-08, "loss": 0.6095, "num_input_tokens_seen": 76890816, "step": 132435 }, { "epoch": 19.725945784927017, "grad_norm": 1.303954005241394, "learning_rate": 2.862376513095566e-08, "loss": 0.6677, "num_input_tokens_seen": 76894112, "step": 132440 }, { "epoch": 19.726690497467978, "grad_norm": 4.056666851043701, "learning_rate": 2.846852629934671e-08, "loss": 0.8828, "num_input_tokens_seen": 76897056, "step": 132445 }, { "epoch": 19.727435210008938, "grad_norm": 1.9622083902359009, "learning_rate": 2.831370933638333e-08, "loss": 0.6589, "num_input_tokens_seen": 76899872, "step": 132450 }, { "epoch": 19.728179922549895, "grad_norm": 0.9416496157646179, "learning_rate": 2.8159314244680103e-08, "loss": 0.3815, "num_input_tokens_seen": 76902944, "step": 132455 }, { "epoch": 19.728924635090856, "grad_norm": 1.2472388744354248, "learning_rate": 2.800534102684327e-08, "loss": 0.4174, "num_input_tokens_seen": 76905504, "step": 132460 }, { "epoch": 19.729669347631813, "grad_norm": 2.2900006771087646, "learning_rate": 2.7851789685476304e-08, "loss": 0.6674, "num_input_tokens_seen": 76908640, "step": 132465 }, { "epoch": 19.730414060172773, "grad_norm": 2.1373581886291504, "learning_rate": 2.7698660223174355e-08, "loss": 0.635, "num_input_tokens_seen": 76911808, "step": 132470 }, { "epoch": 19.731158772713734, "grad_norm": 1.7198879718780518, "learning_rate": 2.7545952642521466e-08, "loss": 0.7626, "num_input_tokens_seen": 76914688, "step": 132475 }, { "epoch": 19.73190348525469, "grad_norm": 1.4243663549423218, "learning_rate": 2.7393666946098906e-08, "loss": 0.7657, "num_input_tokens_seen": 76917536, "step": 132480 }, { "epoch": 19.73264819779565, "grad_norm": 1.6093041896820068, "learning_rate": 2.7241803136479616e-08, "loss": 0.7034, "num_input_tokens_seen": 76920320, "step": 132485 }, { "epoch": 19.73339291033661, "grad_norm": 3.0087478160858154, "learning_rate": 2.7090361216230987e-08, "loss": 0.6547, "num_input_tokens_seen": 76923392, "step": 132490 }, { "epoch": 19.73413762287757, "grad_norm": 9.039560317993164, "learning_rate": 2.693934118790653e-08, "loss": 0.6516, "num_input_tokens_seen": 76926112, "step": 132495 }, { "epoch": 19.73488233541853, "grad_norm": 1.145159363746643, "learning_rate": 2.678874305405976e-08, "loss": 0.4124, "num_input_tokens_seen": 76929344, "step": 132500 }, { "epoch": 19.735627047959486, "grad_norm": 0.7658141255378723, "learning_rate": 2.663856681723864e-08, "loss": 0.8129, "num_input_tokens_seen": 76932352, "step": 132505 }, { "epoch": 19.736371760500447, "grad_norm": 1.8688322305679321, "learning_rate": 2.6488812479974478e-08, "loss": 0.6413, "num_input_tokens_seen": 76935040, "step": 132510 }, { "epoch": 19.737116473041407, "grad_norm": 1.779903531074524, "learning_rate": 2.6339480044801355e-08, "loss": 0.6237, "num_input_tokens_seen": 76937920, "step": 132515 }, { "epoch": 19.737861185582364, "grad_norm": 2.5878422260284424, "learning_rate": 2.6190569514239484e-08, "loss": 0.6093, "num_input_tokens_seen": 76940544, "step": 132520 }, { "epoch": 19.738605898123325, "grad_norm": 3.2400853633880615, "learning_rate": 2.6042080890806285e-08, "loss": 0.6339, "num_input_tokens_seen": 76943488, "step": 132525 }, { "epoch": 19.73935061066428, "grad_norm": 0.9531382918357849, "learning_rate": 2.5894014177010872e-08, "loss": 0.5602, "num_input_tokens_seen": 76946048, "step": 132530 }, { "epoch": 19.740095323205242, "grad_norm": 1.2319812774658203, "learning_rate": 2.5746369375351242e-08, "loss": 0.6823, "num_input_tokens_seen": 76948960, "step": 132535 }, { "epoch": 19.740840035746203, "grad_norm": 1.819036602973938, "learning_rate": 2.55991464883254e-08, "loss": 0.5866, "num_input_tokens_seen": 76952064, "step": 132540 }, { "epoch": 19.74158474828716, "grad_norm": 0.8964017033576965, "learning_rate": 2.545234551842024e-08, "loss": 0.6089, "num_input_tokens_seen": 76954720, "step": 132545 }, { "epoch": 19.74232946082812, "grad_norm": 2.0235817432403564, "learning_rate": 2.530596646811434e-08, "loss": 0.4594, "num_input_tokens_seen": 76957664, "step": 132550 }, { "epoch": 19.74307417336908, "grad_norm": 1.6227210760116577, "learning_rate": 2.5160009339877944e-08, "loss": 0.5183, "num_input_tokens_seen": 76960704, "step": 132555 }, { "epoch": 19.743818885910038, "grad_norm": 2.5193910598754883, "learning_rate": 2.5014474136184075e-08, "loss": 0.4845, "num_input_tokens_seen": 76963584, "step": 132560 }, { "epoch": 19.744563598451, "grad_norm": 1.643058180809021, "learning_rate": 2.486936085948355e-08, "loss": 0.582, "num_input_tokens_seen": 76966336, "step": 132565 }, { "epoch": 19.74530831099196, "grad_norm": 1.6906170845031738, "learning_rate": 2.4724669512232734e-08, "loss": 0.5516, "num_input_tokens_seen": 76969216, "step": 132570 }, { "epoch": 19.746053023532916, "grad_norm": 2.1159005165100098, "learning_rate": 2.45804000968769e-08, "loss": 0.6641, "num_input_tokens_seen": 76972640, "step": 132575 }, { "epoch": 19.746797736073876, "grad_norm": 1.4677172899246216, "learning_rate": 2.4436552615850205e-08, "loss": 0.5917, "num_input_tokens_seen": 76975488, "step": 132580 }, { "epoch": 19.747542448614833, "grad_norm": 1.5822707414627075, "learning_rate": 2.4293127071584043e-08, "loss": 0.5646, "num_input_tokens_seen": 76978272, "step": 132585 }, { "epoch": 19.748287161155794, "grad_norm": 1.5725432634353638, "learning_rate": 2.4150123466498698e-08, "loss": 0.5425, "num_input_tokens_seen": 76981152, "step": 132590 }, { "epoch": 19.749031873696755, "grad_norm": 1.428148627281189, "learning_rate": 2.400754180301723e-08, "loss": 0.4753, "num_input_tokens_seen": 76983936, "step": 132595 }, { "epoch": 19.74977658623771, "grad_norm": 1.5342391729354858, "learning_rate": 2.38653820835405e-08, "loss": 0.5664, "num_input_tokens_seen": 76986944, "step": 132600 }, { "epoch": 19.750521298778672, "grad_norm": 1.8846122026443481, "learning_rate": 2.3723644310474914e-08, "loss": 0.552, "num_input_tokens_seen": 76990464, "step": 132605 }, { "epoch": 19.75126601131963, "grad_norm": 1.9753988981246948, "learning_rate": 2.3582328486213e-08, "loss": 0.5458, "num_input_tokens_seen": 76993312, "step": 132610 }, { "epoch": 19.75201072386059, "grad_norm": 1.47931969165802, "learning_rate": 2.3441434613141743e-08, "loss": 0.6335, "num_input_tokens_seen": 76996032, "step": 132615 }, { "epoch": 19.75275543640155, "grad_norm": 3.3882014751434326, "learning_rate": 2.3300962693645345e-08, "loss": 0.7452, "num_input_tokens_seen": 76998880, "step": 132620 }, { "epoch": 19.753500148942507, "grad_norm": 1.236594796180725, "learning_rate": 2.3160912730091357e-08, "loss": 0.4384, "num_input_tokens_seen": 77002016, "step": 132625 }, { "epoch": 19.754244861483468, "grad_norm": 3.2218122482299805, "learning_rate": 2.30212847248501e-08, "loss": 0.4323, "num_input_tokens_seen": 77004672, "step": 132630 }, { "epoch": 19.754989574024428, "grad_norm": 2.634329319000244, "learning_rate": 2.288207868027803e-08, "loss": 0.5651, "num_input_tokens_seen": 77007584, "step": 132635 }, { "epoch": 19.755734286565385, "grad_norm": 1.5222113132476807, "learning_rate": 2.2743294598726038e-08, "loss": 0.5356, "num_input_tokens_seen": 77010528, "step": 132640 }, { "epoch": 19.756478999106346, "grad_norm": 1.9596490859985352, "learning_rate": 2.260493248254225e-08, "loss": 0.5692, "num_input_tokens_seen": 77013440, "step": 132645 }, { "epoch": 19.757223711647303, "grad_norm": 2.769371509552002, "learning_rate": 2.246699233406091e-08, "loss": 0.7481, "num_input_tokens_seen": 77016320, "step": 132650 }, { "epoch": 19.757968424188263, "grad_norm": 2.5871341228485107, "learning_rate": 2.2329474155616258e-08, "loss": 0.6049, "num_input_tokens_seen": 77019328, "step": 132655 }, { "epoch": 19.758713136729224, "grad_norm": 1.3547958135604858, "learning_rate": 2.2192377949525888e-08, "loss": 0.6061, "num_input_tokens_seen": 77022368, "step": 132660 }, { "epoch": 19.75945784927018, "grad_norm": 2.177823305130005, "learning_rate": 2.205570371811294e-08, "loss": 0.6277, "num_input_tokens_seen": 77025600, "step": 132665 }, { "epoch": 19.76020256181114, "grad_norm": 2.3228049278259277, "learning_rate": 2.1919451463678353e-08, "loss": 0.5987, "num_input_tokens_seen": 77028384, "step": 132670 }, { "epoch": 19.7609472743521, "grad_norm": 2.974863052368164, "learning_rate": 2.178362118853139e-08, "loss": 0.4963, "num_input_tokens_seen": 77030944, "step": 132675 }, { "epoch": 19.76169198689306, "grad_norm": 1.0687662363052368, "learning_rate": 2.1648212894961884e-08, "loss": 0.621, "num_input_tokens_seen": 77033696, "step": 132680 }, { "epoch": 19.76243669943402, "grad_norm": 1.6104450225830078, "learning_rate": 2.1513226585256897e-08, "loss": 0.6869, "num_input_tokens_seen": 77036576, "step": 132685 }, { "epoch": 19.763181411974976, "grad_norm": 1.2274361848831177, "learning_rate": 2.137866226170071e-08, "loss": 0.4907, "num_input_tokens_seen": 77039584, "step": 132690 }, { "epoch": 19.763926124515937, "grad_norm": 1.8513761758804321, "learning_rate": 2.1244519926566507e-08, "loss": 0.5361, "num_input_tokens_seen": 77042560, "step": 132695 }, { "epoch": 19.764670837056897, "grad_norm": 1.2862893342971802, "learning_rate": 2.111079958211637e-08, "loss": 0.597, "num_input_tokens_seen": 77045440, "step": 132700 }, { "epoch": 19.765415549597854, "grad_norm": 1.2669260501861572, "learning_rate": 2.0977501230612374e-08, "loss": 0.5588, "num_input_tokens_seen": 77048192, "step": 132705 }, { "epoch": 19.766160262138815, "grad_norm": 1.957345724105835, "learning_rate": 2.0844624874305497e-08, "loss": 0.5912, "num_input_tokens_seen": 77051072, "step": 132710 }, { "epoch": 19.766904974679775, "grad_norm": 1.9540355205535889, "learning_rate": 2.0712170515443942e-08, "loss": 0.584, "num_input_tokens_seen": 77053888, "step": 132715 }, { "epoch": 19.767649687220732, "grad_norm": 1.4081515073776245, "learning_rate": 2.0580138156259256e-08, "loss": 0.6121, "num_input_tokens_seen": 77056800, "step": 132720 }, { "epoch": 19.768394399761693, "grad_norm": 2.168210506439209, "learning_rate": 2.0448527798985762e-08, "loss": 0.6527, "num_input_tokens_seen": 77059744, "step": 132725 }, { "epoch": 19.76913911230265, "grad_norm": 1.5101609230041504, "learning_rate": 2.0317339445849458e-08, "loss": 0.6371, "num_input_tokens_seen": 77062720, "step": 132730 }, { "epoch": 19.76988382484361, "grad_norm": 3.199563503265381, "learning_rate": 2.0186573099059693e-08, "loss": 0.5894, "num_input_tokens_seen": 77065632, "step": 132735 }, { "epoch": 19.77062853738457, "grad_norm": 1.403926968574524, "learning_rate": 2.005622876083135e-08, "loss": 0.6159, "num_input_tokens_seen": 77068416, "step": 132740 }, { "epoch": 19.771373249925528, "grad_norm": 1.5341038703918457, "learning_rate": 1.9926306433362683e-08, "loss": 0.5903, "num_input_tokens_seen": 77071424, "step": 132745 }, { "epoch": 19.77211796246649, "grad_norm": 1.3458448648452759, "learning_rate": 1.979680611885193e-08, "loss": 0.5493, "num_input_tokens_seen": 77074336, "step": 132750 }, { "epoch": 19.772862675007445, "grad_norm": 2.9120686054229736, "learning_rate": 1.9667727819486225e-08, "loss": 0.5126, "num_input_tokens_seen": 77077376, "step": 132755 }, { "epoch": 19.773607387548406, "grad_norm": 1.093198537826538, "learning_rate": 1.9539071537444387e-08, "loss": 0.5914, "num_input_tokens_seen": 77080032, "step": 132760 }, { "epoch": 19.774352100089367, "grad_norm": 1.4666064977645874, "learning_rate": 1.9410837274899674e-08, "loss": 0.5843, "num_input_tokens_seen": 77082720, "step": 132765 }, { "epoch": 19.775096812630323, "grad_norm": 1.3767262697219849, "learning_rate": 1.9283025034019797e-08, "loss": 0.446, "num_input_tokens_seen": 77085888, "step": 132770 }, { "epoch": 19.775841525171284, "grad_norm": 1.2523092031478882, "learning_rate": 1.9155634816966916e-08, "loss": 0.5813, "num_input_tokens_seen": 77088960, "step": 132775 }, { "epoch": 19.776586237712245, "grad_norm": 1.5880553722381592, "learning_rate": 1.902866662588654e-08, "loss": 0.5808, "num_input_tokens_seen": 77092064, "step": 132780 }, { "epoch": 19.7773309502532, "grad_norm": 2.271063804626465, "learning_rate": 1.890212046292972e-08, "loss": 0.5093, "num_input_tokens_seen": 77094880, "step": 132785 }, { "epoch": 19.778075662794162, "grad_norm": 1.677746295928955, "learning_rate": 1.877599633023086e-08, "loss": 0.7033, "num_input_tokens_seen": 77097568, "step": 132790 }, { "epoch": 19.77882037533512, "grad_norm": 1.4942253828048706, "learning_rate": 1.865029422992437e-08, "loss": 0.5888, "num_input_tokens_seen": 77100416, "step": 132795 }, { "epoch": 19.77956508787608, "grad_norm": 1.2347979545593262, "learning_rate": 1.8525014164127996e-08, "loss": 0.5317, "num_input_tokens_seen": 77103360, "step": 132800 }, { "epoch": 19.78030980041704, "grad_norm": 1.7264710664749146, "learning_rate": 1.8400156134962265e-08, "loss": 0.6536, "num_input_tokens_seen": 77106272, "step": 132805 }, { "epoch": 19.781054512957997, "grad_norm": 3.0671443939208984, "learning_rate": 1.82757201445366e-08, "loss": 0.7145, "num_input_tokens_seen": 77109024, "step": 132810 }, { "epoch": 19.781799225498958, "grad_norm": 1.0531337261199951, "learning_rate": 1.8151706194952102e-08, "loss": 0.4577, "num_input_tokens_seen": 77111808, "step": 132815 }, { "epoch": 19.782543938039918, "grad_norm": 1.4765889644622803, "learning_rate": 1.8028114288301535e-08, "loss": 0.5436, "num_input_tokens_seen": 77114816, "step": 132820 }, { "epoch": 19.783288650580875, "grad_norm": 1.4922916889190674, "learning_rate": 1.7904944426677673e-08, "loss": 0.5337, "num_input_tokens_seen": 77117728, "step": 132825 }, { "epoch": 19.784033363121836, "grad_norm": 2.1549019813537598, "learning_rate": 1.7782196612159406e-08, "loss": 0.5256, "num_input_tokens_seen": 77120416, "step": 132830 }, { "epoch": 19.784778075662793, "grad_norm": 1.577531337738037, "learning_rate": 1.7659870846820083e-08, "loss": 0.559, "num_input_tokens_seen": 77123648, "step": 132835 }, { "epoch": 19.785522788203753, "grad_norm": 1.593701720237732, "learning_rate": 1.7537967132727483e-08, "loss": 0.628, "num_input_tokens_seen": 77126464, "step": 132840 }, { "epoch": 19.786267500744714, "grad_norm": 1.9680575132369995, "learning_rate": 1.7416485471938304e-08, "loss": 0.584, "num_input_tokens_seen": 77129312, "step": 132845 }, { "epoch": 19.78701221328567, "grad_norm": 1.1843421459197998, "learning_rate": 1.7295425866506453e-08, "loss": 0.5117, "num_input_tokens_seen": 77132032, "step": 132850 }, { "epoch": 19.78775692582663, "grad_norm": 2.4274191856384277, "learning_rate": 1.7174788318477518e-08, "loss": 0.6567, "num_input_tokens_seen": 77134880, "step": 132855 }, { "epoch": 19.788501638367592, "grad_norm": 1.3719995021820068, "learning_rate": 1.705457282988876e-08, "loss": 0.4675, "num_input_tokens_seen": 77137472, "step": 132860 }, { "epoch": 19.78924635090855, "grad_norm": 1.1965224742889404, "learning_rate": 1.6934779402771884e-08, "loss": 0.4752, "num_input_tokens_seen": 77140320, "step": 132865 }, { "epoch": 19.78999106344951, "grad_norm": 2.0004029273986816, "learning_rate": 1.681540803915027e-08, "loss": 0.6378, "num_input_tokens_seen": 77143104, "step": 132870 }, { "epoch": 19.790735775990466, "grad_norm": 1.823888897895813, "learning_rate": 1.669645874103898e-08, "loss": 0.5673, "num_input_tokens_seen": 77146240, "step": 132875 }, { "epoch": 19.791480488531427, "grad_norm": 1.4366415739059448, "learning_rate": 1.6577931510450283e-08, "loss": 0.6905, "num_input_tokens_seen": 77149344, "step": 132880 }, { "epoch": 19.792225201072387, "grad_norm": 2.2812488079071045, "learning_rate": 1.6459826349385365e-08, "loss": 0.5384, "num_input_tokens_seen": 77152352, "step": 132885 }, { "epoch": 19.792969913613344, "grad_norm": 1.2777177095413208, "learning_rate": 1.6342143259839848e-08, "loss": 0.5424, "num_input_tokens_seen": 77155168, "step": 132890 }, { "epoch": 19.793714626154305, "grad_norm": 1.6371665000915527, "learning_rate": 1.622488224380103e-08, "loss": 0.6662, "num_input_tokens_seen": 77157920, "step": 132895 }, { "epoch": 19.794459338695262, "grad_norm": 1.1265301704406738, "learning_rate": 1.6108043303250664e-08, "loss": 0.5801, "num_input_tokens_seen": 77160896, "step": 132900 }, { "epoch": 19.795204051236222, "grad_norm": 1.5017235279083252, "learning_rate": 1.5991626440162165e-08, "loss": 0.6287, "num_input_tokens_seen": 77163904, "step": 132905 }, { "epoch": 19.795948763777183, "grad_norm": 2.4754998683929443, "learning_rate": 1.587563165650341e-08, "loss": 0.8914, "num_input_tokens_seen": 77166688, "step": 132910 }, { "epoch": 19.79669347631814, "grad_norm": 2.109807014465332, "learning_rate": 1.5760058954233935e-08, "loss": 0.4877, "num_input_tokens_seen": 77169760, "step": 132915 }, { "epoch": 19.7974381888591, "grad_norm": 1.3709678649902344, "learning_rate": 1.564490833530219e-08, "loss": 0.5387, "num_input_tokens_seen": 77172704, "step": 132920 }, { "epoch": 19.79818290140006, "grad_norm": 1.3607730865478516, "learning_rate": 1.5530179801659382e-08, "loss": 0.5876, "num_input_tokens_seen": 77175680, "step": 132925 }, { "epoch": 19.798927613941018, "grad_norm": 1.019281029701233, "learning_rate": 1.5415873355240086e-08, "loss": 0.4141, "num_input_tokens_seen": 77178784, "step": 132930 }, { "epoch": 19.79967232648198, "grad_norm": 1.3216886520385742, "learning_rate": 1.5301988997978857e-08, "loss": 0.5711, "num_input_tokens_seen": 77181536, "step": 132935 }, { "epoch": 19.800417039022935, "grad_norm": 1.2775802612304688, "learning_rate": 1.5188526731793608e-08, "loss": 0.5098, "num_input_tokens_seen": 77184512, "step": 132940 }, { "epoch": 19.801161751563896, "grad_norm": 1.7549690008163452, "learning_rate": 1.50754865586078e-08, "loss": 0.6712, "num_input_tokens_seen": 77187200, "step": 132945 }, { "epoch": 19.801906464104857, "grad_norm": 5.225903034210205, "learning_rate": 1.4962868480325465e-08, "loss": 0.9495, "num_input_tokens_seen": 77189952, "step": 132950 }, { "epoch": 19.802651176645814, "grad_norm": 1.5621588230133057, "learning_rate": 1.4850672498853413e-08, "loss": 0.5199, "num_input_tokens_seen": 77192896, "step": 132955 }, { "epoch": 19.803395889186774, "grad_norm": 1.454188346862793, "learning_rate": 1.4738898616084573e-08, "loss": 0.7143, "num_input_tokens_seen": 77195872, "step": 132960 }, { "epoch": 19.804140601727735, "grad_norm": 1.5938270092010498, "learning_rate": 1.4627546833909101e-08, "loss": 0.6063, "num_input_tokens_seen": 77198752, "step": 132965 }, { "epoch": 19.80488531426869, "grad_norm": 2.533205509185791, "learning_rate": 1.4516617154206048e-08, "loss": 0.4854, "num_input_tokens_seen": 77201600, "step": 132970 }, { "epoch": 19.805630026809652, "grad_norm": 1.2293641567230225, "learning_rate": 1.440610957885169e-08, "loss": 0.5417, "num_input_tokens_seen": 77204512, "step": 132975 }, { "epoch": 19.80637473935061, "grad_norm": 1.6709293127059937, "learning_rate": 1.4296024109711203e-08, "loss": 0.5425, "num_input_tokens_seen": 77207136, "step": 132980 }, { "epoch": 19.80711945189157, "grad_norm": 1.2544631958007812, "learning_rate": 1.4186360748644212e-08, "loss": 0.7357, "num_input_tokens_seen": 77210112, "step": 132985 }, { "epoch": 19.80786416443253, "grad_norm": 1.0138623714447021, "learning_rate": 1.4077119497507562e-08, "loss": 0.4624, "num_input_tokens_seen": 77213088, "step": 132990 }, { "epoch": 19.808608876973487, "grad_norm": 1.3270941972732544, "learning_rate": 1.3968300358138675e-08, "loss": 0.6665, "num_input_tokens_seen": 77215968, "step": 132995 }, { "epoch": 19.809353589514448, "grad_norm": 1.1445934772491455, "learning_rate": 1.3859903332383296e-08, "loss": 0.4506, "num_input_tokens_seen": 77218912, "step": 133000 }, { "epoch": 19.81009830205541, "grad_norm": 1.8180222511291504, "learning_rate": 1.3751928422070515e-08, "loss": 0.6059, "num_input_tokens_seen": 77221760, "step": 133005 }, { "epoch": 19.810843014596365, "grad_norm": 2.0152456760406494, "learning_rate": 1.3644375629023875e-08, "loss": 0.6719, "num_input_tokens_seen": 77224672, "step": 133010 }, { "epoch": 19.811587727137326, "grad_norm": 1.0040384531021118, "learning_rate": 1.3537244955061368e-08, "loss": 0.352, "num_input_tokens_seen": 77227712, "step": 133015 }, { "epoch": 19.812332439678283, "grad_norm": 1.3716492652893066, "learning_rate": 1.343053640198988e-08, "loss": 0.5459, "num_input_tokens_seen": 77230528, "step": 133020 }, { "epoch": 19.813077152219243, "grad_norm": 1.38121497631073, "learning_rate": 1.3324249971613523e-08, "loss": 0.655, "num_input_tokens_seen": 77233344, "step": 133025 }, { "epoch": 19.813821864760204, "grad_norm": 2.477508068084717, "learning_rate": 1.3218385665730859e-08, "loss": 0.4542, "num_input_tokens_seen": 77236384, "step": 133030 }, { "epoch": 19.81456657730116, "grad_norm": 1.508248209953308, "learning_rate": 1.3112943486129347e-08, "loss": 0.6308, "num_input_tokens_seen": 77239520, "step": 133035 }, { "epoch": 19.81531128984212, "grad_norm": 1.7703875303268433, "learning_rate": 1.3007923434585345e-08, "loss": 0.4985, "num_input_tokens_seen": 77242432, "step": 133040 }, { "epoch": 19.816056002383082, "grad_norm": 1.934226155281067, "learning_rate": 1.290332551288076e-08, "loss": 0.532, "num_input_tokens_seen": 77245280, "step": 133045 }, { "epoch": 19.81680071492404, "grad_norm": 1.4017534255981445, "learning_rate": 1.2799149722775294e-08, "loss": 0.7112, "num_input_tokens_seen": 77248320, "step": 133050 }, { "epoch": 19.817545427465, "grad_norm": 2.028315544128418, "learning_rate": 1.2695396066034205e-08, "loss": 0.4954, "num_input_tokens_seen": 77251424, "step": 133055 }, { "epoch": 19.818290140005956, "grad_norm": 1.5324273109436035, "learning_rate": 1.2592064544408866e-08, "loss": 0.6046, "num_input_tokens_seen": 77254432, "step": 133060 }, { "epoch": 19.819034852546917, "grad_norm": 1.509200930595398, "learning_rate": 1.248915515964233e-08, "loss": 0.4397, "num_input_tokens_seen": 77257600, "step": 133065 }, { "epoch": 19.819779565087877, "grad_norm": 2.3050527572631836, "learning_rate": 1.2386667913477645e-08, "loss": 0.6632, "num_input_tokens_seen": 77260512, "step": 133070 }, { "epoch": 19.820524277628834, "grad_norm": 2.47007417678833, "learning_rate": 1.228460280764121e-08, "loss": 0.5404, "num_input_tokens_seen": 77263840, "step": 133075 }, { "epoch": 19.821268990169795, "grad_norm": 1.8065723180770874, "learning_rate": 1.2182959843862196e-08, "loss": 0.6568, "num_input_tokens_seen": 77266752, "step": 133080 }, { "epoch": 19.822013702710755, "grad_norm": 1.6324553489685059, "learning_rate": 1.2081739023855899e-08, "loss": 0.6776, "num_input_tokens_seen": 77269568, "step": 133085 }, { "epoch": 19.822758415251712, "grad_norm": 1.8110885620117188, "learning_rate": 1.198094034933206e-08, "loss": 0.6052, "num_input_tokens_seen": 77272416, "step": 133090 }, { "epoch": 19.823503127792673, "grad_norm": 2.0926644802093506, "learning_rate": 1.1880563821992096e-08, "loss": 0.5206, "num_input_tokens_seen": 77274912, "step": 133095 }, { "epoch": 19.82424784033363, "grad_norm": 1.2872049808502197, "learning_rate": 1.1780609443534652e-08, "loss": 0.5492, "num_input_tokens_seen": 77277632, "step": 133100 }, { "epoch": 19.82499255287459, "grad_norm": 3.4921956062316895, "learning_rate": 1.1681077215644487e-08, "loss": 0.8765, "num_input_tokens_seen": 77280864, "step": 133105 }, { "epoch": 19.82573726541555, "grad_norm": 1.3835232257843018, "learning_rate": 1.1581967140009142e-08, "loss": 0.4406, "num_input_tokens_seen": 77283680, "step": 133110 }, { "epoch": 19.826481977956508, "grad_norm": 2.283860921859741, "learning_rate": 1.1483279218296728e-08, "loss": 0.6313, "num_input_tokens_seen": 77286752, "step": 133115 }, { "epoch": 19.82722669049747, "grad_norm": 1.5508677959442139, "learning_rate": 1.1385013452178128e-08, "loss": 0.5266, "num_input_tokens_seen": 77289632, "step": 133120 }, { "epoch": 19.827971403038426, "grad_norm": 1.2637298107147217, "learning_rate": 1.1287169843313127e-08, "loss": 0.4674, "num_input_tokens_seen": 77292512, "step": 133125 }, { "epoch": 19.828716115579386, "grad_norm": 1.013427972793579, "learning_rate": 1.1189748393353184e-08, "loss": 0.5341, "num_input_tokens_seen": 77295232, "step": 133130 }, { "epoch": 19.829460828120347, "grad_norm": 1.0809037685394287, "learning_rate": 1.10927491039442e-08, "loss": 0.5657, "num_input_tokens_seen": 77298112, "step": 133135 }, { "epoch": 19.830205540661304, "grad_norm": 1.68605637550354, "learning_rate": 1.0996171976726532e-08, "loss": 0.6935, "num_input_tokens_seen": 77301248, "step": 133140 }, { "epoch": 19.830950253202264, "grad_norm": 1.6204949617385864, "learning_rate": 1.0900017013329434e-08, "loss": 0.899, "num_input_tokens_seen": 77304096, "step": 133145 }, { "epoch": 19.831694965743225, "grad_norm": 1.7262622117996216, "learning_rate": 1.0804284215379379e-08, "loss": 0.5304, "num_input_tokens_seen": 77306880, "step": 133150 }, { "epoch": 19.83243967828418, "grad_norm": 2.122002363204956, "learning_rate": 1.070897358449452e-08, "loss": 0.5251, "num_input_tokens_seen": 77309664, "step": 133155 }, { "epoch": 19.833184390825142, "grad_norm": 1.9540822505950928, "learning_rate": 1.0614085122281902e-08, "loss": 0.5891, "num_input_tokens_seen": 77312800, "step": 133160 }, { "epoch": 19.8339291033661, "grad_norm": 2.3669862747192383, "learning_rate": 1.0519618830348577e-08, "loss": 0.5427, "num_input_tokens_seen": 77315648, "step": 133165 }, { "epoch": 19.83467381590706, "grad_norm": 3.0208511352539062, "learning_rate": 1.0425574710284936e-08, "loss": 0.6583, "num_input_tokens_seen": 77318688, "step": 133170 }, { "epoch": 19.83541852844802, "grad_norm": 2.171442747116089, "learning_rate": 1.033195276368415e-08, "loss": 0.5813, "num_input_tokens_seen": 77321472, "step": 133175 }, { "epoch": 19.836163240988977, "grad_norm": 1.3193252086639404, "learning_rate": 1.0238752992128287e-08, "loss": 0.5989, "num_input_tokens_seen": 77324384, "step": 133180 }, { "epoch": 19.836907953529938, "grad_norm": 1.3410247564315796, "learning_rate": 1.0145975397188311e-08, "loss": 0.4709, "num_input_tokens_seen": 77327264, "step": 133185 }, { "epoch": 19.8376526660709, "grad_norm": 1.3904767036437988, "learning_rate": 1.0053619980435191e-08, "loss": 0.5784, "num_input_tokens_seen": 77330272, "step": 133190 }, { "epoch": 19.838397378611855, "grad_norm": 2.146587371826172, "learning_rate": 9.961686743426012e-09, "loss": 0.6302, "num_input_tokens_seen": 77333216, "step": 133195 }, { "epoch": 19.839142091152816, "grad_norm": 1.297154188156128, "learning_rate": 9.870175687715089e-09, "loss": 0.4864, "num_input_tokens_seen": 77335840, "step": 133200 }, { "epoch": 19.839886803693773, "grad_norm": 1.3031792640686035, "learning_rate": 9.77908681485118e-09, "loss": 0.6289, "num_input_tokens_seen": 77338528, "step": 133205 }, { "epoch": 19.840631516234733, "grad_norm": 1.0658612251281738, "learning_rate": 9.688420126369168e-09, "loss": 0.6242, "num_input_tokens_seen": 77341472, "step": 133210 }, { "epoch": 19.841376228775694, "grad_norm": 3.7203755378723145, "learning_rate": 9.598175623801165e-09, "loss": 0.6308, "num_input_tokens_seen": 77344320, "step": 133215 }, { "epoch": 19.84212094131665, "grad_norm": 1.8742331266403198, "learning_rate": 9.508353308673723e-09, "loss": 0.6626, "num_input_tokens_seen": 77347200, "step": 133220 }, { "epoch": 19.84286565385761, "grad_norm": 1.2721781730651855, "learning_rate": 9.4189531825023e-09, "loss": 0.5105, "num_input_tokens_seen": 77349856, "step": 133225 }, { "epoch": 19.843610366398572, "grad_norm": 2.630413770675659, "learning_rate": 9.329975246799571e-09, "loss": 0.8509, "num_input_tokens_seen": 77352480, "step": 133230 }, { "epoch": 19.84435507893953, "grad_norm": 1.2508906126022339, "learning_rate": 9.241419503069892e-09, "loss": 0.4496, "num_input_tokens_seen": 77355168, "step": 133235 }, { "epoch": 19.84509979148049, "grad_norm": 1.6373400688171387, "learning_rate": 9.153285952803736e-09, "loss": 0.7122, "num_input_tokens_seen": 77358240, "step": 133240 }, { "epoch": 19.845844504021446, "grad_norm": 0.564106285572052, "learning_rate": 9.065574597494352e-09, "loss": 0.3466, "num_input_tokens_seen": 77361120, "step": 133245 }, { "epoch": 19.846589216562407, "grad_norm": 1.2656399011611938, "learning_rate": 8.978285438621115e-09, "loss": 0.6508, "num_input_tokens_seen": 77364000, "step": 133250 }, { "epoch": 19.847333929103367, "grad_norm": 2.9707114696502686, "learning_rate": 8.891418477660617e-09, "loss": 0.6218, "num_input_tokens_seen": 77366752, "step": 133255 }, { "epoch": 19.848078641644324, "grad_norm": 1.691328763961792, "learning_rate": 8.804973716081132e-09, "loss": 0.4248, "num_input_tokens_seen": 77369728, "step": 133260 }, { "epoch": 19.848823354185285, "grad_norm": 1.0953168869018555, "learning_rate": 8.718951155339827e-09, "loss": 0.7228, "num_input_tokens_seen": 77372896, "step": 133265 }, { "epoch": 19.849568066726242, "grad_norm": 1.3464170694351196, "learning_rate": 8.633350796893869e-09, "loss": 0.3873, "num_input_tokens_seen": 77375840, "step": 133270 }, { "epoch": 19.850312779267203, "grad_norm": 1.1312617063522339, "learning_rate": 8.548172642186547e-09, "loss": 0.4881, "num_input_tokens_seen": 77378688, "step": 133275 }, { "epoch": 19.851057491808163, "grad_norm": 1.413800597190857, "learning_rate": 8.463416692658377e-09, "loss": 0.5294, "num_input_tokens_seen": 77381760, "step": 133280 }, { "epoch": 19.85180220434912, "grad_norm": 1.3941446542739868, "learning_rate": 8.37908294973877e-09, "loss": 0.6055, "num_input_tokens_seen": 77384352, "step": 133285 }, { "epoch": 19.85254691689008, "grad_norm": 1.4580078125, "learning_rate": 8.295171414854363e-09, "loss": 0.6027, "num_input_tokens_seen": 77387264, "step": 133290 }, { "epoch": 19.85329162943104, "grad_norm": 1.2693859338760376, "learning_rate": 8.211682089423467e-09, "loss": 0.6075, "num_input_tokens_seen": 77389888, "step": 133295 }, { "epoch": 19.854036341971998, "grad_norm": 1.4346230030059814, "learning_rate": 8.128614974856064e-09, "loss": 0.4254, "num_input_tokens_seen": 77392928, "step": 133300 }, { "epoch": 19.85478105451296, "grad_norm": 1.6034015417099, "learning_rate": 8.04597007255381e-09, "loss": 0.6785, "num_input_tokens_seen": 77395936, "step": 133305 }, { "epoch": 19.855525767053916, "grad_norm": 1.4971016645431519, "learning_rate": 7.963747383915587e-09, "loss": 0.5821, "num_input_tokens_seen": 77398848, "step": 133310 }, { "epoch": 19.856270479594876, "grad_norm": 1.182718276977539, "learning_rate": 7.881946910329175e-09, "loss": 0.516, "num_input_tokens_seen": 77401664, "step": 133315 }, { "epoch": 19.857015192135837, "grad_norm": 1.4644474983215332, "learning_rate": 7.800568653174023e-09, "loss": 0.5622, "num_input_tokens_seen": 77404608, "step": 133320 }, { "epoch": 19.857759904676794, "grad_norm": 2.021878957748413, "learning_rate": 7.719612613829586e-09, "loss": 0.8182, "num_input_tokens_seen": 77407552, "step": 133325 }, { "epoch": 19.858504617217754, "grad_norm": 0.668481171131134, "learning_rate": 7.639078793661436e-09, "loss": 0.5089, "num_input_tokens_seen": 77410624, "step": 133330 }, { "epoch": 19.859249329758715, "grad_norm": 1.726362705230713, "learning_rate": 7.558967194029598e-09, "loss": 0.7829, "num_input_tokens_seen": 77413344, "step": 133335 }, { "epoch": 19.85999404229967, "grad_norm": 1.307720422744751, "learning_rate": 7.479277816285768e-09, "loss": 0.5205, "num_input_tokens_seen": 77415936, "step": 133340 }, { "epoch": 19.860738754840632, "grad_norm": 2.5059404373168945, "learning_rate": 7.400010661781642e-09, "loss": 0.7223, "num_input_tokens_seen": 77418944, "step": 133345 }, { "epoch": 19.86148346738159, "grad_norm": 1.6098326444625854, "learning_rate": 7.321165731849488e-09, "loss": 0.6694, "num_input_tokens_seen": 77422016, "step": 133350 }, { "epoch": 19.86222817992255, "grad_norm": 0.8611371517181396, "learning_rate": 7.242743027827126e-09, "loss": 0.4138, "num_input_tokens_seen": 77424672, "step": 133355 }, { "epoch": 19.86297289246351, "grad_norm": 2.1762940883636475, "learning_rate": 7.1647425510384944e-09, "loss": 0.6529, "num_input_tokens_seen": 77427616, "step": 133360 }, { "epoch": 19.863717605004467, "grad_norm": 1.1046254634857178, "learning_rate": 7.087164302796434e-09, "loss": 0.5479, "num_input_tokens_seen": 77430368, "step": 133365 }, { "epoch": 19.864462317545428, "grad_norm": 1.5409808158874512, "learning_rate": 7.010008284416558e-09, "loss": 0.6836, "num_input_tokens_seen": 77433312, "step": 133370 }, { "epoch": 19.86520703008639, "grad_norm": 1.226192831993103, "learning_rate": 6.933274497200604e-09, "loss": 0.5014, "num_input_tokens_seen": 77436128, "step": 133375 }, { "epoch": 19.865951742627345, "grad_norm": 1.064908504486084, "learning_rate": 6.856962942447531e-09, "loss": 0.5915, "num_input_tokens_seen": 77438944, "step": 133380 }, { "epoch": 19.866696455168306, "grad_norm": 1.2474075555801392, "learning_rate": 6.781073621442424e-09, "loss": 0.6279, "num_input_tokens_seen": 77441824, "step": 133385 }, { "epoch": 19.867441167709263, "grad_norm": 1.2845392227172852, "learning_rate": 6.70560653546759e-09, "loss": 0.4683, "num_input_tokens_seen": 77444576, "step": 133390 }, { "epoch": 19.868185880250223, "grad_norm": 1.33815336227417, "learning_rate": 6.6305616857997855e-09, "loss": 0.4999, "num_input_tokens_seen": 77447872, "step": 133395 }, { "epoch": 19.868930592791184, "grad_norm": 1.1649432182312012, "learning_rate": 6.55593907370744e-09, "loss": 0.5944, "num_input_tokens_seen": 77450976, "step": 133400 }, { "epoch": 19.86967530533214, "grad_norm": 1.7153940200805664, "learning_rate": 6.481738700450657e-09, "loss": 0.6053, "num_input_tokens_seen": 77453856, "step": 133405 }, { "epoch": 19.8704200178731, "grad_norm": 2.0585403442382812, "learning_rate": 6.4079605672839886e-09, "loss": 0.6458, "num_input_tokens_seen": 77456736, "step": 133410 }, { "epoch": 19.87116473041406, "grad_norm": 2.3316657543182373, "learning_rate": 6.334604675450884e-09, "loss": 0.6641, "num_input_tokens_seen": 77459488, "step": 133415 }, { "epoch": 19.87190944295502, "grad_norm": 2.4390299320220947, "learning_rate": 6.2616710261920176e-09, "loss": 0.5836, "num_input_tokens_seen": 77462464, "step": 133420 }, { "epoch": 19.87265415549598, "grad_norm": 1.0608243942260742, "learning_rate": 6.189159620739737e-09, "loss": 0.4619, "num_input_tokens_seen": 77465280, "step": 133425 }, { "epoch": 19.873398868036936, "grad_norm": 2.8635294437408447, "learning_rate": 6.117070460318064e-09, "loss": 0.4284, "num_input_tokens_seen": 77467936, "step": 133430 }, { "epoch": 19.874143580577897, "grad_norm": 1.3048152923583984, "learning_rate": 6.045403546148243e-09, "loss": 0.4986, "num_input_tokens_seen": 77470816, "step": 133435 }, { "epoch": 19.874888293118858, "grad_norm": 1.4628148078918457, "learning_rate": 5.974158879434866e-09, "loss": 0.5413, "num_input_tokens_seen": 77473472, "step": 133440 }, { "epoch": 19.875633005659815, "grad_norm": 2.276759386062622, "learning_rate": 5.903336461388076e-09, "loss": 0.6596, "num_input_tokens_seen": 77476864, "step": 133445 }, { "epoch": 19.876377718200775, "grad_norm": 2.2379729747772217, "learning_rate": 5.832936293201363e-09, "loss": 0.7201, "num_input_tokens_seen": 77479744, "step": 133450 }, { "epoch": 19.877122430741732, "grad_norm": 1.7310272455215454, "learning_rate": 5.762958376065441e-09, "loss": 0.462, "num_input_tokens_seen": 77482400, "step": 133455 }, { "epoch": 19.877867143282693, "grad_norm": 2.3029584884643555, "learning_rate": 5.693402711159923e-09, "loss": 0.5403, "num_input_tokens_seen": 77485600, "step": 133460 }, { "epoch": 19.878611855823653, "grad_norm": 1.8254146575927734, "learning_rate": 5.6242692996616445e-09, "loss": 0.8158, "num_input_tokens_seen": 77488544, "step": 133465 }, { "epoch": 19.87935656836461, "grad_norm": 1.5627176761627197, "learning_rate": 5.555558142736339e-09, "loss": 0.5316, "num_input_tokens_seen": 77491360, "step": 133470 }, { "epoch": 19.88010128090557, "grad_norm": 1.4095675945281982, "learning_rate": 5.487269241549742e-09, "loss": 0.4887, "num_input_tokens_seen": 77494112, "step": 133475 }, { "epoch": 19.88084599344653, "grad_norm": 1.5082427263259888, "learning_rate": 5.419402597250934e-09, "loss": 0.5375, "num_input_tokens_seen": 77497120, "step": 133480 }, { "epoch": 19.881590705987488, "grad_norm": 1.480962872505188, "learning_rate": 5.351958210986219e-09, "loss": 0.59, "num_input_tokens_seen": 77500000, "step": 133485 }, { "epoch": 19.88233541852845, "grad_norm": 1.7550212144851685, "learning_rate": 5.284936083899128e-09, "loss": 0.5323, "num_input_tokens_seen": 77502688, "step": 133490 }, { "epoch": 19.883080131069406, "grad_norm": 1.7807135581970215, "learning_rate": 5.218336217119313e-09, "loss": 0.4627, "num_input_tokens_seen": 77505312, "step": 133495 }, { "epoch": 19.883824843610366, "grad_norm": 1.4995613098144531, "learning_rate": 5.152158611770874e-09, "loss": 0.561, "num_input_tokens_seen": 77508352, "step": 133500 }, { "epoch": 19.884569556151327, "grad_norm": 2.640345335006714, "learning_rate": 5.086403268975137e-09, "loss": 0.7209, "num_input_tokens_seen": 77511584, "step": 133505 }, { "epoch": 19.885314268692284, "grad_norm": 1.1573476791381836, "learning_rate": 5.0210701898395494e-09, "loss": 0.6361, "num_input_tokens_seen": 77514592, "step": 133510 }, { "epoch": 19.886058981233244, "grad_norm": 2.039546489715576, "learning_rate": 4.956159375468783e-09, "loss": 0.7223, "num_input_tokens_seen": 77517408, "step": 133515 }, { "epoch": 19.886803693774205, "grad_norm": 1.3162305355072021, "learning_rate": 4.891670826959183e-09, "loss": 0.5985, "num_input_tokens_seen": 77520256, "step": 133520 }, { "epoch": 19.88754840631516, "grad_norm": 1.4982067346572876, "learning_rate": 4.8276045454043185e-09, "loss": 0.6553, "num_input_tokens_seen": 77523360, "step": 133525 }, { "epoch": 19.888293118856122, "grad_norm": 1.2254738807678223, "learning_rate": 4.763960531878331e-09, "loss": 0.5048, "num_input_tokens_seen": 77526016, "step": 133530 }, { "epoch": 19.88903783139708, "grad_norm": 1.3342082500457764, "learning_rate": 4.700738787466463e-09, "loss": 0.4693, "num_input_tokens_seen": 77529248, "step": 133535 }, { "epoch": 19.88978254393804, "grad_norm": 1.8011770248413086, "learning_rate": 4.637939313226203e-09, "loss": 0.5575, "num_input_tokens_seen": 77532096, "step": 133540 }, { "epoch": 19.890527256479, "grad_norm": 1.4835139513015747, "learning_rate": 4.575562110228915e-09, "loss": 0.7675, "num_input_tokens_seen": 77534720, "step": 133545 }, { "epoch": 19.891271969019957, "grad_norm": 2.265925407409668, "learning_rate": 4.513607179520985e-09, "loss": 0.4658, "num_input_tokens_seen": 77537536, "step": 133550 }, { "epoch": 19.892016681560918, "grad_norm": 1.7461414337158203, "learning_rate": 4.452074522148797e-09, "loss": 0.5774, "num_input_tokens_seen": 77540608, "step": 133555 }, { "epoch": 19.89276139410188, "grad_norm": 1.783718228340149, "learning_rate": 4.390964139158737e-09, "loss": 0.6428, "num_input_tokens_seen": 77543360, "step": 133560 }, { "epoch": 19.893506106642835, "grad_norm": 2.742034435272217, "learning_rate": 4.330276031577762e-09, "loss": 0.7778, "num_input_tokens_seen": 77546464, "step": 133565 }, { "epoch": 19.894250819183796, "grad_norm": 2.936511516571045, "learning_rate": 4.270010200430052e-09, "loss": 0.6459, "num_input_tokens_seen": 77549376, "step": 133570 }, { "epoch": 19.894995531724753, "grad_norm": 1.318387508392334, "learning_rate": 4.210166646737013e-09, "loss": 0.4037, "num_input_tokens_seen": 77552128, "step": 133575 }, { "epoch": 19.895740244265713, "grad_norm": 1.817775845527649, "learning_rate": 4.150745371508946e-09, "loss": 0.7506, "num_input_tokens_seen": 77555104, "step": 133580 }, { "epoch": 19.896484956806674, "grad_norm": 1.6076725721359253, "learning_rate": 4.0917463757506045e-09, "loss": 0.6362, "num_input_tokens_seen": 77557920, "step": 133585 }, { "epoch": 19.89722966934763, "grad_norm": 1.6719874143600464, "learning_rate": 4.033169660458413e-09, "loss": 0.5479, "num_input_tokens_seen": 77560992, "step": 133590 }, { "epoch": 19.89797438188859, "grad_norm": 1.036983847618103, "learning_rate": 3.975015226617695e-09, "loss": 0.7137, "num_input_tokens_seen": 77564032, "step": 133595 }, { "epoch": 19.898719094429552, "grad_norm": 1.2604385614395142, "learning_rate": 3.917283075216549e-09, "loss": 0.5447, "num_input_tokens_seen": 77567296, "step": 133600 }, { "epoch": 19.89946380697051, "grad_norm": 1.646789789199829, "learning_rate": 3.8599732072264195e-09, "loss": 0.5526, "num_input_tokens_seen": 77570112, "step": 133605 }, { "epoch": 19.90020851951147, "grad_norm": 2.0513556003570557, "learning_rate": 3.803085623618752e-09, "loss": 0.6572, "num_input_tokens_seen": 77572928, "step": 133610 }, { "epoch": 19.900953232052427, "grad_norm": 1.9348737001419067, "learning_rate": 3.746620325351113e-09, "loss": 0.5466, "num_input_tokens_seen": 77575520, "step": 133615 }, { "epoch": 19.901697944593387, "grad_norm": 1.1264809370040894, "learning_rate": 3.690577313381072e-09, "loss": 0.7605, "num_input_tokens_seen": 77578272, "step": 133620 }, { "epoch": 19.902442657134348, "grad_norm": 1.7751041650772095, "learning_rate": 3.6349565886523163e-09, "loss": 0.4821, "num_input_tokens_seen": 77581344, "step": 133625 }, { "epoch": 19.903187369675305, "grad_norm": 2.4684252738952637, "learning_rate": 3.579758152105761e-09, "loss": 0.469, "num_input_tokens_seen": 77584352, "step": 133630 }, { "epoch": 19.903932082216265, "grad_norm": 1.7242759466171265, "learning_rate": 3.524982004676769e-09, "loss": 0.5342, "num_input_tokens_seen": 77587264, "step": 133635 }, { "epoch": 19.904676794757222, "grad_norm": 2.283365249633789, "learning_rate": 3.4706281472840495e-09, "loss": 0.3939, "num_input_tokens_seen": 77590112, "step": 133640 }, { "epoch": 19.905421507298183, "grad_norm": 1.3690826892852783, "learning_rate": 3.4166965808518637e-09, "loss": 0.6561, "num_input_tokens_seen": 77592800, "step": 133645 }, { "epoch": 19.906166219839143, "grad_norm": 1.017572283744812, "learning_rate": 3.363187306287818e-09, "loss": 0.4372, "num_input_tokens_seen": 77595840, "step": 133650 }, { "epoch": 19.9069109323801, "grad_norm": 1.730246663093567, "learning_rate": 3.310100324499521e-09, "loss": 0.6298, "num_input_tokens_seen": 77598752, "step": 133655 }, { "epoch": 19.90765564492106, "grad_norm": 2.0926451683044434, "learning_rate": 3.2574356363807013e-09, "loss": 0.5247, "num_input_tokens_seen": 77601632, "step": 133660 }, { "epoch": 19.90840035746202, "grad_norm": 2.35685658454895, "learning_rate": 3.2051932428195375e-09, "loss": 0.7548, "num_input_tokens_seen": 77604320, "step": 133665 }, { "epoch": 19.909145070002978, "grad_norm": 1.4694465398788452, "learning_rate": 3.153373144704208e-09, "loss": 0.567, "num_input_tokens_seen": 77607040, "step": 133670 }, { "epoch": 19.90988978254394, "grad_norm": 1.8281203508377075, "learning_rate": 3.1019753429062383e-09, "loss": 0.5925, "num_input_tokens_seen": 77609856, "step": 133675 }, { "epoch": 19.910634495084896, "grad_norm": 1.2753697633743286, "learning_rate": 3.050999838294377e-09, "loss": 0.7052, "num_input_tokens_seen": 77612608, "step": 133680 }, { "epoch": 19.911379207625856, "grad_norm": 4.848307132720947, "learning_rate": 3.000446631729048e-09, "loss": 0.614, "num_input_tokens_seen": 77615584, "step": 133685 }, { "epoch": 19.912123920166817, "grad_norm": 1.0996237993240356, "learning_rate": 2.9503157240651226e-09, "loss": 0.5698, "num_input_tokens_seen": 77618272, "step": 133690 }, { "epoch": 19.912868632707774, "grad_norm": 1.168952465057373, "learning_rate": 2.900607116151921e-09, "loss": 0.5382, "num_input_tokens_seen": 77621504, "step": 133695 }, { "epoch": 19.913613345248734, "grad_norm": 2.152338981628418, "learning_rate": 2.8513208088248867e-09, "loss": 0.5955, "num_input_tokens_seen": 77624064, "step": 133700 }, { "epoch": 19.914358057789695, "grad_norm": 1.6020320653915405, "learning_rate": 2.802456802919462e-09, "loss": 0.6727, "num_input_tokens_seen": 77627168, "step": 133705 }, { "epoch": 19.915102770330652, "grad_norm": 1.421623706817627, "learning_rate": 2.7540150992627633e-09, "loss": 0.6271, "num_input_tokens_seen": 77630048, "step": 133710 }, { "epoch": 19.915847482871612, "grad_norm": 0.939083456993103, "learning_rate": 2.705995698668029e-09, "loss": 0.6055, "num_input_tokens_seen": 77632768, "step": 133715 }, { "epoch": 19.91659219541257, "grad_norm": 1.0656379461288452, "learning_rate": 2.658398601951273e-09, "loss": 0.4872, "num_input_tokens_seen": 77635968, "step": 133720 }, { "epoch": 19.91733690795353, "grad_norm": 1.4867775440216064, "learning_rate": 2.6112238099146315e-09, "loss": 0.8043, "num_input_tokens_seen": 77638752, "step": 133725 }, { "epoch": 19.91808162049449, "grad_norm": 1.3341622352600098, "learning_rate": 2.564471323354689e-09, "loss": 0.6585, "num_input_tokens_seen": 77641728, "step": 133730 }, { "epoch": 19.918826333035447, "grad_norm": 0.9303901195526123, "learning_rate": 2.5181411430597045e-09, "loss": 0.6321, "num_input_tokens_seen": 77644672, "step": 133735 }, { "epoch": 19.919571045576408, "grad_norm": 1.4683003425598145, "learning_rate": 2.472233269817936e-09, "loss": 0.5644, "num_input_tokens_seen": 77647808, "step": 133740 }, { "epoch": 19.92031575811737, "grad_norm": 1.872445821762085, "learning_rate": 2.4267477043982134e-09, "loss": 0.6687, "num_input_tokens_seen": 77650848, "step": 133745 }, { "epoch": 19.921060470658325, "grad_norm": 1.0587794780731201, "learning_rate": 2.3816844475749167e-09, "loss": 0.4509, "num_input_tokens_seen": 77653664, "step": 133750 }, { "epoch": 19.921805183199286, "grad_norm": 0.7818024754524231, "learning_rate": 2.337043500102998e-09, "loss": 0.5467, "num_input_tokens_seen": 77656352, "step": 133755 }, { "epoch": 19.922549895740243, "grad_norm": 1.2961777448654175, "learning_rate": 2.2928248627429595e-09, "loss": 0.7238, "num_input_tokens_seen": 77659264, "step": 133760 }, { "epoch": 19.923294608281203, "grad_norm": 1.6990201473236084, "learning_rate": 2.249028536238651e-09, "loss": 0.4979, "num_input_tokens_seen": 77662144, "step": 133765 }, { "epoch": 19.924039320822164, "grad_norm": 2.9572999477386475, "learning_rate": 2.205654521331146e-09, "loss": 0.612, "num_input_tokens_seen": 77664960, "step": 133770 }, { "epoch": 19.92478403336312, "grad_norm": 2.086700439453125, "learning_rate": 2.162702818753193e-09, "loss": 0.6035, "num_input_tokens_seen": 77667936, "step": 133775 }, { "epoch": 19.92552874590408, "grad_norm": 3.0086257457733154, "learning_rate": 2.120173429226435e-09, "loss": 0.4447, "num_input_tokens_seen": 77670816, "step": 133780 }, { "epoch": 19.92627345844504, "grad_norm": 2.2387266159057617, "learning_rate": 2.0780663534752944e-09, "loss": 0.4201, "num_input_tokens_seen": 77673696, "step": 133785 }, { "epoch": 19.927018170986, "grad_norm": 2.9328525066375732, "learning_rate": 2.036381592207537e-09, "loss": 0.7156, "num_input_tokens_seen": 77676608, "step": 133790 }, { "epoch": 19.92776288352696, "grad_norm": 1.3066803216934204, "learning_rate": 1.9951191461281547e-09, "loss": 0.3289, "num_input_tokens_seen": 77679392, "step": 133795 }, { "epoch": 19.928507596067917, "grad_norm": 0.8769239783287048, "learning_rate": 1.9542790159365887e-09, "loss": 0.6714, "num_input_tokens_seen": 77682176, "step": 133800 }, { "epoch": 19.929252308608877, "grad_norm": 1.3178857564926147, "learning_rate": 1.913861202318401e-09, "loss": 0.4989, "num_input_tokens_seen": 77684736, "step": 133805 }, { "epoch": 19.929997021149838, "grad_norm": 1.5197259187698364, "learning_rate": 1.873865705959155e-09, "loss": 0.556, "num_input_tokens_seen": 77687584, "step": 133810 }, { "epoch": 19.930741733690795, "grad_norm": 1.1931774616241455, "learning_rate": 1.83429252753331e-09, "loss": 0.5165, "num_input_tokens_seen": 77690336, "step": 133815 }, { "epoch": 19.931486446231755, "grad_norm": 1.0515152215957642, "learning_rate": 1.7951416677097766e-09, "loss": 0.6907, "num_input_tokens_seen": 77693280, "step": 133820 }, { "epoch": 19.932231158772712, "grad_norm": 1.1808894872665405, "learning_rate": 1.7564131271519123e-09, "loss": 0.6572, "num_input_tokens_seen": 77696096, "step": 133825 }, { "epoch": 19.932975871313673, "grad_norm": 1.1289938688278198, "learning_rate": 1.7181069065119736e-09, "loss": 0.5272, "num_input_tokens_seen": 77698912, "step": 133830 }, { "epoch": 19.933720583854633, "grad_norm": 1.721986174583435, "learning_rate": 1.6802230064366653e-09, "loss": 0.6099, "num_input_tokens_seen": 77701600, "step": 133835 }, { "epoch": 19.93446529639559, "grad_norm": 1.9131801128387451, "learning_rate": 1.642761427567141e-09, "loss": 0.7218, "num_input_tokens_seen": 77704448, "step": 133840 }, { "epoch": 19.93521000893655, "grad_norm": 2.2871131896972656, "learning_rate": 1.605722170536228e-09, "loss": 0.7873, "num_input_tokens_seen": 77707424, "step": 133845 }, { "epoch": 19.93595472147751, "grad_norm": 1.0228041410446167, "learning_rate": 1.5691052359684266e-09, "loss": 0.6092, "num_input_tokens_seen": 77710112, "step": 133850 }, { "epoch": 19.93669943401847, "grad_norm": 2.1194779872894287, "learning_rate": 1.5329106244854618e-09, "loss": 0.3944, "num_input_tokens_seen": 77712992, "step": 133855 }, { "epoch": 19.93744414655943, "grad_norm": 2.464508533477783, "learning_rate": 1.4971383366951807e-09, "loss": 0.8334, "num_input_tokens_seen": 77715936, "step": 133860 }, { "epoch": 19.938188859100386, "grad_norm": 1.2712478637695312, "learning_rate": 1.4617883732026549e-09, "loss": 0.419, "num_input_tokens_seen": 77718880, "step": 133865 }, { "epoch": 19.938933571641346, "grad_norm": 0.9294423460960388, "learning_rate": 1.4268607346074048e-09, "loss": 0.6859, "num_input_tokens_seen": 77721952, "step": 133870 }, { "epoch": 19.939678284182307, "grad_norm": 0.6896476149559021, "learning_rate": 1.3923554214978485e-09, "loss": 0.4492, "num_input_tokens_seen": 77724736, "step": 133875 }, { "epoch": 19.940422996723264, "grad_norm": 2.1100270748138428, "learning_rate": 1.3582724344568532e-09, "loss": 0.5539, "num_input_tokens_seen": 77727648, "step": 133880 }, { "epoch": 19.941167709264224, "grad_norm": 2.3689401149749756, "learning_rate": 1.3246117740589592e-09, "loss": 0.5643, "num_input_tokens_seen": 77730688, "step": 133885 }, { "epoch": 19.941912421805185, "grad_norm": 2.143934488296509, "learning_rate": 1.2913734408759314e-09, "loss": 0.741, "num_input_tokens_seen": 77733472, "step": 133890 }, { "epoch": 19.942657134346142, "grad_norm": 1.1649914979934692, "learning_rate": 1.258557435465657e-09, "loss": 0.4607, "num_input_tokens_seen": 77736416, "step": 133895 }, { "epoch": 19.943401846887102, "grad_norm": 1.5445395708084106, "learning_rate": 1.226163758386023e-09, "loss": 0.5878, "num_input_tokens_seen": 77739104, "step": 133900 }, { "epoch": 19.94414655942806, "grad_norm": 1.2598358392715454, "learning_rate": 1.1941924101838142e-09, "loss": 0.5083, "num_input_tokens_seen": 77741952, "step": 133905 }, { "epoch": 19.94489127196902, "grad_norm": 2.013902187347412, "learning_rate": 1.1626433913947132e-09, "loss": 0.7201, "num_input_tokens_seen": 77744832, "step": 133910 }, { "epoch": 19.94563598450998, "grad_norm": 1.681982159614563, "learning_rate": 1.1315167025571782e-09, "loss": 0.5603, "num_input_tokens_seen": 77747552, "step": 133915 }, { "epoch": 19.946380697050937, "grad_norm": 7.100974082946777, "learning_rate": 1.1008123441957896e-09, "loss": 0.8084, "num_input_tokens_seen": 77750432, "step": 133920 }, { "epoch": 19.947125409591898, "grad_norm": 1.9329185485839844, "learning_rate": 1.070530316826801e-09, "loss": 0.6031, "num_input_tokens_seen": 77753344, "step": 133925 }, { "epoch": 19.947870122132855, "grad_norm": 1.112789273262024, "learning_rate": 1.0406706209636908e-09, "loss": 0.5343, "num_input_tokens_seen": 77756224, "step": 133930 }, { "epoch": 19.948614834673815, "grad_norm": 3.0129544734954834, "learning_rate": 1.01123325711161e-09, "loss": 0.7206, "num_input_tokens_seen": 77758944, "step": 133935 }, { "epoch": 19.949359547214776, "grad_norm": 1.0762555599212646, "learning_rate": 9.82218225767384e-10, "loss": 0.5724, "num_input_tokens_seen": 77761824, "step": 133940 }, { "epoch": 19.950104259755733, "grad_norm": 1.9063777923583984, "learning_rate": 9.536255274195105e-10, "loss": 0.5879, "num_input_tokens_seen": 77764672, "step": 133945 }, { "epoch": 19.950848972296694, "grad_norm": 1.0155713558197021, "learning_rate": 9.254551625509367e-10, "loss": 0.6166, "num_input_tokens_seen": 77767776, "step": 133950 }, { "epoch": 19.951593684837654, "grad_norm": 1.3359379768371582, "learning_rate": 8.977071316418339e-10, "loss": 0.4237, "num_input_tokens_seen": 77770720, "step": 133955 }, { "epoch": 19.95233839737861, "grad_norm": 2.3794445991516113, "learning_rate": 8.703814351557205e-10, "loss": 0.6408, "num_input_tokens_seen": 77773504, "step": 133960 }, { "epoch": 19.95308310991957, "grad_norm": 0.8836321830749512, "learning_rate": 8.434780735561143e-10, "loss": 0.4925, "num_input_tokens_seen": 77776800, "step": 133965 }, { "epoch": 19.95382782246053, "grad_norm": 2.8503944873809814, "learning_rate": 8.169970473009825e-10, "loss": 0.6983, "num_input_tokens_seen": 77779776, "step": 133970 }, { "epoch": 19.95457253500149, "grad_norm": 1.8526281118392944, "learning_rate": 7.909383568316386e-10, "loss": 0.4839, "num_input_tokens_seen": 77782720, "step": 133975 }, { "epoch": 19.95531724754245, "grad_norm": 1.9612535238265991, "learning_rate": 7.65302002592172e-10, "loss": 0.4193, "num_input_tokens_seen": 77785600, "step": 133980 }, { "epoch": 19.956061960083407, "grad_norm": 1.384480595588684, "learning_rate": 7.400879850155695e-10, "loss": 0.4906, "num_input_tokens_seen": 77788832, "step": 133985 }, { "epoch": 19.956806672624367, "grad_norm": 2.6450817584991455, "learning_rate": 7.152963045264915e-10, "loss": 0.5225, "num_input_tokens_seen": 77791776, "step": 133990 }, { "epoch": 19.957551385165328, "grad_norm": 0.7623576521873474, "learning_rate": 6.909269615440472e-10, "loss": 0.5609, "num_input_tokens_seen": 77794720, "step": 133995 }, { "epoch": 19.958296097706285, "grad_norm": 2.004485845565796, "learning_rate": 6.669799564817947e-10, "loss": 0.4337, "num_input_tokens_seen": 77797728, "step": 134000 }, { "epoch": 19.959040810247245, "grad_norm": 1.5018235445022583, "learning_rate": 6.434552897421897e-10, "loss": 0.5809, "num_input_tokens_seen": 77800512, "step": 134005 }, { "epoch": 19.959785522788202, "grad_norm": 1.0139294862747192, "learning_rate": 6.203529617221371e-10, "loss": 0.6016, "num_input_tokens_seen": 77803136, "step": 134010 }, { "epoch": 19.960530235329163, "grad_norm": 1.2891298532485962, "learning_rate": 5.976729728129904e-10, "loss": 0.6164, "num_input_tokens_seen": 77806112, "step": 134015 }, { "epoch": 19.961274947870123, "grad_norm": 1.427058458328247, "learning_rate": 5.754153233977766e-10, "loss": 0.5403, "num_input_tokens_seen": 77808960, "step": 134020 }, { "epoch": 19.96201966041108, "grad_norm": 1.7247438430786133, "learning_rate": 5.535800138539715e-10, "loss": 0.4699, "num_input_tokens_seen": 77811744, "step": 134025 }, { "epoch": 19.96276437295204, "grad_norm": 1.797391414642334, "learning_rate": 5.321670445479488e-10, "loss": 0.6038, "num_input_tokens_seen": 77814848, "step": 134030 }, { "epoch": 19.963509085493, "grad_norm": 1.49617338180542, "learning_rate": 5.111764158433063e-10, "loss": 0.5196, "num_input_tokens_seen": 77817952, "step": 134035 }, { "epoch": 19.96425379803396, "grad_norm": 1.2687726020812988, "learning_rate": 4.906081280953157e-10, "loss": 0.5729, "num_input_tokens_seen": 77820832, "step": 134040 }, { "epoch": 19.96499851057492, "grad_norm": 1.0956549644470215, "learning_rate": 4.704621816481458e-10, "loss": 0.5332, "num_input_tokens_seen": 77823936, "step": 134045 }, { "epoch": 19.965743223115876, "grad_norm": 2.370744466781616, "learning_rate": 4.5073857684319043e-10, "loss": 0.4398, "num_input_tokens_seen": 77826912, "step": 134050 }, { "epoch": 19.966487935656836, "grad_norm": 2.1901166439056396, "learning_rate": 4.3143731401629194e-10, "loss": 0.4918, "num_input_tokens_seen": 77829888, "step": 134055 }, { "epoch": 19.967232648197797, "grad_norm": 1.645005702972412, "learning_rate": 4.1255839349219063e-10, "loss": 0.6053, "num_input_tokens_seen": 77832704, "step": 134060 }, { "epoch": 19.967977360738754, "grad_norm": 2.27091121673584, "learning_rate": 3.9410181559007553e-10, "loss": 0.541, "num_input_tokens_seen": 77835648, "step": 134065 }, { "epoch": 19.968722073279714, "grad_norm": 3.2349355220794678, "learning_rate": 3.760675806180336e-10, "loss": 0.5463, "num_input_tokens_seen": 77838688, "step": 134070 }, { "epoch": 19.969466785820675, "grad_norm": 1.8251231908798218, "learning_rate": 3.5845568888692726e-10, "loss": 0.5281, "num_input_tokens_seen": 77841632, "step": 134075 }, { "epoch": 19.970211498361632, "grad_norm": 1.9580339193344116, "learning_rate": 3.412661406881901e-10, "loss": 0.5699, "num_input_tokens_seen": 77844544, "step": 134080 }, { "epoch": 19.970956210902592, "grad_norm": 1.621867299079895, "learning_rate": 3.244989363188067e-10, "loss": 0.5972, "num_input_tokens_seen": 77847648, "step": 134085 }, { "epoch": 19.97170092344355, "grad_norm": 1.6457029581069946, "learning_rate": 3.0815407605633283e-10, "loss": 0.6382, "num_input_tokens_seen": 77850432, "step": 134090 }, { "epoch": 19.97244563598451, "grad_norm": 1.9781665802001953, "learning_rate": 2.922315601783243e-10, "loss": 0.6599, "num_input_tokens_seen": 77853440, "step": 134095 }, { "epoch": 19.97319034852547, "grad_norm": 1.4472359418869019, "learning_rate": 2.7673138895678574e-10, "loss": 0.6544, "num_input_tokens_seen": 77856192, "step": 134100 }, { "epoch": 19.973935061066427, "grad_norm": 1.5933518409729004, "learning_rate": 2.61653562649844e-10, "loss": 0.4811, "num_input_tokens_seen": 77859392, "step": 134105 }, { "epoch": 19.974679773607388, "grad_norm": 2.4855923652648926, "learning_rate": 2.469980815128503e-10, "loss": 0.688, "num_input_tokens_seen": 77862304, "step": 134110 }, { "epoch": 19.97542448614835, "grad_norm": 1.3224841356277466, "learning_rate": 2.3276494579560492e-10, "loss": 0.5523, "num_input_tokens_seen": 77865344, "step": 134115 }, { "epoch": 19.976169198689306, "grad_norm": 2.2022452354431152, "learning_rate": 2.1895415573680578e-10, "loss": 0.6748, "num_input_tokens_seen": 77868320, "step": 134120 }, { "epoch": 19.976913911230266, "grad_norm": 2.640936851501465, "learning_rate": 2.055657115695997e-10, "loss": 0.5279, "num_input_tokens_seen": 77871456, "step": 134125 }, { "epoch": 19.977658623771223, "grad_norm": 3.1303181648254395, "learning_rate": 1.925996135215824e-10, "loss": 0.7429, "num_input_tokens_seen": 77874656, "step": 134130 }, { "epoch": 19.978403336312184, "grad_norm": 4.020655155181885, "learning_rate": 1.8005586181202295e-10, "loss": 0.7211, "num_input_tokens_seen": 77877536, "step": 134135 }, { "epoch": 19.979148048853144, "grad_norm": 1.3595393896102905, "learning_rate": 1.6793445664908813e-10, "loss": 0.6079, "num_input_tokens_seen": 77880704, "step": 134140 }, { "epoch": 19.9798927613941, "grad_norm": 1.1771633625030518, "learning_rate": 1.5623539824372035e-10, "loss": 0.5893, "num_input_tokens_seen": 77883840, "step": 134145 }, { "epoch": 19.98063747393506, "grad_norm": 2.5308432579040527, "learning_rate": 1.4495868678743307e-10, "loss": 0.6881, "num_input_tokens_seen": 77886720, "step": 134150 }, { "epoch": 19.98138218647602, "grad_norm": 1.5863063335418701, "learning_rate": 1.3410432247173977e-10, "loss": 0.5771, "num_input_tokens_seen": 77889664, "step": 134155 }, { "epoch": 19.98212689901698, "grad_norm": 1.0039122104644775, "learning_rate": 1.2367230548537834e-10, "loss": 0.5623, "num_input_tokens_seen": 77892448, "step": 134160 }, { "epoch": 19.98287161155794, "grad_norm": 1.3424934148788452, "learning_rate": 1.1366263599765781e-10, "loss": 0.6554, "num_input_tokens_seen": 77895072, "step": 134165 }, { "epoch": 19.983616324098897, "grad_norm": 3.105884313583374, "learning_rate": 1.040753141834383e-10, "loss": 0.51, "num_input_tokens_seen": 77897792, "step": 134170 }, { "epoch": 19.984361036639857, "grad_norm": 1.3691104650497437, "learning_rate": 9.491034020092659e-11, "loss": 0.6766, "num_input_tokens_seen": 77900672, "step": 134175 }, { "epoch": 19.985105749180818, "grad_norm": 1.3833528757095337, "learning_rate": 8.616771420555391e-11, "loss": 0.5784, "num_input_tokens_seen": 77903488, "step": 134180 }, { "epoch": 19.985850461721775, "grad_norm": 1.8124014139175415, "learning_rate": 7.784743634720038e-11, "loss": 0.6325, "num_input_tokens_seen": 77906336, "step": 134185 }, { "epoch": 19.986595174262735, "grad_norm": 3.324493408203125, "learning_rate": 6.994950676186829e-11, "loss": 0.6033, "num_input_tokens_seen": 77908960, "step": 134190 }, { "epoch": 19.987339886803692, "grad_norm": 3.7338008880615234, "learning_rate": 6.247392558833553e-11, "loss": 0.7018, "num_input_tokens_seen": 77911872, "step": 134195 }, { "epoch": 19.988084599344653, "grad_norm": 1.5252044200897217, "learning_rate": 5.542069295150221e-11, "loss": 0.6398, "num_input_tokens_seen": 77914496, "step": 134200 }, { "epoch": 19.988829311885613, "grad_norm": 1.869326114654541, "learning_rate": 4.878980896794172e-11, "loss": 0.4893, "num_input_tokens_seen": 77917472, "step": 134205 }, { "epoch": 19.98957402442657, "grad_norm": 1.6138745546340942, "learning_rate": 4.258127375145193e-11, "loss": 0.5559, "num_input_tokens_seen": 77920352, "step": 134210 }, { "epoch": 19.99031873696753, "grad_norm": 2.3857929706573486, "learning_rate": 3.679508740472848e-11, "loss": 0.5346, "num_input_tokens_seen": 77923168, "step": 134215 }, { "epoch": 19.99106344950849, "grad_norm": 1.2052228450775146, "learning_rate": 3.143125003046699e-11, "loss": 0.7162, "num_input_tokens_seen": 77926368, "step": 134220 }, { "epoch": 19.99180816204945, "grad_norm": 1.4352258443832397, "learning_rate": 2.648976171470974e-11, "loss": 0.6405, "num_input_tokens_seen": 77929440, "step": 134225 }, { "epoch": 19.99255287459041, "grad_norm": 1.445339560508728, "learning_rate": 2.197062254349902e-11, "loss": 0.392, "num_input_tokens_seen": 77932192, "step": 134230 }, { "epoch": 19.993297587131366, "grad_norm": 1.7384591102600098, "learning_rate": 1.7873832591774885e-11, "loss": 0.7583, "num_input_tokens_seen": 77935136, "step": 134235 }, { "epoch": 19.994042299672326, "grad_norm": 1.933046817779541, "learning_rate": 1.4199391928926276e-11, "loss": 0.6885, "num_input_tokens_seen": 77938048, "step": 134240 }, { "epoch": 19.994787012213287, "grad_norm": 2.199538230895996, "learning_rate": 1.0947300618791013e-11, "loss": 0.6064, "num_input_tokens_seen": 77940864, "step": 134245 }, { "epoch": 19.995531724754244, "grad_norm": 1.0329171419143677, "learning_rate": 8.11755871410469e-12, "loss": 0.5342, "num_input_tokens_seen": 77943904, "step": 134250 }, { "epoch": 19.996276437295204, "grad_norm": 1.3925074338912964, "learning_rate": 5.710166262051786e-12, "loss": 0.5311, "num_input_tokens_seen": 77946912, "step": 134255 }, { "epoch": 19.997021149836165, "grad_norm": 1.916348934173584, "learning_rate": 3.725123307041223e-12, "loss": 0.6923, "num_input_tokens_seen": 77949824, "step": 134260 }, { "epoch": 19.997765862377122, "grad_norm": 1.1171884536743164, "learning_rate": 2.162429879604133e-12, "loss": 0.5514, "num_input_tokens_seen": 77953024, "step": 134265 }, { "epoch": 19.998510574918082, "grad_norm": 1.6039687395095825, "learning_rate": 1.022086004720535e-12, "loss": 0.4371, "num_input_tokens_seen": 77956032, "step": 134270 }, { "epoch": 19.99925528745904, "grad_norm": 1.5345945358276367, "learning_rate": 3.040917045948888e-13, "loss": 0.5022, "num_input_tokens_seen": 77959072, "step": 134275 }, { "epoch": 20.0, "grad_norm": 1.9757906198501587, "learning_rate": 8.446993104982426e-15, "loss": 0.5324, "num_input_tokens_seen": 77961608, "step": 134280 }, { "epoch": 20.0, "eval_loss": 0.6561894416809082, "eval_runtime": 46.9889, "eval_samples_per_second": 63.504, "eval_steps_per_second": 15.876, "num_input_tokens_seen": 77961608, "step": 134280 }, { "epoch": 20.0, "num_input_tokens_seen": 77961608, "step": 134280, "total_flos": 3.5106679393429094e+18, "train_loss": 0.63635343176945, "train_runtime": 27673.5035, "train_samples_per_second": 19.407, "train_steps_per_second": 4.852 } ], "logging_steps": 5, "max_steps": 134280, "num_input_tokens_seen": 77961608, "num_train_epochs": 20, "save_steps": 6714, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.5106679393429094e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }